diff --git a/.bcachefs_revision b/.bcachefs_revision
index 51df9f0e..a8916efb 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-9abf628c701ad92670d697624f674cc01d42705e
+2cb70a82bc0ca05d8c3cf666d221badd5724e339
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9229e750..c19f190b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -112,4 +112,14 @@ static inline void *vmap(struct page **pages, unsigned int count,
 
 #define vmalloc_to_page(addr) ((struct page *) (addr))
 
+static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+	void *p;
+
+	p = kmalloc(len, gfp);
+	if (p)
+		memcpy(p, src, len);
+	return p;
+}
+
 #endif /* __TOOLS_LINUX_SLAB_H */
diff --git a/libbcachefs.c b/libbcachefs.c
index 3278645b..49790d89 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -519,6 +519,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
 {
 }
 
+static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
+				enum units units)
+{
+}
+
 typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
 
 struct bch_sb_field_toolops {
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4702b016..1482b80a 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -259,6 +259,10 @@ do {						\
 		"Reread btree nodes at various points to verify the "	\
 		"mergesort in the read path against modifications "	\
 		"done in memory")	\
+	BCH_DEBUG_PARAM(journal_seq_verify,	\
+		"Store the journal sequence number in the version "	\
+		"number of every btree key, and verify that btree "	\
+		"update ordering is preserved during recovery")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -314,7 +318,13 @@ enum bch_time_stats {
 struct btree;
 
 enum gc_phase {
-	GC_PHASE_SB = BTREE_ID_NR + 1,
+	GC_PHASE_START,
+	GC_PHASE_SB,
+
+#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
+	DEFINE_BCH_BTREE_IDS()
+#undef DEF_BTREE_ID
+
 	GC_PHASE_PENDING_DELETE,
 	GC_PHASE_ALLOC,
 	GC_PHASE_DONE
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index ab8b9446..b6e7b983 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -426,6 +426,16 @@ enum bch_csum_type {
 	BCH_CSUM_NR = 7,
 };
 
+static const unsigned bch_crc_bytes[] = {
+	[BCH_CSUM_NONE]				= 0,
+	[BCH_CSUM_CRC32C_NONZERO]		= 4,
+	[BCH_CSUM_CRC32C]			= 4,
+	[BCH_CSUM_CRC64_NONZERO]		= 8,
+	[BCH_CSUM_CRC64]			= 8,
+	[BCH_CSUM_CHACHA20_POLY1305_80]		= 10,
+	[BCH_CSUM_CHACHA20_POLY1305_128]	= 16,
+};
+
 static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
 {
 	switch (type) {
@@ -783,6 +793,11 @@ struct bch_dirent {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(dirent, BCH_DIRENT);
 
+#define BCH_NAME_MAX	(U8_MAX * sizeof(u64) -		\
+			 sizeof(struct bkey) -		\
+			 offsetof(struct bch_dirent, d_name))
+
+
 /* Xattrs */
 
 enum {
@@ -868,7 +883,8 @@ struct bch_sb_field {
 	x(crypt,	2)		\
 	x(replicas,	3)		\
 	x(quota,	4)		\
-	x(disk_groups,	5)
+	x(disk_groups,	5)		\
+	x(clean,	6)
 
 enum bch_sb_field_type {
 #define x(f, nr)	BCH_SB_FIELD_##f = nr,
@@ -1038,6 +1054,37 @@ struct bch_sb_field_disk_groups {
 	struct bch_disk_group entries[0];
 };
 
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+	__le16		u64s;
+	__u8		btree_id;
+	__u8		level;
+	__u8		type; /* designates what this jset holds */
+	__u8		pad[3];
+
+	union {
+		struct bkey_i	start[0];
+		__u64		_data[0];
+	};
+};
+
+struct bch_sb_field_clean {
+	struct
bch_sb_field field; + + __le32 flags; + __le16 read_clock; + __le16 write_clock; + __le64 journal_seq; + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +}; + /* Superblock: */ /* @@ -1255,19 +1302,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb) #define BCACHE_JSET_VERSION_JKEYS 2 #define BCACHE_JSET_VERSION 2 -struct jset_entry { - __le16 u64s; - __u8 btree_id; - __u8 level; - __u8 type; /* designates what this jset holds */ - __u8 pad[3]; - - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; -}; - #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) #define BCH_JSET_ENTRY_TYPES() \ diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index c950f256..b0dc4c8a 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -649,7 +649,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, struct btree *b; struct bset_tree *t; - /* btree_node_fill() requires parent to be locked: */ + /* + * XXX: locking optimization + * + * we can make the locking looser here - caller can drop lock on parent + * node before locking child node (and potentially blocking): we just + * have to have bch2_btree_node_fill() call relock on the parent and + * return -EINTR if that fails + */ EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); retry: @@ -749,23 +756,22 @@ retry: struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree_iter *iter, struct btree *b, + bool may_drop_locks, enum btree_node_sibling sib) { struct btree *parent; struct btree_node_iter node_iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - struct btree *ret; + struct btree *ret = NULL; unsigned level = b->level; parent = btree_iter_node(iter, level + 1); if (!parent) return NULL; - if (!bch2_btree_node_relock(iter, level + 1)) { - bch2_btree_iter_set_locks_want(iter, level + 2); - return ERR_PTR(-EINTR); - } + if (!bch2_btree_node_relock(iter, level + 1)) + goto out_upgrade; node_iter = iter->l[parent->level].iter; @@ -778,34 +784,66 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, : (bch2_btree_node_iter_advance(&node_iter, parent), bch2_btree_node_iter_peek_all(&node_iter, parent)); if (!k) - return NULL; + goto out; } while (bkey_deleted(k)); bch2_bkey_unpack(parent, &tmp.k, k); ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); - if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { - btree_node_unlock(iter, level); + if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) { + struct btree_iter *linked; - if (!bch2_btree_node_relock(iter, level + 1)) { - bch2_btree_iter_set_locks_want(iter, level + 2); - return ERR_PTR(-EINTR); + if (!bch2_btree_node_relock(iter, level + 1)) + goto out_upgrade; + + /* + * We might have got -EINTR because trylock failed, and we're + * holding other locks that would cause us to deadlock: + */ + for_each_linked_btree_iter(iter, linked) + if (btree_iter_cmp(iter, linked) < 0) + __bch2_btree_iter_unlock(linked); + + if (sib == btree_prev_sib) + btree_node_unlock(iter, level); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, + SIX_LOCK_intent); + + /* + * before btree_iter_relock() calls btree_iter_verify_locks(): + */ + if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level + 1); + + if (!bch2_btree_node_relock(iter, level)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + if (!IS_ERR(ret)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } } - ret = bch2_btree_node_get(c, iter, 
&tmp.k, level, SIX_LOCK_intent); + bch2_btree_iter_relock(iter); } +out: + if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level + 1); - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + bch2_btree_iter_verify_locks(iter); - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->lock); - ret = ERR_PTR(-EINTR); - } - } + BUG_ON((!may_drop_locks || !IS_ERR(ret)) && + (iter->uptodate >= BTREE_ITER_NEED_RELOCK || + !btree_node_locked(iter, level))); return ret; +out_upgrade: + if (may_drop_locks) + bch2_btree_iter_upgrade(iter, level + 2); + ret = ERR_PTR(-EINTR); + goto out; } void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index e021d6e9..43109d08 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, enum six_lock_type); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, + struct btree *, bool, enum btree_node_sibling); void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 02b14e38..969c1f19 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -148,6 +148,9 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, ? BCH_DATA_BTREE : BCH_DATA_USER; int ret = 0; + BUG_ON(journal_seq_verify(c) && + k.k->version.lo > journal_cur_seq(&c->journal)); + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c, "superblock not marked as containing replicas (type %u)", @@ -243,6 +246,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) unsigned max_stale; int ret = 0; + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + + if (!c->btree_roots[btree_id].b) + return 0; + /* * if expensive_debug_checks is on, run range_checks on all leaf nodes: */ @@ -454,7 +462,7 @@ static void bch2_gc_start(struct bch_fs *c) * Indicates to buckets code that gc is now in progress - done under * usage_lock to avoid racing with bch2_mark_key(): */ - __gc_pos_set(c, GC_POS_MIN); + __gc_pos_set(c, gc_phase(GC_PHASE_START)); /* Save a copy of the existing bucket stats while we recompute them: */ for_each_member_device(ca, c, i) { @@ -535,22 +543,18 @@ void bch2_gc(struct bch_fs *c) bch2_gc_start(c); - /* Walk btree: */ - while (c->gc_pos.phase < (int) BTREE_ID_NR) { - int ret = c->btree_roots[c->gc_pos.phase].b - ? 
bch2_gc_btree(c, (int) c->gc_pos.phase) - : 0; + bch2_mark_superblocks(c); + /* Walk btree: */ + for (i = 0; i < BTREE_ID_NR; i++) { + int ret = bch2_gc_btree(c, i); if (ret) { bch_err(c, "btree gc failed: %d", ret); set_bit(BCH_FS_GC_FAILURE, &c->flags); goto out; } - - gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); } - bch2_mark_superblocks(c); bch2_mark_pending_btree_node_frees(c); bch2_mark_allocator_buckets(c); @@ -780,13 +784,13 @@ next: bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); /* Insert the newly coalesced nodes */ - bch2_btree_insert_node(as, parent, iter, &keylist); + bch2_btree_insert_node(as, parent, iter, &keylist, 0); BUG_ON(!bch2_keylist_empty(&keylist)); BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); - BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0])); + bch2_btree_iter_node_replace(iter, new_nodes[0]); for (i = 0; i < nr_new_nodes; i++) bch2_btree_open_bucket_put(c, new_nodes[i]); @@ -1003,6 +1007,8 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) btree_node_range_checks_init(&r, 0); + gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0)); + if (!c->btree_roots[id].b) return 0; @@ -1041,36 +1047,33 @@ err: return bch2_btree_iter_unlock(&iter) ?: ret; } -static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal) +int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { unsigned iter = 0; enum btree_id id; - int ret; + int ret = 0; - mutex_lock(&c->sb_lock); - if (!bch2_sb_get_replicas(c->disk_sb.sb)) { - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - mutex_unlock(&c->sb_lock); + down_write(&c->gc_lock); again: bch2_gc_start(c); + bch2_mark_superblocks(c); + for (id = 0; id < BTREE_ID_NR; id++) { ret = bch2_initial_gc_btree(c, id); if (ret) - return ret; + goto err; } ret = bch2_journal_mark(c, journal); if (ret) - return ret; + goto err; if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); - return -EINVAL; + ret = -EINVAL; + goto err; } bch_info(c, "Fixed gens, restarting initial mark and sweep:"); @@ -1085,21 +1088,9 @@ again: if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - bch2_mark_superblocks(c); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - - return 0; -} - -int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) -{ - int ret; - - down_write(&c->gc_lock); - ret = __bch2_initial_gc(c, journal); +err: up_write(&c->gc_lock); - return ret; } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 4d1ab9db..214a3fe3 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -46,8 +46,6 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) }; } -#define GC_POS_MIN gc_phase(0) - static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { if (l.phase != r.phase) @@ -59,17 +57,23 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) return 0; } +static inline struct gc_pos gc_pos_btree(enum btree_id id, + struct bpos pos, unsigned level) +{ + return (struct gc_pos) { + .phase = GC_PHASE_BTREE_EXTENTS + id, + .pos = pos, + .level = level, + }; +} + /* * GC position of the pointers within a btree node: note, _not_ for &b->key * itself, that lives in the parent node: */ static inline struct gc_pos gc_pos_btree_node(struct btree *b) { - return (struct gc_pos) { - .phase = b->btree_id, - .pos = b->key.k.p, - .level = b->level, - }; + return 
gc_pos_btree(b->btree_id, b->key.k.p, b->level); } /* @@ -81,11 +85,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) */ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) { - return (struct gc_pos) { - .phase = (int) id, - .pos = POS_MAX, - .level = U8_MAX, - }; + return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); } static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 74ffad4c..0c825bcb 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -920,7 +920,7 @@ static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i, char *out = buf, *end = buf + len; out += scnprintf(out, end - out, - "error validating btree node %s " + "error validating btree node %s" "at btree %u level %u/%u\n" "pos %llu:%llu node offset %u", write ? "before write " : "", @@ -1120,7 +1120,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); btree_err(BTREE_ERR_FIXABLE, c, b, i, - "invalid bkey:\n%s\n%s", buf, invalid); + "invalid bkey:\n%s\n%s", invalid, buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 95ee9f61..682a9143 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -34,11 +34,9 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) EBUG_ON(iter->l[b->level].b != b); EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); - for_each_linked_btree_node(iter, b, linked) + for_each_btree_iter_with_node(iter, b, linked) linked->lock_seq[b->level] += 2; - iter->lock_seq[b->level] += 2; - six_unlock_write(&b->lock); } @@ -48,6 +46,8 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; unsigned readers = 0; + EBUG_ON(btree_node_read_locked(iter, b->level)); + for_each_linked_btree_iter(iter, linked) if (linked->l[b->level].b == b && btree_node_read_locked(linked, b->level)) @@ -66,15 +66,30 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) &b->lock.state.counter); } -bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +/* + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +static inline bool btree_node_lock_increment(struct btree_iter *iter, + struct btree *b, unsigned level, + enum btree_node_locked_type want) { struct btree_iter *linked; - struct btree *b = iter->l[level].b; - int want = btree_lock_want(iter, level); - int have = btree_node_locked_type(iter, level); - if (want == have) - return true; + for_each_linked_btree_iter(iter, linked) + if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= want) { + six_lock_increment(&b->lock, want); + return true; + } + + return false; +} + +bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +{ + struct btree *b = iter->l[level].b; + int want = __btree_lock_want(iter, level); if (!is_btree_node(iter, level)) return false; @@ -82,42 +97,83 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) if (race_fault()) return false; - if (have != BTREE_NODE_UNLOCKED - ? 
six_trylock_convert(&b->lock, have, want) - : six_relock_type(&b->lock, want, iter->lock_seq[level])) - goto success; + if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) && + !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + btree_node_lock_increment(iter, b, level, want))) + return false; - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) == want && - iter->lock_seq[level] == b->lock.state.seq) { - btree_node_unlock(iter, level); - six_lock_increment(&b->lock, want); - goto success; - } - - return false; -success: - mark_btree_node_unlocked(iter, level); mark_btree_node_locked(iter, level, want); return true; } -bool bch2_btree_iter_relock(struct btree_iter *iter) +static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) { - unsigned l; + struct btree *b = iter->l[level].b; - for (l = iter->level; - l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b; - l++) - if (!bch2_btree_node_relock(iter, l)) { + EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); + + if (!is_btree_node(iter, level)) + return false; + + if (race_fault()) + return false; + + if (btree_node_intent_locked(iter, level)) + return true; + + if (btree_node_locked(iter, level) + ? six_lock_tryupgrade(&b->lock) + : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level])) + goto success; + + if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(iter, level); + goto success; + } + + return false; +success: + mark_btree_node_intent_locked(iter, level); + return true; +} + +static inline bool btree_iter_get_locks(struct btree_iter *iter, + bool upgrade) +{ + unsigned l = iter->level; + int fail_idx = -1; + + do { + if (!btree_iter_node(iter, l)) + break; + + if (!(upgrade + ? 
bch2_btree_node_upgrade(iter, l) + : bch2_btree_node_relock(iter, l))) { + fail_idx = l; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - return false; } + l++; + } while (l < iter->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_iter_traverse has to walk back up to + * the node that we failed to relock: + */ + while (fail_idx >= 0) { + btree_node_unlock(iter, fail_idx); + iter->l[fail_idx].b = BTREE_ITER_NOT_END; + --fail_idx; + } + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) iter->uptodate = BTREE_ITER_NEED_PEEK; - return true; + + bch2_btree_iter_verify_locks(iter); + return iter->uptodate < BTREE_ITER_NEED_RELOCK; } /* Slowpath: */ @@ -128,6 +184,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, { struct bch_fs *c = iter->c; struct btree_iter *linked; + bool ret = true; /* Can't have children locked before ancestors: */ EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); @@ -140,15 +197,11 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, EBUG_ON(type == SIX_LOCK_intent && iter->nodes_locked != iter->nodes_intent_locked); - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) == type) { - six_lock_increment(&b->lock, type); - return true; - } + if (btree_node_lock_increment(iter, b, level, type)) + return true; /* - * Must lock btree nodes in key order - this case hapens when locking + * Must lock btree nodes in key order - this case happens when locking * the prev sibling in btree node merging: */ if (iter->nodes_locked && @@ -160,6 +213,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (!linked->nodes_locked) continue; + /* We have to lock btree nodes in key order: */ + if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) + ret = false; + /* * Can't block taking an intent lock if we have _any_ nodes read * locked: @@ -175,15 +232,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { linked->locks_want = max_t(unsigned, - linked->locks_want, - iter->locks_want); - return false; + linked->locks_want, + __fls(linked->nodes_locked) + 1); + btree_iter_get_locks(linked, true); + ret = false; } - /* We have to lock btree nodes in key order: */ - if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) - return false; - /* * Interior nodes must be locked before their descendants: if * another iterator has possible descendants locked of the node @@ -194,82 +248,133 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, linked->locks_want = max_t(unsigned, linked->locks_want, iter->locks_want); - return false; + btree_iter_get_locks(linked, true); + ret = false; } } - __btree_node_lock_type(c, b, type); - return true; + if (ret) + __btree_node_lock_type(c, b, type); + return ret; } /* Btree iterator locking: */ -static void btree_iter_drop_extra_locks(struct btree_iter *iter) +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) > iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); - } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->lock); - iter->nodes_intent_locked ^= 1 << l; - } - break; - } + if (iter->uptodate == BTREE_ITER_END) { + BUG_ON(iter->nodes_locked); + return; + } + + for (l = 0; 
btree_iter_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) + continue; + + BUG_ON(btree_lock_want(iter, l) != + btree_node_locked_type(iter, l)); } } +#endif -bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter, - unsigned new_locks_want) +__flatten +static bool __bch2_btree_iter_relock(struct btree_iter *iter) +{ + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) + return true; + + if (iter->uptodate > BTREE_ITER_NEED_TRAVERSE) + return false; + + return btree_iter_get_locks(iter, false); +} + +bool bch2_btree_iter_relock(struct btree_iter *iter) +{ + struct btree_iter *linked; + bool ret = true; + + for_each_btree_iter(iter, linked) + ret &= __bch2_btree_iter_relock(linked); + + return ret; +} + +bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) { struct btree_iter *linked; - /* Drop locks we don't want anymore: */ - if (new_locks_want < iter->locks_want) - for_each_linked_btree_iter(iter, linked) - if (linked->locks_want > new_locks_want) { - linked->locks_want = max_t(unsigned, 1, - new_locks_want); - btree_iter_drop_extra_locks(linked); - } + EBUG_ON(iter->locks_want >= new_locks_want); iter->locks_want = new_locks_want; - btree_iter_drop_extra_locks(iter); - if (bch2_btree_iter_relock(iter)) + if (btree_iter_get_locks(iter, true)) return true; /* - * Just an optimization: ancestor nodes must be locked before child - * nodes, so set locks_want on iterators that might lock ancestors - * before us to avoid getting -EINTR later: + * Ancestor nodes must be locked before child nodes, so set locks_want + * on iterators that might lock ancestors before us to avoid getting + * -EINTR later: */ for_each_linked_btree_iter(iter, linked) if (linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0) - linked->locks_want = max_t(unsigned, linked->locks_want, - new_locks_want); + btree_iter_cmp(linked, iter) <= 0 && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_iter_get_locks(linked, true); + } + return false; } -static void __bch2_btree_iter_unlock(struct btree_iter *iter) +void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned downgrade_to) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + struct btree_iter *linked; + unsigned l; - while (iter->nodes_locked) - btree_node_unlock(iter, __ffs(iter->nodes_locked)); + /* + * We downgrade linked iterators as well because btree_iter_upgrade + * might have had to modify locks_want on linked iterators due to lock + * ordering: + */ + for_each_btree_iter(iter, linked) { + unsigned new_locks_want = downgrade_to ?: + (linked->flags & BTREE_ITER_INTENT ? 1 : 0); + + if (linked->locks_want <= new_locks_want) + continue; + + linked->locks_want = new_locks_want; + + while (linked->nodes_locked && + (l = __fls(linked->nodes_locked)) >= linked->locks_want) { + if (l > linked->level) { + btree_node_unlock(linked, l); + } else { + if (btree_node_intent_locked(linked, l)) { + six_lock_downgrade(&linked->l[l].b->lock); + linked->nodes_intent_locked ^= 1 << l; + } + break; + } + } + + bch2_btree_iter_verify_locks(linked); + } } int bch2_btree_iter_unlock(struct btree_iter *iter) { struct btree_iter *linked; - for_each_linked_btree_iter(iter, linked) + for_each_btree_iter(iter, linked) __bch2_btree_iter_unlock(linked); - __bch2_btree_iter_unlock(iter); return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; } @@ -320,11 +425,8 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - if (iter->l[b->level].b == b) - __bch2_btree_iter_verify(iter, b); - - for_each_linked_btree_node(iter, b, linked) - __bch2_btree_iter_verify(iter, b); + for_each_btree_iter_with_node(iter, b, linked) + __bch2_btree_iter_verify(linked, b); } #endif @@ -456,12 +558,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (iter->l[b->level].b == b) - __bch2_btree_node_iter_fix(iter, b, - &iter->l[b->level].iter, t, - where, clobber_u64s, new_u64s); - - for_each_linked_btree_node(iter, b, linked) + for_each_btree_iter_with_node(iter, b, linked) __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); @@ -613,11 +710,12 @@ static inline void btree_iter_node_set(struct btree_iter *iter, * A btree node is being replaced - update the iterator to point to the new * node: */ -bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) { + enum btree_node_locked_type t; struct btree_iter *linked; - for_each_linked_btree_iter(iter, linked) + for_each_btree_iter(iter, linked) if (btree_iter_pos_in_node(linked, b)) { /* * bch2_btree_iter_node_drop() has already been called - @@ -626,52 +724,28 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) */ BUG_ON(btree_node_locked(linked, b->level)); - /* - * If @linked wants this node read locked, we don't want - * to actually take the read lock now because it's not - * legal to hold read locks on other nodes while we take - * write locks, so the journal can make forward - * progress... 
- * - * Instead, btree_iter_node_set() sets things up so - * bch2_btree_node_relock() will succeed: - */ - - if (btree_want_intent(linked, b->level)) { - six_lock_increment(&b->lock, SIX_LOCK_intent); - mark_btree_node_intent_locked(linked, b->level); + t = btree_lock_want(linked, b->level); + if (t != BTREE_NODE_UNLOCKED) { + six_lock_increment(&b->lock, t); + mark_btree_node_locked(linked, b->level, t); } btree_iter_node_set(linked, b); } - if (!btree_iter_pos_in_node(iter, b)) { - six_unlock_intent(&b->lock); - return false; - } - - mark_btree_node_intent_locked(iter, b->level); - btree_iter_node_set(iter, b); - return true; -} - -void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - - for_each_linked_btree_iter(iter, linked) - bch2_btree_iter_node_drop(linked, b); + six_unlock_intent(&b->lock); } void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) { + struct btree_iter *linked; unsigned level = b->level; - if (iter->l[level].b == b) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - btree_node_unlock(iter, level); - iter->l[level].b = BTREE_ITER_NOT_END; - } + for_each_btree_iter(iter, linked) + if (linked->l[level].b == b) { + btree_node_unlock(linked, level); + linked->l[level].b = BTREE_ITER_NOT_END; + } } /* @@ -682,9 +756,8 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - for_each_linked_btree_node(iter, b, linked) + for_each_btree_iter_with_node(iter, b, linked) __btree_iter_init(linked, b); - __btree_iter_init(iter, b); } static inline int btree_iter_lock_root(struct btree_iter *iter, @@ -713,7 +786,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, return 0; } - lock_type = btree_lock_want(iter, iter->level); + lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, iter, lock_type))) return -EINTR; @@ -771,7 +844,7 @@ static inline int btree_iter_down(struct btree_iter *iter) struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; unsigned level = iter->level - 1; - enum six_lock_type lock_type = btree_lock_want(iter, level); + enum six_lock_type lock_type = __btree_lock_want(iter, level); BKEY_PADDED(k) tmp; BUG_ON(!btree_node_locked(iter, iter->level)); @@ -799,6 +872,12 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } +static void btree_iter_set_end(struct btree_iter *iter) +{ + iter->uptodate = BTREE_ITER_END; + __bch2_btree_iter_unlock(iter); +} + int __must_check __bch2_btree_iter_traverse(struct btree_iter *); static int btree_iter_traverse_error(struct btree_iter *iter, int ret) @@ -871,7 +950,7 @@ io_error: BUG_ON(ret != -EIO); iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = NULL; + iter->l[iter->level].b = BTREE_ITER_NOT_END; goto out; } @@ -888,9 +967,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { unsigned depth_want = iter->level; - if (unlikely(!iter->l[iter->level].b)) + if (unlikely(iter->uptodate == BTREE_ITER_END)) return 0; + BUG_ON(iter->level >= BTREE_MAX_DEPTH); + BUG_ON(!iter->l[iter->level].b); + iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; /* make sure we have all the intent locks we need - ugh */ @@ -959,6 +1041,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_verify_locks(iter); return 0; } @@ -966,13 +1049,15 @@ int __must_check 
bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - if (iter->uptodate < BTREE_ITER_NEED_RELOCK) + if (__bch2_btree_iter_relock(iter)) return 0; ret = __bch2_btree_iter_traverse(iter); if (unlikely(ret)) ret = btree_iter_traverse_error(iter, ret); + BUG_ON(ret == -EINTR && !btree_iter_linked(iter)); + return ret; } @@ -984,18 +1069,29 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) int ret; EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch2_btree_iter_verify_locks(iter); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return iter->l[iter->level].b; + + if (unlikely(iter->uptodate == BTREE_ITER_END)) + return NULL; ret = bch2_btree_iter_traverse(iter); if (ret) return ERR_PTR(ret); b = iter->l[iter->level].b; - - if (b) { - EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = b->key.k.p; + if (!b) { + btree_iter_set_end(iter); + return NULL; } + BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + + iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; + return b; } @@ -1005,24 +1101,39 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) int ret; EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch2_btree_iter_verify_locks(iter); btree_iter_up(iter); - if (!btree_iter_node(iter, iter->level)) + if (!btree_iter_node(iter, iter->level)) { + btree_iter_set_end(iter); return NULL; + } - /* parent node usually won't be locked: redo traversal if necessary */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (ret) - return NULL; + if (!bch2_btree_node_relock(iter, iter->level)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (ret) + return NULL; + } b = iter->l[iter->level].b; - if (!b) - return b; + BUG_ON(!b); if (bkey_cmp(iter->pos, b->key.k.p) < 0) { - /* Haven't gotten to the end of the parent node: */ + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ + + /* + * We don't really want to be unlocking here except we can't + * directly tell btree_iter_traverse() "traverse to this level" + * except by setting iter->level, so we have to unlock so we + * don't screw up our lock invariants: + */ + if (btree_node_read_locked(iter, iter->level)) + btree_node_unlock(iter, iter->level); /* ick: */ iter->pos = iter->btree_id == BTREE_ID_INODES @@ -1086,8 +1197,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); EBUG_ON(iter->flags & BTREE_ITER_SLOTS); - EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && - !btree_node_locked(iter, 0)); + bch2_btree_iter_verify_locks(iter); if (iter->uptodate == BTREE_ITER_UPTODATE) { struct bkey_packed *k = @@ -1117,7 +1227,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) /* got to the end of the leaf, iterator needs to be traversed: */ iter->pos = l->b->key.k.p; if (!bkey_cmp(iter->pos, POS_MAX)) { - iter->uptodate = BTREE_ITER_END; + btree_iter_set_end(iter); return bkey_s_c_null; } @@ -1144,7 +1254,7 @@ struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) iter->pos = l->b->key.k.p; if (!bkey_cmp(iter->pos, POS_MAX)) { - iter->uptodate = BTREE_ITER_END; + btree_iter_set_end(iter); return bkey_s_c_null; } @@ -1163,6 +1273,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); EBUG_ON(iter->flags & 
BTREE_ITER_SLOTS); + bch2_btree_iter_verify_locks(iter); if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { k = bch2_btree_iter_peek(iter); @@ -1225,7 +1336,7 @@ recheck: if (iter->flags & BTREE_ITER_IS_EXTENTS) { if (n.p.offset == KEY_OFFSET_MAX) { if (n.p.inode == KEY_INODE_MAX) { - iter->uptodate = BTREE_ITER_END; + btree_iter_set_end(iter); return bkey_s_c_null; } @@ -1259,8 +1370,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS)); - EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && - !btree_node_locked(iter, 0)); + bch2_btree_iter_verify_locks(iter); if (iter->uptodate == BTREE_ITER_UPTODATE) { struct bkey_s_c ret = { .k = &iter->k }; @@ -1286,6 +1396,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (iter->btree_id == BTREE_ID_EXTENTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS)); + bch2_btree_iter_verify_locks(iter); + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { @@ -1347,13 +1462,11 @@ void bch2_btree_iter_unlink(struct btree_iter *iter) if (!btree_iter_linked(iter)) return; - for_each_linked_btree_iter(iter, linked) { - + for_each_linked_btree_iter(iter, linked) if (linked->next == iter) { linked->next = iter->next; return; } - } BUG(); } @@ -1366,9 +1479,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) iter->next = new; if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - unsigned nr_iters = 1; + unsigned nr_iters = 0; - for_each_linked_btree_iter(iter, new) + for_each_btree_iter(iter, new) nr_iters++; BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 0097a2a2..99e51b27 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -28,40 +28,47 @@ static inline bool btree_iter_linked(const struct btree_iter *iter) return iter->next != iter; } -/** - * for_each_linked_btree_iter - iterate over all iterators linked with @_iter - */ -#define for_each_linked_btree_iter(_iter, _linked) \ - for ((_linked) = (_iter)->next; \ - (_linked) != (_iter); \ - (_linked) = (_linked)->next) +static inline bool __iter_has_node(const struct btree_iter *iter, + const struct btree *b) +{ + /* + * We don't compare the low bits of the lock sequence numbers because + * @iter might have taken a write lock on @b, and we don't want to skip + * the linked iterator if the sequence numbers were equal before taking + * that write lock. The lock sequence number is incremented by taking + * and releasing write locks and is even when unlocked: + */ + + return iter->l[b->level].b == b && + iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1; +} static inline struct btree_iter * -__next_linked_btree_node(struct btree_iter *iter, struct btree *b, - struct btree_iter *linked) +__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked) { - do { - linked = linked->next; + return linked->next != iter ? linked->next : NULL; +} - if (linked == iter) - return NULL; - - /* - * We don't compare the low bits of the lock sequence numbers - * because @iter might have taken a write lock on @b, and we - * don't want to skip the linked iterator if the sequence - * numbers were equal before taking that write lock. 
The lock - * sequence number is incremented by taking and releasing write - * locks and is even when unlocked: - */ - } while (linked->l[b->level].b != b || - linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1); +static inline struct btree_iter * +__next_iter_with_node(struct btree_iter *iter, struct btree *b, + struct btree_iter *linked) +{ + while (linked && !__iter_has_node(linked, b)) + linked = __next_linked_iter(iter, linked); return linked; } /** - * for_each_linked_btree_node - iterate over all iterators linked with @_iter + * for_each_btree_iter - iterate over all iterators linked with @_iter, + * including @_iter + */ +#define for_each_btree_iter(_iter, _linked) \ + for ((_linked) = (_iter); (_linked); \ + (_linked) = __next_linked_iter(_iter, _linked)) + +/** + * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter * that also point to @_b * * @_b is assumed to be locked by @_iter @@ -69,15 +76,27 @@ __next_linked_btree_node(struct btree_iter *iter, struct btree *b, * Filters out iterators that don't have a valid btree_node iterator for @_b - * i.e. iterators for which bch2_btree_node_relock() would not succeed. */ -#define for_each_linked_btree_node(_iter, _b, _linked) \ +#define for_each_btree_iter_with_node(_iter, _b, _linked) \ for ((_linked) = (_iter); \ - ((_linked) = __next_linked_btree_node(_iter, _b, _linked));) + ((_linked) = __next_iter_with_node(_iter, _b, _linked)); \ + (_linked) = __next_linked_iter(_iter, _linked)) + +/** + * for_each_linked_btree_iter - iterate over all iterators linked with @_iter, + * _not_ including @_iter + */ +#define for_each_linked_btree_iter(_iter, _linked) \ + for ((_linked) = (_iter)->next; \ + (_linked) != (_iter); \ + (_linked) = (_linked)->next) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +void bch2_btree_iter_verify_locks(struct btree_iter *); #else static inline void bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} + struct btree *b) {} +static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, @@ -85,22 +104,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct bkey_packed *, unsigned, unsigned); int bch2_btree_iter_unlock(struct btree_iter *); -bool __bch2_btree_iter_set_locks_want(struct btree_iter *, unsigned); -static inline bool bch2_btree_iter_set_locks_want(struct btree_iter *iter, - unsigned new_locks_want) +bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); + +static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (iter->locks_want == new_locks_want && - iter->nodes_intent_locked == (1 << new_locks_want) - 1) - return true; - - return __bch2_btree_iter_set_locks_want(iter, new_locks_want); + return iter->locks_want < new_locks_want + ? __bch2_btree_iter_upgrade(iter, new_locks_want) + : iter->uptodate <= BTREE_ITER_NEED_PEEK; } -bool bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -void bch2_btree_iter_node_drop_linked(struct btree_iter *, struct btree *); +void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); + +static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) +{ + if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 
1 : 0) + __bch2_btree_iter_downgrade(iter, 0); +} + +void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index f48084bc..1d975207 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -75,16 +75,23 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter, mark_btree_node_locked(iter, level, SIX_LOCK_intent); } -static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level) +static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) { return level < iter->locks_want ? SIX_LOCK_intent : SIX_LOCK_read; } -static inline bool btree_want_intent(struct btree_iter *iter, int level) +static inline enum btree_node_locked_type +btree_lock_want(struct btree_iter *iter, int level) { - return btree_lock_want(iter, level) == SIX_LOCK_intent; + if (level < iter->level) + return BTREE_NODE_UNLOCKED; + if (level < iter->locks_want) + return BTREE_NODE_INTENT_LOCKED; + if (level == iter->level) + return BTREE_NODE_READ_LOCKED; + return BTREE_NODE_UNLOCKED; } static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) @@ -98,6 +105,14 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } +static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) +{ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + while (iter->nodes_locked) + btree_node_unlock(iter, __ffs(iter->nodes_locked)); +} + static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) { switch (type) { @@ -150,8 +165,11 @@ bool __bch2_btree_node_relock(struct btree_iter *, unsigned); static inline bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level) { - return likely(btree_lock_want(iter, level) == - btree_node_locked_type(iter, level)) || + EBUG_ON(btree_node_locked(iter, level) && + btree_node_locked_type(iter, level) != + __btree_lock_want(iter, level)); + + return likely(btree_node_locked(iter, level)) || __bch2_btree_node_relock(iter, level); } diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index f357095d..aac97958 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -85,31 +85,49 @@ int __bch2_btree_insert_at(struct btree_insert *); __VA_ARGS__ \ }}) +enum { + __BTREE_INSERT_ATOMIC, + __BTREE_INSERT_NOUNLOCK, + __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_USE_RESERVE, + __BTREE_INSERT_USE_ALLOC_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + /* - * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent - * locks, -EAGAIN if need to wait on btree reserve + * Don't drop/retake locks before doing btree update, instead return -EINTR if + * we had to drop locks for any reason */ -#define BTREE_INSERT_ATOMIC (1 << 0) +#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) + +/* + * Don't drop locks _after_ successfully updating btree: + */ +#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) /* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL (1 << 1) +#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) /* for copygc, or when merging btree nodes */ -#define BTREE_INSERT_USE_RESERVE (1 << 2) -#define 
BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3) +#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) +#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) /* * Insert is for journal replay: don't get journal reservations, or mark extents * (bch_mark_key) */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4) +#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) /* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT (1 << 5) -#define BTREE_INSERT_GC_LOCK_HELD (1 << 6) +#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) -#define BCH_HASH_SET_MUST_CREATE (1 << 7) -#define BCH_HASH_SET_MUST_REPLACE (1 << 8) +#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) int bch2_btree_delete_at(struct btree_iter *, unsigned); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 92e19c4e..3e13f784 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -223,8 +223,7 @@ found: mutex_unlock(&c->btree_interior_update_lock); } -static void __btree_node_free(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static void __btree_node_free(struct bch_fs *c, struct btree *b) { trace_btree_node_free(c, b); @@ -237,21 +236,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, clear_btree_node_noevict(b); - btree_node_lock_type(c, b, SIX_LOCK_write); - bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - - /* - * By using six_unlock_write() directly instead of - * bch2_btree_node_unlock_write(), we don't update the iterator's - * sequence numbers and cause future bch2_btree_node_relock() calls to - * fail: - */ - six_unlock_write(&b->lock); } void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) @@ -264,7 +253,9 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) clear_btree_node_dirty(b); - __btree_node_free(c, b, NULL); + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->lock); bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); } @@ -283,9 +274,9 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, */ btree_update_drop_new_node(c, b); - bch2_btree_iter_node_drop_linked(iter, b); - - __btree_node_free(c, b, iter); + __bch2_btree_node_lock_write(b, iter); + __btree_node_free(c, b); + six_unlock_write(&b->lock); bch2_btree_iter_node_drop(iter, b); } @@ -499,7 +490,9 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser bch2_btree_open_bucket_put(c, b); } - __btree_node_free(c, b, NULL); + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->lock); six_unlock_intent(&b->lock); } @@ -1362,7 +1355,8 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, } static void btree_split(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) + struct btree_iter *iter, struct keylist *keys, + unsigned flags) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(iter, b); @@ -1425,7 +1419,7 @@ static void btree_split(struct btree_update *as, struct btree *b, if (parent) { /* Split a non root node */ - 
bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); } else if (n3) { bch2_btree_set_root(as, n3, iter); } else { @@ -1491,9 +1485,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, btree_update_updated_node(as, b); - for_each_linked_btree_node(iter, b, linked) + for_each_btree_iter_with_node(iter, b, linked) bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); - bch2_btree_node_iter_peek(&iter->l[b->level].iter, b); bch2_btree_iter_verify(iter, b); } @@ -1511,7 +1504,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * for leaf nodes -- inserts into interior nodes have to be atomic. */ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) + struct btree_iter *iter, struct keylist *keys, + unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1551,14 +1545,14 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, btree_node_interior_verify(b); - bch2_foreground_maybe_merge(c, iter, b->level); + bch2_foreground_maybe_merge(c, iter, b->level, flags); return; split: - btree_split(as, b, iter, keys); + btree_split(as, b, iter, keys, flags); } int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, - unsigned btree_reserve_flags) + unsigned flags) { struct btree *b = iter->l[0].b; struct btree_update *as; @@ -1570,16 +1564,17 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * We already have a disk reservation and open buckets pinned; this * allocation must not block: */ - for_each_linked_btree_iter(iter, linked) + for_each_btree_iter(iter, linked) if (linked->btree_id == BTREE_ID_EXTENTS) - btree_reserve_flags |= BTREE_INSERT_USE_RESERVE; - if (iter->btree_id == BTREE_ID_EXTENTS) - btree_reserve_flags |= BTREE_INSERT_USE_RESERVE; + flags |= BTREE_INSERT_USE_RESERVE; closure_init_stack(&cl); /* Hack, because gc and splitting nodes doesn't mix yet: */ if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + return -EINTR; + bch2_btree_iter_unlock(iter); down_read(&c->gc_lock); @@ -1591,39 +1586,43 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ret = -EINTR; goto out; } as = bch2_btree_update_start(c, iter->btree_id, - btree_update_reserve_required(c, b), - btree_reserve_flags, &cl); + btree_update_reserve_required(c, b), flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); if (IS_ERR(as)) { ret = PTR_ERR(as); if (ret == -EAGAIN) { + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); bch2_btree_iter_unlock(iter); - up_read(&c->gc_lock); - closure_sync(&cl); - return -EINTR; + ret = -EINTR; } goto out; } - btree_split(as, b, iter, NULL); + btree_split(as, b, iter, NULL, flags); bch2_btree_update_done(as); - bch2_btree_iter_set_locks_want(iter, 1); + /* + * We haven't successfully inserted yet, so don't downgrade all the way + * back to read locks; + */ + __bch2_btree_iter_downgrade(iter, 1); out: up_read(&c->gc_lock); closure_sync(&cl); return ret; } -int __bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - enum btree_node_sibling sib) +void __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) { struct btree_update *as; struct bkey_format_state new_s; @@ -1636,29 +1635,29 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c, closure_init_stack(&cl); retry: - if (!bch2_btree_node_relock(iter, level)) - return 0; + BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; parent = btree_node_parent(iter, b); if (!parent) - return 0; + goto out; if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - return 0; + goto out; /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, sib); + m = bch2_btree_node_get_sibling(c, iter, b, + !(flags & BTREE_INSERT_NOUNLOCK), sib); if (IS_ERR(m)) { ret = PTR_ERR(m); - goto out; + goto err; } /* NULL means no sibling: */ if (!m) { b->sib_u64s[sib] = U16_MAX; - return 0; + goto out; } if (sib == btree_prev_sib) { @@ -1688,33 +1687,26 @@ retry: if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { six_unlock_intent(&m->lock); - return 0; - } - - /* We're changing btree topology, doesn't mix with gc: */ - if (!down_read_trylock(&c->gc_lock)) { - six_unlock_intent(&m->lock); - bch2_btree_iter_unlock(iter); - - down_read(&c->gc_lock); - up_read(&c->gc_lock); - ret = -EINTR; goto out; } - if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) { + /* We're changing btree topology, doesn't mix with gc: */ + if (!down_read_trylock(&c->gc_lock)) + goto err_cycle_gc_lock; + + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ret = -EINTR; - goto out_unlock; + goto err_unlock; } as = bch2_btree_update_start(c, iter->btree_id, btree_update_reserve_required(c, parent) + 1, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, - &cl); + !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); if (IS_ERR(as)) { ret = PTR_ERR(as); - goto out_unlock; + goto err_unlock; } trace_btree_merge(c, b); @@ -1744,7 +1736,7 @@ retry: bch2_btree_node_write(c, n, SIX_LOCK_intent); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); bch2_btree_open_bucket_put(c, n); bch2_btree_node_free_inmem(c, b, iter); @@ -1754,26 +1746,53 @@ retry: bch2_btree_iter_verify(iter, n); bch2_btree_update_done(as); -out_unlock: - if (ret != -EINTR && ret != -EAGAIN) - bch2_btree_iter_set_locks_want(iter, 1); + six_unlock_intent(&m->lock); up_read(&c->gc_lock); out: - if (ret == -EAGAIN || ret == -EINTR) { - bch2_btree_iter_unlock(iter); - ret = -EINTR; - } - + /* + * Don't downgrade locks here: we're called after successful insert, + * and the caller will downgrade locks after a successful insert + * anyways (in case e.g. 
a split was required first) + * + * And we're also called when inserting into interior nodes in the + * split path, and downgrading to read locks in there is potentially + * confusing: + */ closure_sync(&cl); + return; - if (ret == -EINTR) { +err_cycle_gc_lock: + six_unlock_intent(&m->lock); + + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + + bch2_btree_iter_unlock(iter); + + down_read(&c->gc_lock); + up_read(&c->gc_lock); + ret = -EINTR; + goto err; + +err_unlock: + six_unlock_intent(&m->lock); + up_read(&c->gc_lock); +err: + BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); + + if ((ret == -EAGAIN || ret == -EINTR) && + !(flags & BTREE_INSERT_NOUNLOCK)) { + bch2_btree_iter_unlock(iter); + closure_sync(&cl); ret = bch2_btree_iter_traverse(iter); - if (!ret) - goto retry; + if (ret) + goto out; + + goto retry; } - return ret; + goto out; } static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, @@ -1806,7 +1825,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); } else { bch2_btree_set_root(as, n, iter); } @@ -1815,7 +1834,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_free_inmem(c, b, iter); - BUG_ON(!bch2_btree_iter_node_replace(iter, n)); + bch2_btree_iter_node_replace(iter, n); bch2_btree_update_done(as); return 0; @@ -1830,7 +1849,6 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, __le64 seq, unsigned flags) { - unsigned locks_want = iter->locks_want; struct closure cl; struct btree *b; int ret; @@ -1839,7 +1857,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - bch2_btree_iter_set_locks_want(iter, U8_MAX); + bch2_btree_iter_upgrade(iter, U8_MAX); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { @@ -1866,7 +1884,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_sync(&cl); } - bch2_btree_iter_set_locks_want(iter, locks_want); + bch2_btree_iter_downgrade(iter); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) up_read(&c->gc_lock); @@ -1920,7 +1938,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, } bch2_keylist_add(&as->parent_keys, &new_key->k_i); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1982,6 +2000,9 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) + return -EINTR; + if (!down_read_trylock(&c->gc_lock)) { bch2_btree_iter_unlock(iter); down_read(&c->gc_lock); @@ -2041,6 +2062,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err_free_update; __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); + + bch2_btree_iter_downgrade(iter); err: if (new_hash) { mutex_lock(&c->btree_cache.lock); diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index abf14e4c..3a17de5c 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -146,35 +146,51 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); 
void bch2_btree_insert_node(struct btree_update *, struct btree *, - struct btree_iter *, struct keylist *); + struct btree_iter *, struct keylist *, + unsigned); int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, - unsigned, enum btree_node_sibling); +void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); -static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, +static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree_iter *iter, - unsigned level, + unsigned level, unsigned flags, enum btree_node_sibling sib) { struct btree *b; + /* + * iterators are inconsistent when they hit end of leaf, until + * traversed again + * + * XXX inconsistent how? + */ + if (iter->flags & BTREE_ITER_AT_END_OF_LEAF) + return; + + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return; + if (!bch2_btree_node_relock(iter, level)) - return 0; + return; b = iter->l[level].b; if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - return 0; + return; - return __bch2_foreground_maybe_merge(c, iter, level, sib); + __bch2_foreground_maybe_merge(c, iter, level, flags, sib); } static inline void bch2_foreground_maybe_merge(struct bch_fs *c, struct btree_iter *iter, - unsigned level) + unsigned level, + unsigned flags) { - bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib); - bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib); + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_prev_sib); + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_next_sib); } void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index cc41140f..a62d8307 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -227,19 +227,36 @@ btree_insert_key_leaf(struct btree_insert *trans, return ret; } +#define trans_for_each_entry(trans, i) \ + for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) + +/* + * We sort transaction entries so that if multiple iterators point to the same + * leaf node they'll be adjacent: + */ static bool same_leaf_as_prev(struct btree_insert *trans, struct btree_insert_entry *i) { - /* - * Because we sorted the transaction entries, if multiple iterators - * point to the same leaf node they'll always be adjacent now: - */ return i != trans->entries && i[0].iter->l[0].b == i[-1].iter->l[0].b; } -#define trans_for_each_entry(trans, i) \ - for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) +static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans, + struct btree_insert_entry *i) +{ + struct btree *b = i->iter->l[0].b; + + do { + i++; + } while (i < trans->entries + trans->nr && b == i->iter->l[0].b); + + return i; +} + +#define trans_for_each_leaf(trans, i) \ + for ((i) = (trans)->entries; \ + (i) < (trans)->entries + (trans)->nr; \ + (i) = trans_next_leaf(trans, i)) inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) @@ -262,19 +279,16 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) { struct btree_insert_entry *i; - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, - i->iter); + trans_for_each_leaf(trans, i) + 
bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); } static void multi_unlock_write(struct btree_insert *trans) { struct btree_insert_entry *i; - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + trans_for_each_leaf(trans, i) + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); } static inline int btree_trans_cmp(struct btree_insert_entry l, @@ -285,6 +299,107 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, /* Normal update interface: */ +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_btree_insert_at(struct btree_insert *trans, + struct btree_iter **split, + bool *cycle_gc_lock) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + unsigned u64s; + int ret; + + trans_for_each_entry(trans, i) + BUG_ON(i->done); + + u64s = 0; + trans_for_each_entry(trans, i) + u64s += jset_u64s(i->k->k.u64s + i->extra_res); + + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) + ? bch2_journal_res_get(&c->journal, + &trans->journal_res, + u64s, u64s) + : 0; + if (ret) + return ret; + + multi_lock_write(c, trans); + + if (race_fault()) { + ret = -EINTR; + goto out; + } + + u64s = 0; + trans_for_each_entry(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + /* + * bch2_btree_node_insert_fits() must be called under write lock: + * with only an intent lock, another thread can still call + * bch2_btree_node_write(), converting an unwritten bset to a + * written one + */ + u64s += i->k->k.u64s + i->extra_res; + if (!bch2_btree_node_insert_fits(c, + i->iter->l[0].b, u64s)) { + ret = -EINTR; + *split = i->iter; + goto out; + } + } + + if (journal_seq_verify(c) && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + trans_for_each_entry(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + + trans_for_each_entry(trans, i) { + switch (btree_insert_key_leaf(trans, i)) { + case BTREE_INSERT_OK: + i->done = true; + break; + case BTREE_INSERT_JOURNAL_RES_FULL: + case BTREE_INSERT_NEED_TRAVERSE: + case BTREE_INSERT_NEED_RESCHED: + ret = -EINTR; + break; + case BTREE_INSERT_BTREE_NODE_FULL: + ret = -EINTR; + *split = i->iter; + break; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + break; + case BTREE_INSERT_NEED_GC_LOCK: + ret = -EINTR; + *cycle_gc_lock = true; + break; + default: + BUG(); + } + + /* + * If we did some work (i.e. 
inserted part of an extent), + * we have to do all the other updates as well: + */ + if (!trans->did_work && (ret || *split)) + break; + } +out: + multi_unlock_write(trans); + bch2_journal_res_put(&c->journal, &trans->journal_res); + + return ret; +} + /** * __bch_btree_insert_at - insert keys at given iterator positions * @@ -300,194 +415,142 @@ int __bch2_btree_insert_at(struct btree_insert *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *split = NULL; + struct btree_iter *linked, *split = NULL; bool cycle_gc_lock = false; - unsigned u64s; + unsigned flags; int ret; + for_each_btree_iter(trans->entries[0].iter, linked) + bch2_btree_iter_verify_locks(linked); + + /* for the sake of sanity: */ + BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + trans_for_each_entry(trans, i) { BUG_ON(i->iter->level); BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); BUG_ON(debug_check_bkeys(c) && bch2_bkey_invalid(c, i->iter->btree_id, bkey_i_to_s_c(i->k))); + BUG_ON(i->iter->uptodate == BTREE_ITER_END); } bubble_sort(trans->entries, trans->nr, btree_trans_cmp); if (unlikely(!percpu_ref_tryget(&c->writes))) return -EROFS; -retry_locks: - ret = -EINTR; - trans_for_each_entry(trans, i) { - if (!bch2_btree_iter_set_locks_want(i->iter, 1)) - goto err; - - if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) { - ret = bch2_btree_iter_traverse(i->iter); - if (ret) - goto err; - } - } retry: - trans->did_work = false; - u64s = 0; - trans_for_each_entry(trans, i) - if (!i->done) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - - ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) - ? bch2_journal_res_get(&c->journal, - &trans->journal_res, - u64s, u64s) - : 0; - if (ret) - goto err; - - multi_lock_write(c, trans); - - if (race_fault()) { - ret = -EINTR; - goto unlock; - } - - u64s = 0; - trans_for_each_entry(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - /* - * bch2_btree_node_insert_fits() must be called under write lock: - * with only an intent lock, another thread can still call - * bch2_btree_node_write(), converting an unwritten bset to a - * written one - */ - if (!i->done) { - u64s += i->k->k.u64s + i->extra_res; - if (!bch2_btree_node_insert_fits(c, - i->iter->l[0].b, u64s)) { - split = i->iter; - goto unlock; - } - } - } - - ret = 0; split = NULL; cycle_gc_lock = false; trans_for_each_entry(trans, i) { - if (i->done) - continue; - - switch (btree_insert_key_leaf(trans, i)) { - case BTREE_INSERT_OK: - i->done = true; - break; - case BTREE_INSERT_JOURNAL_RES_FULL: - case BTREE_INSERT_NEED_TRAVERSE: + if (!bch2_btree_iter_upgrade(i->iter, 1)) { ret = -EINTR; - break; - case BTREE_INSERT_NEED_RESCHED: - ret = -EAGAIN; - break; - case BTREE_INSERT_BTREE_NODE_FULL: - split = i->iter; - break; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; - break; - case BTREE_INSERT_NEED_GC_LOCK: - cycle_gc_lock = true; - ret = -EINTR; - break; - default: - BUG(); + goto err; } - if (!trans->did_work && (ret || split)) - break; + if (i->iter->flags & BTREE_ITER_ERROR) { + ret = -EIO; + goto err; + } } -unlock: - multi_unlock_write(trans); - bch2_journal_res_put(&c->journal, &trans->journal_res); - if (split) - goto split; - if (ret) + ret = do_btree_insert_at(trans, &split, &cycle_gc_lock); + if (unlikely(ret)) goto err; + trans_for_each_leaf(trans, i) + bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + 
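/*
 * The update path above relies on bubble_sort()ing the transaction entries
 * so that iterators pointing at the same leaf node end up adjacent, which is
 * what lets trans_for_each_leaf() visit each distinct leaf exactly once:
 * multi_lock_write() then takes one write lock per leaf, and the loop just
 * above calls bch2_foreground_maybe_merge() once per leaf rather than once
 * per entry.  A self-contained sketch of that grouping pattern with invented
 * toy types; the real code sorts by iterator position, while this sketch
 * sorts directly by the leaf pointer to keep things short:
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_leaf  { int id; };
struct toy_entry { struct toy_leaf *leaf; int key; };	/* leaf stands in for iter->l[0].b */

static int entry_cmp(const void *l, const void *r)
{
	uintptr_t a = (uintptr_t) ((const struct toy_entry *) l)->leaf;
	uintptr_t b = (uintptr_t) ((const struct toy_entry *) r)->leaf;

	return (a > b) - (a < b);
}

/* assumes the array is sorted, so entries with the same leaf are adjacent */
static struct toy_entry *next_leaf(struct toy_entry *i, struct toy_entry *end)
{
	struct toy_leaf *b = i->leaf;

	do {
		i++;
	} while (i < end && i->leaf == b);

	return i;
}

int main(void)
{
	struct toy_leaf  l1 = { 1 }, l2 = { 2 };
	struct toy_entry e[] = { { &l2, 10 }, { &l1, 1 }, { &l2, 11 }, { &l1, 2 } };
	struct toy_entry *end = e + sizeof(e) / sizeof(e[0]), *i;

	qsort(e, end - e, sizeof(e[0]), entry_cmp);

	for (i = e; i < end; i = next_leaf(i, end))
		printf("lock/merge leaf %d exactly once\n", i->leaf->id);

	return 0;
}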
trans_for_each_entry(trans, i) - if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF) - goto out; - - trans_for_each_entry(trans, i) { - /* - * iterators are inconsistent when they hit end of leaf, until - * traversed again - */ - if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE && - !same_leaf_as_prev(trans, i)) - bch2_foreground_maybe_merge(c, i->iter, 0); - } + bch2_btree_iter_downgrade(i->iter); out: - /* make sure we didn't lose an error: */ - if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_entry(trans, i) - BUG_ON(!i->done); - percpu_ref_put(&c->writes); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + /* make sure we didn't drop or screw up locks: */ + for_each_btree_iter(trans->entries[0].iter, linked) { + bch2_btree_iter_verify_locks(linked); + BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) && + trans->did_work && + linked->uptodate >= BTREE_ITER_NEED_RELOCK); + } + + /* make sure we didn't lose an error: */ + if (!ret) + trans_for_each_entry(trans, i) + BUG_ON(!i->done); + } + + BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + return ret; -split: - /* - * have to drop journal res before splitting, because splitting means - * allocating new btree nodes, and holding a journal reservation - * potentially blocks the allocator: - */ - ret = bch2_btree_split_leaf(c, split, trans->flags); - - /* - * This can happen when we insert part of an extent - with an update - * with multiple keys, we don't want to redo the entire update - that's - * just too confusing: - */ - if (!ret && - (trans->flags & BTREE_INSERT_ATOMIC) && - trans->did_work) - ret = -EINTR; - - if (ret) - goto err; - - /* - * if the split didn't have to drop locks the insert will still be - * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() - * and is overwriting won't have changed) - */ - goto retry_locks; err: + flags = trans->flags; + + /* + * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree + * update; if we haven't done anything yet it doesn't apply + */ + if (!trans->did_work) + flags &= ~BTREE_INSERT_NOUNLOCK; + + if (split) { + ret = bch2_btree_split_leaf(c, split, flags); + + /* + * if the split succeeded without dropping locks the insert will + * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the + * caller peeked() and is overwriting won't have changed) + */ +#if 0 + /* + * XXX: + * split -> btree node merging (of parent node) might still drop + * locks when we're not passing it BTREE_INSERT_NOUNLOCK + */ + if (!ret && !trans->did_work) + goto retry; +#endif + + /* + * don't care if we got ENOSPC because we told split it + * couldn't block: + */ + if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) + ret = -EINTR; + } + if (cycle_gc_lock) { - down_read(&c->gc_lock); + if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + + bch2_btree_iter_unlock(trans->entries[0].iter); + down_read(&c->gc_lock); + } up_read(&c->gc_lock); } if (ret == -EINTR) { + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + trans_for_each_entry(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { ret = ret2; goto out; } + + BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); } /* * BTREE_ITER_ATOMIC means we have to return -EINTR if we * dropped locks: */ - if (!(trans->flags & BTREE_INSERT_ATOMIC)) + if (!(flags & BTREE_INSERT_ATOMIC)) goto retry; } @@ -549,7 +612,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); ret = bch2_btree_insert_at(c, 
disk_res, hook, journal_seq, flags, - BTREE_INSERT_ENTRY(&iter, k)); + BTREE_INSERT_ENTRY(&iter, k)); bch2_btree_iter_unlock(&iter); return ret; @@ -584,6 +647,11 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, if (bkey_cmp(iter.pos, end) >= 0) break; + if (k.k->type == KEY_TYPE_DISCARD) { + bch2_btree_iter_next(&iter); + continue; + } + bkey_init(&delete.k); /* @@ -615,8 +683,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, } ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete)); + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &delete)); if (ret) break; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index b17189ee..43112445 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -358,8 +358,9 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, old.data_type != new.data_type) { BUG_ON(!c); bch2_fs_inconsistent(c, - "different types of data in same bucket: %u, %u", - old.data_type, new.data_type); + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[new.data_type]); } dev_usage = this_cpu_ptr(ca->usage_percpu); diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 2690cc4b..031b36f3 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -109,14 +109,6 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c, return true; } -static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -}; - /* returns true if not equal */ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) { diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index df9913f8..36dca6b2 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -12,7 +12,8 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { - unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent); + unsigned len = bkey_val_bytes(d.k) - + offsetof(struct bch_dirent, d_name); while (len && !d.v->d_name[len - 1]) --len; @@ -22,7 +23,8 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) static unsigned dirent_val_u64s(unsigned len) { - return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64)); + return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, + sizeof(u64)); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -98,7 +100,7 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) return "value too big"; - if (len > NAME_MAX) + if (len > BCH_NAME_MAX) return "dirent name too big"; if (memchr(d.v->d_name, '/', len)) @@ -141,9 +143,14 @@ static struct bkey_i_dirent *dirent_create_key(u8 type, struct bkey_i_dirent *dirent; unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + BUG_ON(u64s > U8_MAX); + dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS); if (!dirent) - return NULL; + return ERR_PTR(-ENOMEM); bkey_dirent_init(&dirent->k_i); dirent->k.u64s = u64s; @@ -153,7 +160,8 @@ static struct bkey_i_dirent *dirent_create_key(u8 type, memcpy(dirent->v.d_name, name->name, name->len); memset(dirent->v.d_name + name->len, 0, bkey_val_bytes(&dirent->k) - - (sizeof(struct bch_dirent) + name->len)); + offsetof(struct bch_dirent, d_name) - + name->len); 
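/*
 * The dirent changes above size the value from offsetof(struct bch_dirent,
 * d_name) rather than sizeof(struct bch_dirent), round the result up to
 * whole u64s, and zero the padding so that bch2_dirent_name_bytes() can
 * recover the name length by trimming trailing NULs.  A self-contained
 * sketch of that round trip, using an invented toy struct rather than the
 * real bch_dirent layout:
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_dirent {
	uint64_t d_inum;
	uint8_t  d_type;
	char     d_name[];
};

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned toy_dirent_val_u64s(unsigned len)
{
	return DIV_ROUND_UP(offsetof(struct toy_dirent, d_name) + len,
			    sizeof(uint64_t));
}

int main(void)
{
	const char *name = "hello";
	unsigned len = strlen(name);
	unsigned val_bytes = toy_dirent_val_u64s(len) * sizeof(uint64_t);
	uint64_t buf[8] = { 0 };
	struct toy_dirent *d = (void *) buf;

	memcpy(d->d_name, name, len);
	/* zero the tail of the value, as dirent_create_key() does */
	memset(d->d_name + len, 0,
	       val_bytes - offsetof(struct toy_dirent, d_name) - len);

	/* recover the length by trimming trailing NULs, as bch2_dirent_name_bytes() does */
	len = val_bytes - offsetof(struct toy_dirent, d_name);
	while (len && !d->d_name[len - 1])
		--len;

	printf("%u value bytes, recovered name length %u\n", val_bytes, len);
	return 0;
}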
EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); @@ -169,8 +177,8 @@ int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, int ret; dirent = dirent_create_key(type, name, dst_inum); - if (!dirent) - return -ENOMEM; + if (IS_ERR(dirent)) + return PTR_ERR(dirent); ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum, journal_seq, &dirent->k_i, flags); @@ -204,7 +212,7 @@ int bch2_dirent_rename(struct bch_fs *c, struct bpos src_pos = bch2_dirent_pos(src_dir, src_name); struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); bool need_whiteout; - int ret = -ENOMEM; + int ret; bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -218,15 +226,19 @@ int bch2_dirent_rename(struct bch_fs *c, if (mode == BCH_RENAME_EXCHANGE) { new_src = dirent_create_key(0, src_name, 0); - if (!new_src) + if (IS_ERR(new_src)) { + ret = PTR_ERR(new_src); goto err; + } } else { new_src = (void *) &delete; } new_dst = dirent_create_key(0, dst_name, 0); - if (!new_dst) + if (IS_ERR(new_dst)) { + ret = PTR_ERR(new_dst); goto err; + } retry: /* * Note that on -EINTR/dropped locks we're not restarting the lookup diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d7b17195..737b9be3 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -257,12 +257,12 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) int ret; mutex_lock(&h->inode->ei_update_lock); - if (h->new_i_size != U64_MAX) - i_size_write(&h->inode->v, h->new_i_size); - i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h); + + if (!ret && h->new_i_size != U64_MAX) + i_size_write(&h->inode->v, h->new_i_size); mutex_unlock(&h->inode->ei_update_lock); bch2_quota_reservation_put(c, h->inode, &h->quota_res); @@ -348,17 +348,25 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, return BTREE_INSERT_NEED_TRAVERSE; } - BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY); + /* truncate in progress? 
*/ + if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) + goto no_i_size_update; h->inode_u.bi_size = offset; do_pack = true; inode->ei_inode.bi_size = offset; - if (h->op->is_dio) - i_size_write(&inode->v, offset); + spin_lock(&inode->v.i_lock); + if (offset > inode->v.i_size) { + if (h->op->is_dio) + i_size_write(&inode->v, offset); + else + BUG(); + } + spin_unlock(&inode->v.i_lock); } - +no_i_size_update: if (sectors) { if (!h->need_inode_update) { h->need_inode_update = true; @@ -1457,8 +1465,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping, copied = 0; } + spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); if (copied) { if (!PageUptodate(page)) @@ -1563,8 +1573,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); inode->ei_last_dirtied = (unsigned long) current; + spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); if (copied < len && ((offset + copied) & (PAGE_SIZE - 1))) { @@ -2047,10 +2059,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret; - ret = filemap_write_and_wait_range(inode->v.i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; + if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) + goto out; + + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + return ret; +out: if (c->opts.journal_flush_disabled) return 0; @@ -2149,25 +2168,61 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, from + PAGE_SIZE); } +static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + int ret; + + ret = filemap_write_and_wait_range(mapping, + inode->ei_inode.bi_size, S64_MAX); + if (ret) + return ret; + + truncate_setsize(&inode->v, iattr->ia_size); + setattr_copy(&inode->v, iattr); + + mutex_lock(&inode->ei_update_lock); + inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v); + ret = bch2_write_inode_size(c, inode, inode->v.i_size); + mutex_unlock(&inode->ei_update_lock); + + return ret; +} + int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - bool shrink = iattr->ia_size <= inode->v.i_size; struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY); + bool shrink; int ret = 0; inode_dio_wait(&inode->v); pagecache_block_get(&mapping->add_lock); - truncate_setsize(&inode->v, iattr->ia_size); + BUG_ON(inode->v.i_size < inode->ei_inode.bi_size); + + shrink = iattr->ia_size <= inode->v.i_size; + + if (!shrink) { + ret = bch2_extend(inode, iattr); + goto err_put_pagecache; + } + + ret = bch2_truncate_page(inode, iattr->ia_size); + if (unlikely(ret)) + goto err_put_pagecache; - /* sync appends.. */ - /* XXX what protects inode->i_size? 
*/ if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + inode->ei_inode.bi_size, + iattr->ia_size - 1); + else if (iattr->ia_size & (PAGE_SIZE - 1)) + ret = filemap_write_and_wait_range(mapping, + round_down(iattr->ia_size, PAGE_SIZE), + iattr->ia_size - 1); if (ret) goto err_put_pagecache; @@ -2175,41 +2230,31 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) - goto err; + goto err_put_pagecache; - /* - * There might be persistent reservations (from fallocate()) - * above i_size, which bch2_inode_truncate() will discard - we're - * only supposed to discard them if we're doing a real truncate - * here (new i_size < current i_size): - */ - if (shrink) { - ret = bch2_truncate_page(inode, iattr->ia_size); - if (unlikely(ret)) - goto err; + truncate_setsize(&inode->v, iattr->ia_size); - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); - if (unlikely(ret)) - goto err; - } + ret = bch2_inode_truncate(c, inode->v.i_ino, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + &i_sectors_hook.hook, + &inode->ei_journal_seq); + if (unlikely(ret)) + goto err_put_sectors_dirty; setattr_copy(&inode->v, iattr); inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v); -err: - /* - * On error - in particular, bch2_truncate_page() error - don't clear - * I_SIZE_DIRTY, as we've left data above i_size!: - */ - if (ret) - i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; - +out: ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err_put_pagecache: pagecache_block_put(&mapping->add_lock); return ret; +err_put_sectors_dirty: + /* + * On error - in particular, bch2_truncate_page() error - don't clear + * I_SIZE_DIRTY, as we've left data above i_size!: + */ + i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; + goto out; } /* fallocate: */ @@ -2389,7 +2434,6 @@ btree_iter_err: if (ret) goto err_put_sectors_dirty; - i_size_write(&inode->v, new_size); i_sectors_hook.new_i_size = new_size; err_put_sectors_dirty: ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index dc6c651d..3b7f78e7 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -106,6 +106,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c, break; } + BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size); + if (set) { ret = set(inode, &inode_u, p); if (ret) @@ -114,6 +116,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c, BUG_ON(i_nlink < nlink_bias(inode->v.i_mode)); + BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size && + !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + inode_u.bi_size > i_size_read(&inode->v)); + inode_u.bi_mode = inode->v.i_mode; inode_u.bi_uid = i_uid_read(&inode->v); inode_u.bi_gid = i_gid_read(&inode->v); @@ -129,11 +135,17 @@ int __must_check __bch2_write_inode(struct bch_fs *c, ret = bch2_btree_insert_at(c, NULL, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); } while (ret == -EINTR); if (!ret) { + /* + * the btree node lock protects inode->ei_inode, not + * ei_update_lock; this is important for inode updates via + * bchfs_write_index_update + */ inode->ei_inode = inode_u; inode->ei_qid = bch_qid(&inode_u); } @@ -1107,7 +1119,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) 
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; - buf->f_namelen = NAME_MAX; + buf->f_namelen = BCH_NAME_MAX; return 0; } diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index addd51f0..b4fe27f8 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -75,6 +75,19 @@ static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); } +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type || i->u64s) + return false; + return true; +} + static enum { JOURNAL_ENTRY_ERROR, JOURNAL_ENTRY_INUSE, @@ -129,6 +142,11 @@ static enum { /* XXX: why set this here, and not in bch2_journal_write()? */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + if (journal_entry_empty(buf->data)) + clear_bit(JOURNAL_NOT_EMPTY, &j->flags); + else + set_bit(JOURNAL_NOT_EMPTY, &j->flags); + journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); @@ -884,8 +902,18 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + wait_event(j->wait, journal_flush_write(j)); + /* do we need to write another journal entry? */ + if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || + c->btree_roots_dirty) + bch2_journal_meta(j); + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_NOT_EMPTY, &j->flags)); + cancel_delayed_work_sync(&j->write_work); cancel_delayed_work_sync(&j->reclaim_work); } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 36ba6a4d..8a4e7b2a 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -13,37 +13,6 @@ #include -static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id); - - if (!entry) - return NULL; - - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - struct journal_list { struct closure cl; struct mutex lock; @@ -717,6 +686,37 @@ void bch2_journal_entries_free(struct list_head *list) } } +int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq) +{ + struct journal *j = &c->journal; + struct journal_entry_pin_list *p; + u64 seq, nr = end_seq - last_seq + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + atomic64_set(&j->seq, end_seq); + j->last_seq_ondisk = last_seq; + + j->pin.front = last_seq; + j->pin.back = end_seq + 1; + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + return 0; +} + int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; @@ -724,10 +724,9 @@ int bch2_journal_read(struct bch_fs *c, struct 
list_head *list) struct journal_replay *i; struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq, seq; + u64 cur_seq, end_seq; unsigned iter; - size_t entries = 0; - u64 nr, keys = 0; + size_t keys = 0, entries = 0; bool degraded = false; int ret = 0; @@ -783,43 +782,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) } } - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - - for_each_jset_key(k, _n, entry, &i->j) - keys++; - } - i = list_last_entry(list, struct journal_replay, list); - nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; - - fsck_err_on(c->sb.clean && (keys || nr > 1), c, - "filesystem marked clean but journal not empty (%llu keys in %llu entries)", - keys, nr); - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); - j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - - j->pin.front = le64_to_cpu(i->j.last_seq); - j->pin.back = le64_to_cpu(i->j.seq) + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } + ret = bch2_journal_set_seq(c, + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq)); + if (ret) + return ret; mutex_lock(&j->blacklist_lock); @@ -842,6 +811,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct journal_replay, list)->j.seq); list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; bool blacklisted; mutex_lock(&j->blacklist_lock); @@ -863,10 +834,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) journal_last_seq(j), end_seq); cur_seq = le64_to_cpu(i->j.seq) + 1; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; entries++; } - bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu", + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", keys, entries, journal_cur_seq(j)); fsck_err: return ret; @@ -950,7 +924,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); - ret = bch2_journal_flush_all_pins(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); err: bch2_journal_entries_free(list); return ret; diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 4236b7fc..e303df92 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -1,9 +1,6 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, - enum btree_id, unsigned *); - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration @@ -37,6 +34,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) +int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); int bch2_journal_entry_sectors(struct journal *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 0e3e5b6a..394b72bb 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -337,34 +337,22 @@ static int journal_flush_done(struct journal *j, u64 
seq_to_flush, return ret; } -int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin *pin; u64 pin_seq; - bool flush; if (!test_bit(JOURNAL_STARTED, &j->flags)) - return 0; -again: - wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); - if (pin) { - /* flushing a journal pin might cause a new one to be added: */ + return; + + while (1) { + wait_event(j->wait, journal_flush_done(j, seq_to_flush, + &pin, &pin_seq)); + if (!pin) + break; + pin->flush(j, pin, pin_seq); - goto again; } - - spin_lock(&j->lock); - flush = journal_last_seq(j) != j->last_seq_ondisk || - (seq_to_flush == U64_MAX && c->btree_roots_dirty); - spin_unlock(&j->lock); - - return flush ? bch2_journal_meta(j) : 0; -} - -int bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); } int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) @@ -383,7 +371,9 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = iter; spin_unlock(&j->lock); - ret = bch2_journal_flush_pins(j, seq); + bch2_journal_flush_pins(j, seq); + + ret = bch2_journal_error(j); if (ret) return ret; @@ -404,7 +394,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) } spin_unlock(&j->lock); - bch2_replicas_gc_end(c, ret); + ret = bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); return ret; diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 7d460c35..eb227902 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -29,8 +29,13 @@ void bch2_journal_pin_add_if_older(struct journal *, void bch2_journal_reclaim_fast(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); -int bch2_journal_flush_pins(struct journal *, u64); -int bch2_journal_flush_all_pins(struct journal *); +void bch2_journal_flush_pins(struct journal *, u64); + +static inline void bch2_journal_flush_all_pins(struct journal *j) +{ + bch2_journal_flush_pins(j, U64_MAX); +} + int bch2_journal_flush_device_pins(struct journal *, int); #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index a27e0548..effbeece 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -117,6 +117,7 @@ enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, JOURNAL_NEED_WRITE, + JOURNAL_NOT_EMPTY, }; /* Embedded in struct bch_fs */ diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index ea519102..215c5aa5 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -126,7 +126,13 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) retry: if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key), dev_idx)) { - bch2_btree_iter_set_locks_want(&iter, 0); + /* + * we might have found a btree node key we + * needed to update, and then tried to update it + * but got -EINTR after upgrading the iter, but + * then raced and the node is now gone: + */ + bch2_btree_iter_downgrade(&iter); ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, bkey_i_to_s_c(&b->key)); @@ -141,11 +147,6 @@ retry: if (ret) goto err; - if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) { - b = bch2_btree_iter_peek_node(&iter); - goto retry; - } - ret = bch2_btree_node_update_key(c, &iter, b, new_key); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(&iter); @@ -160,7 +161,7 @@ retry: 
ret = 0; out: - bch2_replicas_gc_end(c, ret); + ret = bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); return ret; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index e7ab8870..f476033e 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -137,6 +137,9 @@ enum opt_type { BCH_OPT(degraded, u8, OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false) \ + BCH_OPT(discard, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false) \ BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false) \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c new file mode 100644 index 00000000..58aee7ae --- /dev/null +++ b/libbcachefs/recovery.c @@ -0,0 +1,346 @@ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "error.h" +#include "fsck.h" +#include "journal_io.h" +#include "quota.h" +#include "recovery.h" +#include "super-io.h" + +#include + +struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j) +{ + unsigned i; + int ret = 0; + + if (!clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) + bch2_fs_mark_clean(c, false); + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static bool journal_empty(struct list_head *journal) +{ + struct journal_replay *i; + struct jset_entry *entry; + + if (list_empty(journal)) + return true; + + i = list_last_entry(journal, struct journal_replay, list); + + if (i->j.last_seq != i->j.seq) + return false; + + list_for_each_entry(i, journal, list) { + vstruct_for_each(&i->j, entry) { + if (entry->type == BCH_JSET_ENTRY_btree_root) + continue; + + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + return false; + } + } + + return true; +} + +int bch2_fs_recovery(struct bch_fs *c) +{ + const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL; + LIST_HEAD(journal); + struct jset *j = NULL; + unsigned i; + int ret; + + 
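	/*
	 * The sequence below: read the clean section out of the superblock if
	 * the filesystem was shut down cleanly; read the journal (or, on a
	 * clean shutdown with fsck disabled, just set the journal sequence
	 * number from the clean section); cross-check the clean section
	 * against the last journal entry; load the btree roots and allocation
	 * info; run the initial mark and sweep; mark the filesystem dirty;
	 * start the journal and allocator; replay the journal; then run fsck
	 * and read quotas, with the noreplay/norecovery options cutting the
	 * sequence short.
	 */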
mutex_lock(&c->sb_lock); + if (!bch2_sb_get_replicas(c->disk_sb.sb)) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + + if (c->sb.clean) + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + if (sb_clean) { + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + ret = -ENOMEM; + mutex_unlock(&c->sb_lock); + goto err; + } + } + mutex_unlock(&c->sb_lock); + + if (clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + + if (!clean || !c->opts.nofsck) { + ret = bch2_journal_read(c, &journal); + if (ret) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + } else { + ret = bch2_journal_set_seq(c, + le64_to_cpu(clean->journal_seq), + le64_to_cpu(clean->journal_seq)); + BUG_ON(ret); + } + + ret = verify_superblock_clean(c, clean, j); + if (ret) + goto err; + + fsck_err_on(clean && !journal_empty(&journal), c, + "filesystem marked clean but journal not empty"); + + if (clean) { + c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); + } else { + c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); + } + + for (i = 0; i < BTREE_ID_NR; i++) { + unsigned level; + struct bkey_i *k; + + k = btree_root_find(c, clean, j, i, &level); + if (!k) + continue; + + err = "invalid btree root pointer"; + if (IS_ERR(k)) + goto err; + + err = "error reading btree root"; + if (bch2_btree_root_read(c, i, k, level)) { + if (i != BTREE_ID_ALLOC) + goto err; + + mustfix_fsck_err(c, "error reading btree root"); + } + } + + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b) + bch2_btree_root_alloc(c, i); + + err = "error reading allocation information"; + ret = bch2_alloc_read(c, &journal); + if (ret) + goto err; + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + + bch_verbose(c, "starting mark and sweep:"); + err = "error in recovery"; + ret = bch2_initial_gc(c, &journal); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + + if (c->opts.noreplay) + goto out; + + /* + * Mark dirty before journal replay, fsck: + * XXX: after a clean shutdown, this could be done lazily only when fsck + * finds an error + */ + bch2_fs_mark_clean(c, false); + + /* + * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() + * will give spurious errors about oldest_gen > bucket_gen - + * this is a hack but oh well. 
+ */ + bch2_fs_journal_start(&c->journal); + + err = "error starting allocator"; + if (bch2_fs_allocator_start(c)) + goto err; + + bch_verbose(c, "starting journal replay:"); + err = "journal replay failed"; + ret = bch2_journal_replay(c, &journal); + if (ret) + goto err; + bch_verbose(c, "journal replay done"); + + if (c->opts.norecovery) + goto out; + + bch_verbose(c, "starting fsck:"); + err = "error in fsck"; + ret = bch2_fsck(c, !c->opts.nofsck); + if (ret) + goto err; + bch_verbose(c, "fsck done"); + + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas:"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } + +out: + bch2_journal_entries_free(&journal); + kfree(clean); + return ret; +err: +fsck_err: + BUG_ON(!ret); + goto out; +} + +int bch2_fs_initialize(struct bch_fs *c) +{ + struct bch_inode_unpacked inode; + struct bkey_inode_buf packed_inode; + const char *err = "cannot allocate memory"; + struct bch_dev *ca; + LIST_HEAD(journal); + unsigned i; + int ret; + + bch_notice(c, "initializing new filesystem"); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + + ret = bch2_initial_gc(c, &journal); + if (ret) + goto err; + + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) + if (bch2_dev_journal_alloc(ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch2_fs_journal_start(&c->journal); + bch2_journal_set_replay_done(&c->journal); + + err = "error starting allocator"; + if (bch2_fs_allocator_start(c)) + goto err; + + bch2_inode_init(c, &inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + inode.bi_inum = BCACHEFS_ROOT_INO; + + bch2_inode_pack(&packed_inode, &inode); + + err = "error creating root directory"; + if (bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, NULL, 0)) + goto err; + + if (enabled_qtypes(c)) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + + err = "error writing first journal entry"; + if (bch2_journal_meta(&c->journal)) + goto err; + + mutex_lock(&c->sb_lock); + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +err: + BUG_ON(!ret); + return ret; +} diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h new file mode 100644 index 00000000..685507e8 --- /dev/null +++ b/libbcachefs/recovery.h @@ -0,0 +1,7 @@ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + +int bch2_fs_recovery(struct bch_fs *); +int bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 6c52d1d4..1e94d35f 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -215,10 +215,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, return 0; err: mutex_unlock(&c->sb_lock); - if (new_gc) - kfree(new_gc); - if (new_r) - kfree(new_r); + kfree(new_gc); + kfree(new_r); return ret; } @@ -265,10 +263,9 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); } -int bch2_replicas_gc_end(struct bch_fs *c, int err) +int bch2_replicas_gc_end(struct bch_fs *c, int ret) { struct bch_replicas_cpu *new_r, *old_r; - int ret = 0; lockdep_assert_held(&c->replicas_gc_lock); @@ -276,29 +273,31 @@ int 
bch2_replicas_gc_end(struct bch_fs *c, int err) new_r = rcu_dereference_protected(c->replicas_gc, lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->replicas_gc, NULL); - if (err) { - rcu_assign_pointer(c->replicas_gc, NULL); - kfree_rcu(new_r, rcu); + if (ret) goto err; - } if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { ret = -ENOSPC; goto err; } + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ + old_r = rcu_dereference_protected(c->replicas, lockdep_is_held(&c->sb_lock)); rcu_assign_pointer(c->replicas, new_r); - rcu_assign_pointer(c->replicas_gc, NULL); kfree_rcu(old_r, rcu); - - bch2_write_super(c); -err: +out: mutex_unlock(&c->sb_lock); return ret; +err: + kfree_rcu(new_r, rcu); + goto out; } int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index f7dd0144..c8051095 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -237,6 +237,7 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, { struct bkey_s_c k; + bch2_btree_iter_copy(iter, start); bch2_btree_iter_next_slot(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 9772d597..54de9fac 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -4,6 +4,7 @@ #include "disk_groups.h" #include "error.h" #include "io.h" +#include "journal.h" #include "replicas.h" #include "quota.h" #include "super-io.h" @@ -89,6 +90,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) struct bch_sb *new_sb; struct bio *bio; + if (sb->sb && sb->page_order >= order) + return 0; + if (sb->have_layout) { u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -849,6 +853,84 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { .validate = bch2_sb_validate_crypt, }; +/* BCH_SB_FIELD_clean: */ + +void bch2_fs_mark_clean(struct bch_fs *c, bool clean) +{ + struct bch_sb_field_clean *sb_clean; + unsigned u64s = sizeof(*sb_clean) / sizeof(u64); + struct jset_entry *entry; + struct btree_root *r; + + mutex_lock(&c->sb_lock); + if (clean == BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, clean); + + if (!clean) + goto write_super; + + mutex_lock(&c->btree_root_lock); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) + u64s += jset_u64s(r->key.u64s); + + sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1; + + entry = sb_clean->start; + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) { + entry->u64s = r->key.u64s; + entry->btree_id = r - c->btree_roots; + entry->level = r->level; + entry->type = BCH_JSET_ENTRY_btree_root; + bkey_copy(&entry->start[0], &r->key); + entry = vstruct_next(entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + } + + BUG_ON(entry != vstruct_end(&sb_clean->field)); + + mutex_unlock(&c->btree_root_lock); +write_super: + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} + +static const char *bch2_sb_validate_clean(struct bch_sb *sb, + struct 
bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) + return "invalid field crypt: wrong size"; + + return NULL; +} + +static const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_validate_clean, +}; + static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 995b1c90..7d09d8e4 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -131,6 +131,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } +/* BCH_SB_FIELD_clean: */ + +void bch2_fs_mark_clean(struct bch_fs *, bool); + size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *, struct bch_sb_field *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 1eab7c77..a2a32b92 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -10,7 +10,6 @@ #include "alloc.h" #include "btree_cache.h" #include "btree_gc.h" -#include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "chardev.h" @@ -26,14 +25,13 @@ #include "inode.h" #include "io.h" #include "journal.h" -#include "journal_io.h" #include "journal_reclaim.h" -#include "keylist.h" #include "move.h" #include "migrate.h" #include "movinggc.h" #include "quota.h" #include "rebalance.h" +#include "recovery.h" #include "replicas.h" #include "super.h" #include "super-io.h" @@ -201,18 +199,6 @@ int bch2_congested(void *data, int bdi_bits) * - allocator depends on the journal (when it rewrites prios and gens) */ -static void bch_fs_mark_clean(struct bch_fs *c) -{ - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } -} - static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; @@ -229,7 +215,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: */ - bch2_journal_flush_pins(&c->journal, U64_MAX - 1); + bch2_journal_flush_all_pins(&c->journal); for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); @@ -246,9 +232,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); - if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_btree_verify_flushed(c); - bch2_fs_journal_stop(&c->journal); /* @@ -257,6 +240,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_btree_flush_all_writes(c); + else + bch2_btree_verify_flushed(c); /* * After stopping journal: @@ -275,12 +260,10 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { - if (c->state != BCH_FS_STARTING && - c->state != BCH_FS_RW) + if (c->state == BCH_FS_RO) return; - if (test_bit(BCH_FS_ERROR, &c->flags)) - return; + BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); /* * Block new foreground-end write operations from starting - any new @@ -311,13 +294,18 @@ void bch2_fs_read_only(struct bch_fs *c) __bch2_fs_read_only(c); - bch_fs_mark_clean(c); - wait_event(bch_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, 
&c->flags); - c->state = BCH_FS_RO; + + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + bch2_fs_mark_clean(c, true); + + if (c->state != BCH_FS_STOPPING) + c->state = BCH_FS_RO; } static void bch2_fs_read_only_work(struct work_struct *work) @@ -352,10 +340,11 @@ const char *bch2_fs_read_write(struct bch_fs *c) const char *err = NULL; unsigned i; - if (c->state != BCH_FS_STARTING && - c->state != BCH_FS_RO) + if (c->state == BCH_FS_RW) return NULL; + bch2_fs_mark_clean(c, false); + for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -446,11 +435,6 @@ void bch2_fs_stop(struct bch_fs *c) struct bch_dev *ca; unsigned i; - mutex_lock(&c->state_lock); - BUG_ON(c->state == BCH_FS_STOPPING); - c->state = BCH_FS_STOPPING; - mutex_unlock(&c->state_lock); - for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) @@ -475,11 +459,9 @@ void bch2_fs_stop(struct bch_fs *c) closure_debug_destroy(&c->cl); mutex_lock(&c->state_lock); - __bch2_fs_read_only(c); + bch2_fs_read_only(c); mutex_unlock(&c->state_lock); - bch_fs_mark_clean(c); - /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); @@ -695,9 +677,7 @@ const char *bch2_fs_start(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; - LIST_HEAD(journal); - struct jset *j; - time64_t now; + time64_t now = ktime_get_seconds(); unsigned i; int ret = -EINVAL; @@ -706,157 +686,26 @@ const char *bch2_fs_start(struct bch_fs *c) BUG_ON(c->state != BCH_FS_STARTING); mutex_lock(&c->sb_lock); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); + + mi = bch2_sb_get_members(c->disk_sb.sb); + for_each_online_member(ca, c, i) + mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); + mutex_unlock(&c->sb_lock); for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) { - ret = bch2_journal_read(c, &journal); - if (ret) - goto err; + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); + if (ret) + goto err; - j = &list_entry(journal.prev, struct journal_replay, list)->j; - - c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); - - for (i = 0; i < BTREE_ID_NR; i++) { - unsigned level; - struct bkey_i *k; - - k = bch2_journal_find_btree_root(c, j, i, &level); - if (!k) - continue; - - err = "invalid btree root pointer"; - if (IS_ERR(k)) - goto err; - - err = "error reading btree root"; - if (bch2_btree_root_read(c, i, k, level)) { - if (i != BTREE_ID_ALLOC) - goto err; - - mustfix_fsck_err(c, "error reading btree root"); - } - } - - for (i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b) - bch2_btree_root_alloc(c, i); - - err = "error reading allocation information"; - ret = bch2_alloc_read(c, &journal); - if (ret) - goto err; - - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - - bch_verbose(c, "starting mark and sweep:"); - err = "error in recovery"; - ret = bch2_initial_gc(c, &journal); - if (ret) - goto err; - bch_verbose(c, "mark and sweep done"); - - if (c->opts.noreplay) - goto recovery_done; - - /* - * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() - * will give spurious errors about oldest_gen > bucket_gen - - * this is a hack but oh well. 
- */ - bch2_fs_journal_start(&c->journal); - - err = "error starting allocator"; - if (bch2_fs_allocator_start(c)) - goto err; - - bch_verbose(c, "starting journal replay:"); - err = "journal replay failed"; - ret = bch2_journal_replay(c, &journal); - if (ret) - goto err; - bch_verbose(c, "journal replay done"); - - if (c->opts.norecovery) - goto recovery_done; - - bch_verbose(c, "starting fsck:"); - err = "error in fsck"; - ret = bch2_fsck(c, !c->opts.nofsck); - if (ret) - goto err; - bch_verbose(c, "fsck done"); - - if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas:"); - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - bch_verbose(c, "quotas done"); - } - } else { - struct bch_inode_unpacked inode; - struct bkey_inode_buf packed_inode; - - bch_notice(c, "initializing new filesystem"); - - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - - ret = bch2_initial_gc(c, &journal); - if (ret) - goto err; - - err = "unable to allocate journal buckets"; - for_each_online_member(ca, c, i) - if (bch2_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - for (i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - bch2_fs_journal_start(&c->journal); - bch2_journal_set_replay_done(&c->journal); - - err = "error starting allocator"; - if (bch2_fs_allocator_start(c)) - goto err; - - bch2_inode_init(c, &inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - inode.bi_inum = BCACHEFS_ROOT_INO; - - bch2_inode_pack(&packed_inode, &inode); - - err = "error creating root directory"; - if (bch2_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, NULL, 0)) - goto err; - - if (enabled_qtypes(c)) { - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - } - - err = "error writing first journal entry"; - if (bch2_journal_meta(&c->journal)) - goto err; - } -recovery_done: err = "dynamic fault"; if (bch2_fs_init_fault("fs_start")) goto err; @@ -869,28 +718,13 @@ recovery_done: goto err; } - mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - now = ktime_get_seconds(); - - for_each_member_device(ca, c, i) - mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); - - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_STARTED, &c->flags); err = NULL; out: mutex_unlock(&c->state_lock); - bch2_journal_entries_free(&journal); return err; err: -fsck_err: switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); @@ -1091,6 +925,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; + if (opt_defined(c->opts, discard)) + ca->mi.discard = opt_get(c->opts, discard); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, @@ -1454,7 +1291,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * must flush all existing journal entries, they might have * (overwritten) keys that point to the device we're removing: */ - ret = bch2_journal_flush_all_pins(&c->journal); + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_error(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); goto err; @@ -1615,6 +1453,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) { 
struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; + struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned dev_idx; const char *err; @@ -1646,6 +1485,15 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + + mi->members[ca->dev_idx].last_mount = + cpu_to_le64(ktime_get_seconds()); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); return 0; err: diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 66b5b9f9..4987ee76 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -27,6 +27,7 @@ #include "rebalance.h" #include "replicas.h" #include "super-io.h" +#include "tests.h" #include #include @@ -192,6 +193,10 @@ rw_attribute(pd_controllers_update_seconds); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); +#endif /* CONFIG_BCACHEFS_TESTS */ + #define BCH_DEBUG_PARAM(name, description) \ rw_attribute(name); @@ -446,7 +451,25 @@ STORE(__bch2_fs) sc.nr_to_scan = strtoul_or_return(buf); c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; + char *test = strsep(&p, " \t\n"); + char *nr_str = strsep(&p, " \t\n"); + char *threads_str = strsep(&p, " \t\n"); + unsigned threads; + u64 nr; + int ret = -EINVAL; + if (threads_str && + !(ret = kstrtouint(threads_str, 10, &threads)) && + !(ret = bch2_strtoull_h(nr_str, &nr))) + bch2_btree_perf_test(c, test, nr, threads); + else + size = ret; + kfree(tmp); + } +#endif return size; } @@ -477,6 +500,10 @@ struct attribute *bch2_fs_files[] = { &sysfs_promote_whole_extents, &sysfs_compression_stats, + +#ifdef CONFIG_BCACHEFS_TESTS + &sysfs_perf_test, +#endif NULL }; diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c new file mode 100644 index 00000000..9dcadd20 --- /dev/null +++ b/libbcachefs/tests.c @@ -0,0 +1,289 @@ +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "tests.h" + +#include "linux/kthread.h" +#include "linux/random.h" + +static void test_delete(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter); + BUG_ON(ret); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &k.k_i)); + BUG_ON(ret); + + pr_info("deleting once"); + ret = bch2_btree_delete_at(&iter, 0); + BUG_ON(ret); + + pr_info("deleting twice"); + ret = bch2_btree_delete_at(&iter, 0); + BUG_ON(ret); + + bch2_btree_iter_unlock(&iter); +} + +static u64 test_rand(void) +{ + u64 v; +#if 0 + v = prandom_u32(); +#else + prandom_bytes(&v, sizeof(v)); +#endif + return v; +} + +static void rand_insert(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie k; + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } +} + +static void rand_lookup(struct bch_fs *c, u64 nr) +{ + u64 i; + + for (i = 0; i < nr; i++) { + struct btree_iter iter; + struct bkey_s_c k; + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(&iter); + bch2_btree_iter_unlock(&iter); + } +} + 
+static void rand_mixed(struct bch_fs *c, u64 nr) +{ + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + struct btree_iter iter; + struct bkey_s_c k; + + bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(&iter); + + if (!(i & 3) && k.k) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &k.k_i)); + BUG_ON(ret); + } + + bch2_btree_iter_unlock(&iter); + } + +} + +static void rand_delete(struct bch_fs *c, u64 nr) +{ + struct bkey_i k; + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_init(&k.k); + k.k.p.offset = test_rand(); + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, + NULL, NULL, NULL, 0); + BUG_ON(ret); + } +} + +static void seq_insert(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; + int ret; + u64 i = 0; + + bkey_cookie_init(&insert.k_i); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + insert.k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &insert.k_i)); + BUG_ON(ret); + + if (++i == nr) + break; + } + bch2_btree_iter_unlock(&iter); +} + +static void seq_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) + ; + bch2_btree_iter_unlock(&iter); +} + +static void seq_overwrite(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_INTENT, k) { + struct bkey_i_cookie u; + + bkey_reassemble(&u.k_i, k); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &u.k_i)); + BUG_ON(ret); + } + bch2_btree_iter_unlock(&iter); +} + +static void seq_delete(struct bch_fs *c, u64 nr) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + POS_MIN, POS_MAX, + ZERO_VERSION, NULL, NULL, NULL); + BUG_ON(ret); +} + +typedef void (*perf_test_fn)(struct bch_fs *, u64); + +struct test_job { + struct bch_fs *c; + u64 nr; + unsigned nr_threads; + perf_test_fn fn; + + atomic_t ready; + wait_queue_head_t ready_wait; + + atomic_t done; + struct completion done_completion; + + u64 start; + u64 finish; +}; + +static int btree_perf_test_thread(void *data) +{ + struct test_job *j = data; + + if (atomic_dec_and_test(&j->ready)) { + wake_up(&j->ready_wait); + j->start = sched_clock(); + } else { + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + + j->fn(j->c, j->nr / j->nr_threads); + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); + complete(&j->done_completion); + } + + return 0; +} + +void bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20], nr_buf[20], per_sec_buf[20]; + unsigned i; + u64 time; + + atomic_set(&j.ready, nr_threads); + init_waitqueue_head(&j.ready_wait); + + atomic_set(&j.done, nr_threads); + init_completion(&j.done_completion); + +#define perf_test(_test) \ + if (!strcmp(testname, #_test)) j.fn = _test + + perf_test(rand_insert); + perf_test(rand_lookup); + perf_test(rand_mixed); + perf_test(rand_delete); + + perf_test(seq_insert); + perf_test(seq_lookup); + perf_test(seq_overwrite); + perf_test(seq_delete); + + /* a unit test, not a perf test: */ + 
perf_test(test_delete); + + if (!j.fn) { + pr_err("unknown test %s", testname); + return; + } + + //pr_info("running test %s:", testname); + + if (nr_threads == 1) + btree_perf_test_thread(&j); + else + for (i = 0; i < nr_threads; i++) + kthread_run(btree_perf_test_thread, &j, + "bcachefs perf test[%u]", i); + + while (wait_for_completion_interruptible(&j.done_completion)) + ; + + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); + bch2_hprint(nr_buf, nr); + bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", + name_buf, nr_buf, nr_threads, + time / NSEC_PER_SEC, + time * nr_threads / nr, + per_sec_buf); +} + +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/libbcachefs/tests.h b/libbcachefs/tests.h new file mode 100644 index 00000000..3f1b8d1f --- /dev/null +++ b/libbcachefs/tests.h @@ -0,0 +1,14 @@ +#ifndef _BCACHEFS_TEST_H +#define _BCACHEFS_TEST_H + +struct bch_fs; + +#ifdef CONFIG_BCACHEFS_TESTS + +void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); + +#else + +#endif /* CONFIG_BCACHEFS_TESTS */ + +#endif /* _BCACHEFS_TEST_H */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index e263dd20..24c6cc56 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -27,55 +27,73 @@ #define simple_strtoint(c, end, base) simple_strtol(c, end, base) #define simple_strtouint(c, end, base) simple_strtoul(c, end, base) +static const char si_units[] = "?kMGTPEZY"; + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + unsigned u; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + if (!isdigit(*cp)) + return -EINVAL; + + do { + if (v > U64_MAX / 10) + return -ERANGE; + v *= 10; + if (v > U64_MAX - (*cp - '0')) + return -ERANGE; + v += *cp - '0'; + cp++; + } while (isdigit(*cp)); + + for (u = 1; u < ARRAY_SIZE(si_units); u++) + if (*cp == si_units[u]) { + cp++; + goto got_unit; + } + u = 0; +got_unit: + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + + if (fls64(v) + u * 10 > 64) + return -ERANGE; + + v <<= u * 10; + + if (positive) { + if (v > t_max) + return -ERANGE; + } else { + if (v && !t_signed) + return -ERANGE; + + if (v > t_max + 1) + return -ERANGE; + v = -v; + } + + *res = v; + return 0; +} + #define STRTO_H(name, type) \ int bch2_ ## name ## _h(const char *cp, type *res) \ { \ - int u = 0; \ - char *e; \ - type i = simple_ ## name(cp, &e, 10); \ - \ - switch (tolower(*e)) { \ - default: \ - return -EINVAL; \ - case 'y': \ - case 'z': \ - u++; \ - case 'e': \ - u++; \ - case 'p': \ - u++; \ - case 't': \ - u++; \ - case 'g': \ - u++; \ - case 'm': \ - u++; \ - case 'k': \ - u++; \ - if (e++ == cp) \ - return -EINVAL; \ - case '\n': \ - case '\0': \ - if (*e == '\n') \ - e++; \ - } \ - \ - if (*e) \ - return -EINVAL; \ - \ - while (u--) { \ - if ((type) ~0 > 0 && \ - (type) ~0 / 1024 <= i) \ - return -EINVAL; \ - if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ - (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ - return -EINVAL; \ - i *= 1024; \ - } \ - \ - *res = i; \ - return 0; \ -} \ + u64 v; \ + int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ + ANYSINT_MAX(type) != ((type) ~0ULL)); \ + *res = v; \ + return ret; \ +} STRTO_H(strtoint, int) STRTO_H(strtouint, unsigned int) @@ -84,7 +102,6 @@ STRTO_H(strtoull, unsigned long long) ssize_t bch2_hprint(char *buf, s64 v) { - static const char units[] = "?kMGTPEZY"; char dec[4] = ""; int u, t = 0; @@ -103,7 
+120,7 @@ ssize_t bch2_hprint(char *buf, s64 v) if (v < 100 && v > -100) scnprintf(dec, sizeof(dec), ".%i", t / 103); - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + return sprintf(buf, "%lli%s%c", v, dec, si_units[u]); } ssize_t bch2_scnprint_string_list(char *buf, size_t size, diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index c89c7200..de95480c 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -15,7 +15,7 @@ static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { - return DIV_ROUND_UP(sizeof(struct bch_xattr) + + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + name_len + val_len, sizeof(u64)); }
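
Note on the perf-test harness added in tests.c: the worker kthreads synchronize through two atomic countdowns. The last thread to reach the ready counter records the start time and wakes the waitqueue so all threads begin together, and the last thread to hit the done counter records the finish time and fires the completion, keeping thread setup and teardown out of the measured interval. The userspace pthread sketch below mirrors that start/stop pattern; the struct and function names are illustrative stand-ins, not part of the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NR_THREADS 4

/* illustrative stand-in for struct test_job */
struct job {
	atomic_int	ready;		/* mirrors j->ready + ready_wait */
	atomic_int	done;		/* mirrors j->done + done_completion */
	pthread_mutex_t	lock;
	pthread_cond_t	start_cond;
	pthread_cond_t	done_cond;
	uint64_t	start_ns;
	uint64_t	finish_ns;
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t) ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void *worker(void *arg)
{
	struct job *j = arg;

	/* last thread to arrive starts the clock and releases the others */
	pthread_mutex_lock(&j->lock);
	if (atomic_fetch_sub(&j->ready, 1) == 1) {
		j->start_ns = now_ns();
		pthread_cond_broadcast(&j->start_cond);
	} else {
		while (atomic_load(&j->ready))
			pthread_cond_wait(&j->start_cond, &j->lock);
	}
	pthread_mutex_unlock(&j->lock);

	/* the per-thread benchmark body (j->fn in the patch) would run here */

	/* last thread to finish stops the clock and wakes the waiter */
	pthread_mutex_lock(&j->lock);
	if (atomic_fetch_sub(&j->done, 1) == 1) {
		j->finish_ns = now_ns();
		pthread_cond_signal(&j->done_cond);
	}
	pthread_mutex_unlock(&j->lock);
	return NULL;
}

int main(void)
{
	static struct job j = {
		.ready		= NR_THREADS,
		.done		= NR_THREADS,
		.lock		= PTHREAD_MUTEX_INITIALIZER,
		.start_cond	= PTHREAD_COND_INITIALIZER,
		.done_cond	= PTHREAD_COND_INITIALIZER,
	};
	pthread_t tids[NR_THREADS];
	int i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&tids[i], NULL, worker, &j);

	pthread_mutex_lock(&j.lock);
	while (atomic_load(&j.done))
		pthread_cond_wait(&j.done_cond, &j.lock);
	pthread_mutex_unlock(&j.lock);

	for (i = 0; i < NR_THREADS; i++)
		pthread_join(tids[i], NULL);

	printf("measured interval: %llu ns\n",
	       (unsigned long long) (j.finish_ns - j.start_ns));
	return 0;
}

The atomics are kept only to mirror the kernel's atomic_dec_and_test() usage; since every countdown here happens under the mutex, plain integers would also work in the sketch.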
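
Note on the util.c hunk: the fallthrough-switch body of STRTO_H() is replaced by a single __bch2_strtoh() helper that accumulates decimal digits with explicit overflow checks, maps at most one SI suffix from si_units to a shift of 10 bits per step, and rejects any trailing characters other than a single newline (the form a sysfs write such as the new perf_test handler passes in). Below is a minimal userspace sketch of the same parsing approach, unsigned only and with illustrative names.

#include <ctype.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* illustrative userspace analogue of __bch2_strtoh (unsigned only) */
static int parse_u64_h(const char *cp, uint64_t *res)
{
	static const char si_units[] = "?kMGTPEZY";
	const char *unit;
	unsigned shift = 0;
	uint64_t v = 0;

	if (!isdigit((unsigned char) *cp))
		return -EINVAL;

	do {
		uint64_t d = (uint64_t) (*cp++ - '0');

		if (v > UINT64_MAX / 10)
			return -ERANGE;
		v *= 10;
		if (v > UINT64_MAX - d)
			return -ERANGE;
		v += d;
	} while (isdigit((unsigned char) *cp));

	/* at most one SI suffix; each step scales by 2^10 */
	if (*cp && (unit = strchr(si_units + 1, *cp))) {
		shift = (unsigned) (unit - si_units) * 10;
		cp++;
	}

	/* a single trailing newline is tolerated, as in a sysfs write */
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;

	if (v && 64 - __builtin_clzll(v) + shift > 64)
		return -ERANGE;

	*res = v << shift;
	return 0;
}

int main(void)
{
	static const char *tests[] = { "512", "4k", "16M", "9Z", "4q" };
	uint64_t v;
	unsigned i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		int ret = parse_u64_h(tests[i], &v);

		printf("%-4s -> ret %d, value %llu\n", tests[i], ret,
		       ret ? 0ULL : (unsigned long long) v);
	}
	return 0;
}

The kernel helper additionally handles a leading sign and the per-type bounds passed in by the STRTO_H() instantiations (via ANYSINT_MAX); that part is omitted from the sketch.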
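
Note on the final xattr.c hunk: the key size in u64s is now computed from offsetof(struct bch_xattr, x_name) instead of sizeof(struct bch_xattr), since the name and value bytes begin at x_name and sizeof can also count trailing alignment padding, overstating the space needed for small xattrs. The stand-alone illustration below uses a simplified stand-in struct, not the real bch_xattr layout, to show the difference.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * simplified stand-in: a 4-byte header padded to 8-byte alignment,
 * followed by name bytes and then value bytes
 */
struct xattr_like {
	uint8_t		type;
	uint8_t		name_len;
	uint16_t	val_len;
	uint8_t		name[];
} __attribute__((aligned(8)));

static unsigned u64s_old(unsigned name_len, unsigned val_len)
{
	/* old formula: sizeof() includes the trailing alignment padding */
	return DIV_ROUND_UP(sizeof(struct xattr_like) + name_len + val_len,
			    sizeof(uint64_t));
}

static unsigned u64s_new(unsigned name_len, unsigned val_len)
{
	/* new formula: the name/value area starts at offsetof(name) */
	return DIV_ROUND_UP(offsetof(struct xattr_like, name) +
			    name_len + val_len,
			    sizeof(uint64_t));
}

int main(void)
{
	printf("sizeof = %zu, offsetof(name) = %zu\n",
	       sizeof(struct xattr_like), offsetof(struct xattr_like, name));
	printf("name_len 3, val_len 8: old %u u64s, new %u u64s\n",
	       u64s_old(3, 8), u64s_new(3, 8));
	return 0;
}

With this stand-in layout the old formula reserves three u64s for an 11-byte name/value payload while the new one reserves two, which is the kind of over-allocation the patch removes.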