diff --git a/.bcachefs_revision b/.bcachefs_revision index 45b79dea..193e4241 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -e3a7cee5034f0f218f593a0a970e8ccd8bf99565 +69be0dae3162e1651a5d5fcce08562e6e2af971a diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4300c4da..f60972c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -6,7 +6,7 @@ #include #include -#define BIO_MAX_PAGES 256 +#define BIO_MAX_VECS 256 typedef unsigned fmode_t; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index fbc1e1f5..222c72fe 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -26,6 +26,7 @@ struct inode; struct dentry; +struct user_namespace; /* * struct xattr_handler: When @name is set, match attributes with exactly that @@ -40,7 +41,8 @@ struct xattr_handler { int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); - int (*set)(const struct xattr_handler *, struct dentry *dentry, + int (*set)(const struct xattr_handler *, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index b5fcda9e..4c0d9b76 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -71,10 +71,10 @@ DECLARE_EVENT_CLASS(bio, ), TP_fast_assign( - __entry->dev = bio->bi_disk ? bio_dev(bio) : 0; + __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u", diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 594e1f1a..74cb188f 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -281,7 +281,8 @@ int bch2_set_acl_trans(struct btree_trans *trans, return ret == -ENOENT ? 0 : ret; } -int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) +int bch2_set_acl(struct user_namespace *mnt_userns, + struct inode *vinode, struct posix_acl *_acl, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -308,7 +309,7 @@ retry: mode = inode_u.bi_mode; if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(&inode->v, &mode, &acl); + ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); if (ret) goto btree_err; } diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index ba210c26..25fc54dd 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -32,7 +32,7 @@ int bch2_set_acl_trans(struct btree_trans *, struct bch_inode_unpacked *, const struct bch_hash_info *, struct posix_acl *, int); -int bch2_set_acl(struct inode *, struct posix_acl *, int); +int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, umode_t, struct posix_acl **); diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 9f869bed..09e5dbf1 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -84,7 +84,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } -static const struct bkey_ops bch2_bkey_ops[] = { +const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x @@ -290,24 +290,11 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) : false; } -enum merge_result bch2_bkey_merge(struct bch_fs *c, - struct bkey_s l, struct bkey_s r) +bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; - enum merge_result ret; - if (bch2_key_merging_disabled || - !ops->key_merge || - l.k->type != r.k->type || - bversion_cmp(l.k->version, r.k->version) || - bpos_cmp(l.k->p, bkey_start_pos(r.k))) - return BCH_MERGE_NOMERGE; - - ret = ops->key_merge(c, l, r); - - if (ret != BCH_MERGE_NOMERGE) - l.k->needs_whiteout |= r.k->needs_whiteout; - return ret; + return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); } static const struct old_bkey_type { diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index bfa6f112..3012035d 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -11,17 +11,6 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; -enum merge_result { - BCH_MERGE_NOMERGE, - - /* - * The keys were mergeable, but would have overflowed size - so instead - * l was changed to the maximum size, and both keys were modified: - */ - BCH_MERGE_PARTIAL, - BCH_MERGE_MERGE, -}; - struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, @@ -30,13 +19,14 @@ struct bkey_ops { struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); - enum merge_result (*key_merge)(struct bch_fs *, - struct bkey_s, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); }; +extern const struct bkey_ops bch2_bkey_ops[]; + const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type); @@ -57,8 +47,17 @@ void bch2_bkey_swab_val(struct bkey_s); bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -enum merge_result bch2_bkey_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) +{ + return l->type == r->type && + !bversion_cmp(l->version, r->version) && + !bpos_cmp(l->p, bkey_start_pos(r)) && + (u64) l->size + r->size <= KEY_SIZE_MAX && + bch2_bkey_ops[l->type].key_merge && + !bch2_key_merging_disabled; +} + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 013cf0b5..12bc2946 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -654,13 +654,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } - /* - * Unlock before doing IO: - * - * XXX: ideally should be dropping all btree node locks here - */ - if (iter && btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); + /* Unlock before doing IO: */ + if (iter && sync) + bch2_trans_unlock(iter->trans); bch2_btree_node_read(c, b, sync); @@ -671,6 +667,16 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } + /* + * XXX: this will probably always fail because btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid btree + * node + */ + if (iter && !bch2_trans_relock(iter->trans)) { + six_unlock_intent(&b->c.lock); + return ERR_PTR(-EINTR); + } + if (lock_type == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); @@ -815,9 +821,22 @@ lock_node: } } - /* XXX: waiting on IO with btree locks held: */ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + if (unlikely(btree_node_read_in_flight(b))) { + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(iter->trans); + + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + /* + * XXX: check if this always fails - btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid + * btree node + */ + if (iter && !bch2_trans_relock(iter->trans)) + return ERR_PTR(-EINTR); + goto retry; + } prefetch(b->aux_data); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index ba560fbd..911196f0 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -36,6 +36,9 @@ #include #include +#define DROP_THIS_NODE 10 +#define DROP_PREV_NODE 11 + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -229,11 +232,19 @@ static int btree_repair_node_start(struct bch_fs *c, struct btree *b, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) { if (prev && bpos_cmp(expected_start, cur->data->min_key) > 0 && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) + BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { + if (bkey_cmp(prev->data->min_key, + cur->data->min_key) <= 0) + return DROP_PREV_NODE; + ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - else + bpos_predecessor(cur->data->min_key)); + } else { + if (bkey_cmp(expected_start, b->data->max_key) >= 0) + return DROP_THIS_NODE; + ret = set_node_min(c, cur, expected_start); + } if (ret) return ret; } @@ -262,13 +273,11 @@ fsck_err: return ret; } -#define DROP_THIS_NODE 10 - static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) { struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_buf tmp; + struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, dropped_children = false; char buf[200]; @@ -278,14 +287,15 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) return 0; again: have_child = dropped_children = false; - bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_init(&prev_k); + bch2_bkey_buf_init(&cur_k); bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_btree_and_journal_iter_advance(&iter); - bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_bkey_buf_reassemble(&cur_k, c, k); - cur = bch2_btree_node_get_noiter(c, tmp.k, + cur = bch2_btree_node_get_noiter(c, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -295,10 +305,10 @@ again: " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) { - bch2_btree_node_evict(c, tmp.k); + (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { + bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, tmp.k->k.p); + b->c.level, cur_k.k->k.p); if (ret) goto err; continue; @@ -313,11 +323,27 @@ again: ret = btree_repair_node_start(c, b, prev, cur); if (prev) six_unlock_read(&prev->c.lock); + + if (ret == DROP_PREV_NODE) { + bch2_btree_node_evict(c, prev_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, prev_k.k->k.p); + if (ret) + goto err; + goto again; + } else if (ret == DROP_THIS_NODE) { + bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + if (ret) + goto err; + continue; + } else if (ret) + break; + prev = cur; cur = NULL; - - if (ret) - break; + bch2_bkey_buf_copy(&prev_k, c, cur_k.k); } if (!ret && !IS_ERR_OR_NULL(prev)) { @@ -339,10 +365,10 @@ again: bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_bkey_buf_reassemble(&cur_k, c, k); bch2_btree_and_journal_iter_advance(&iter); - cur = bch2_btree_node_get_noiter(c, tmp.k, + cur = bch2_btree_node_get_noiter(c, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -358,9 +384,9 @@ again: cur = NULL; if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(c, tmp.k); + bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, tmp.k->k.p); + b->c.level, cur_k.k->k.p); dropped_children = true; } @@ -385,7 +411,8 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&tmp, c); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); if (!ret && dropped_children) goto again; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index cd714dc2..38609422 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -18,6 +18,9 @@ #include static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *); +static void btree_iter_copy(struct btree_iter *, struct btree_iter *); static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { @@ -854,10 +857,9 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, /* peek_all() doesn't skip deleted keys */ static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, - struct btree_iter_level *l, - struct bkey *u) + struct btree_iter_level *l) { - return __btree_iter_unpack(iter, l, u, + return __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } @@ -1184,7 +1186,11 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); + if (btree_node_read_locked(iter, level + 1)) + btree_node_unlock(iter, level + 1); iter->level = level; + + bch2_btree_iter_verify_locks(iter); err: bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1637,15 +1643,18 @@ static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) return ret; } -static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, + struct bpos pos) { struct btree_insert_entry *i; - trans_for_each_update2(trans, i) - if ((cmp_int(btree_id, i->iter->btree_id) ?: - bkey_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->iter->btree_id) + if (!(iter->flags & BTREE_ITER_WITH_UPDATES)) + return NULL; + + trans_for_each_update(iter->trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) { + if (iter->btree_id == i->iter->btree_id) return i->k; break; } @@ -1653,7 +1662,11 @@ static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, return NULL; } -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; @@ -1664,9 +1677,7 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); start: - next_update = with_updates - ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) - : NULL; + next_update = btree_trans_peek_updates(iter, search_key); btree_iter_set_search_pos(iter, search_key); while (1) { @@ -1677,8 +1688,10 @@ start: k = btree_iter_level_peek(iter, &iter->l[0]); if (next_update && - bpos_cmp(next_update->k.p, iter->real_pos) <= 0) + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { + iter->k = next_update->k; k = bkey_i_to_s_c(next_update); + } if (likely(k.k)) { if (bkey_deleted(k.k)) { @@ -1708,15 +1721,6 @@ start: return k; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -{ - return __btree_iter_peek(iter, false); -} - /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -1729,19 +1733,6 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -{ - return __btree_iter_peek(iter, true); -} - -struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_with_updates(iter); -} - /** * bch2_btree_iter_peek_prev: returns first key less than or equal to * iterator's current position @@ -1753,6 +1744,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1814,52 +1806,9 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) return bch2_btree_iter_peek_prev(iter); } -static inline struct bkey_s_c -__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) -{ - struct bkey_s_c k; - struct bpos pos, next_start; - - /* keys & holes can't span inode numbers: */ - if (iter->pos.offset == KEY_OFFSET_MAX) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); - } - - pos = iter->pos; - k = bch2_btree_iter_peek(iter); - iter->pos = pos; - - if (bkey_err(k)) - return k; - - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) - return k; - - next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; - - bkey_init(&iter->k); - iter->k.p = iter->pos; - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next_start.inode == iter->pos.inode - ? next_start.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); - - EBUG_ON(!iter->k.size); - - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - - return (struct bkey_s_c) { &iter->k, NULL }; -} - struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; + struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; int ret; @@ -1867,24 +1816,78 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + btree_iter_set_search_pos(iter, search_key); - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return __bch2_btree_iter_peek_slot_extents(iter); + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + } ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - k = btree_iter_level_peek_all(iter, l, &iter->k); + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + struct bkey_i *next_update = btree_trans_peek_updates(iter, search_key); - EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + k = btree_iter_level_peek_all(iter, &iter->l[0]); + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); - if (!k.k || bkey_cmp(iter->pos, k.k->p)) { - /* hole */ - bkey_init(&iter->k); - iter->k.p = iter->pos; - k = (struct bkey_s_c) { &iter->k, NULL }; + if (next_update && + (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + } else { + if ((iter->flags & BTREE_ITER_INTENT)) { + struct btree_iter *child = + btree_iter_child_alloc(iter, _THIS_IP_); + + btree_iter_copy(child, iter); + k = bch2_btree_iter_peek(child); + + if (k.k && !bkey_err(k)) + iter->k = child->k; + } else { + struct bpos pos = iter->pos; + + k = bch2_btree_iter_peek(iter); + iter->pos = pos; + } + + if (unlikely(bkey_err(k))) + return k; + } + + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (!k.k || + ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) + ? bpos_cmp(iter->pos, k.k->p) + : bkey_cmp(iter->pos, k.k->p))) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + k = (struct bkey_s_c) { &iter->k, NULL }; + } + } else { + struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX; + + if (bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + + k = (struct bkey_s_c) { &iter->k, NULL }; + EBUG_ON(!k.k->size); + } } bch2_btree_iter_verify_entry_exit(iter); @@ -1912,12 +1915,17 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) { + struct bkey_i *next_update; struct bkey_cached *ck; int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); bch2_btree_iter_verify(iter); + next_update = btree_trans_peek_updates(iter, iter->pos); + if (next_update && !bpos_cmp(next_update->k.p, iter->pos)) + return bkey_i_to_s_c(next_update); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1956,9 +1964,39 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, /* new transactional stuff: */ +static void btree_iter_child_free(struct btree_iter *iter) +{ + struct btree_iter *child = btree_iter_child(iter); + + if (child) { + bch2_trans_iter_free(iter->trans, child); + iter->child_idx = U8_MAX; + } +} + +static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, + unsigned long ip) +{ + struct btree_trans *trans = iter->trans; + struct btree_iter *child = btree_iter_child(iter); + + if (!child) { + child = btree_trans_iter_alloc(trans); + child->ip_allocated = ip; + iter->child_idx = child->idx; + + trans->iters_live |= 1ULL << child->idx; + trans->iters_touched |= 1ULL << child->idx; + } + + return child; +} + static inline void __bch2_trans_iter_free(struct btree_trans *trans, unsigned idx) { + btree_iter_child_free(&trans->iters[idx]); + __bch2_btree_iter_unlock(&trans->iters[idx]); trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); @@ -2026,6 +2064,7 @@ static void btree_trans_iter_alloc_fail(struct btree_trans *trans) static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { + struct btree_iter *iter; unsigned idx; if (unlikely(trans->iters_linked == @@ -2033,21 +2072,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) btree_trans_iter_alloc_fail(trans); idx = __ffs64(~trans->iters_linked); + iter = &trans->iters[idx]; + iter->trans = trans; + iter->idx = idx; + iter->child_idx = U8_MAX; + iter->flags = 0; + iter->nodes_locked = 0; + iter->nodes_intent_locked = 0; trans->iters_linked |= 1ULL << idx; - trans->iters[idx].idx = idx; - trans->iters[idx].flags = 0; - return &trans->iters[idx]; + return iter; } -static inline void btree_iter_copy(struct btree_iter *dst, - struct btree_iter *src) +static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) { - unsigned i, idx = dst->idx; + unsigned i; - *dst = *src; - dst->idx = idx; - dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + __bch2_btree_iter_unlock(dst); + btree_iter_child_free(dst); + + memcpy(&dst->flags, &src->flags, + sizeof(struct btree_iter) - offsetof(struct btree_iter, flags)); for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(dst, i)) @@ -2237,6 +2282,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) p = trans->mem + trans->mem_top; trans->mem_top += size; + memset(p, 0, size); return p; } @@ -2267,7 +2313,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; trans->nr_updates = 0; - trans->nr_updates2 = 0; trans->mem_top = 0; trans->hooks = NULL; @@ -2305,7 +2350,6 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) trans->iters = p; p += iters_bytes; trans->updates = p; p += updates_bytes; - trans->updates2 = p; p += updates_bytes; } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, @@ -2351,6 +2395,13 @@ int bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG + if (trans->iters_live) { + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + btree_iter_child_free(iter); + } + if (trans->iters_live) { struct btree_iter *iter; @@ -2502,7 +2553,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return init_srcu_struct(&c->btree_trans_barrier) ?: mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + - sizeof(struct btree_insert_entry) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index a2ce711f..ba98cfea 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -153,9 +153,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); - struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -181,6 +178,12 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->should_be_locked = false; } +static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) +{ + return iter->child_idx == U8_MAX ? NULL + : iter->trans->iters + iter->child_idx; +} + /* Sort order for locking btree iterators: */ static inline int btree_iter_lock_cmp(const struct btree_iter *l, const struct btree_iter *r) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index bc0f482b..b37096f7 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -209,12 +209,13 @@ enum btree_iter_type { * @pos or the first key strictly greater than @pos */ #define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_ERROR (1 << 7) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) -#define BTREE_ITER_CACHED_NOFILL (1 << 9) -#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -#define BTREE_ITER_NOT_EXTENTS (1 << 11) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_NOT_EXTENTS (1 << 7) +#define BTREE_ITER_ERROR (1 << 8) +#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9) +#define BTREE_ITER_CACHED_NOFILL (1 << 10) +#define BTREE_ITER_CACHED_NOCREATE (1 << 11) +#define BTREE_ITER_WITH_UPDATES (1 << 12) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -241,15 +242,20 @@ enum btree_iter_uptodate { */ struct btree_iter { struct btree_trans *trans; - struct bpos pos; - /* what we're searching for/what the iterator actually points to: */ - struct bpos real_pos; - struct bpos pos_after_commit; + unsigned long ip_allocated; + + u8 idx; + u8 child_idx; + + /* btree_iter_copy starts here: */ + u16 flags; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ unsigned snapshot; - u16 flags; - u8 idx; + struct bpos pos; + struct bpos real_pos; + struct bpos pos_after_commit; enum btree_id btree_id:4; enum btree_iter_uptodate uptodate:3; @@ -276,7 +282,6 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; - unsigned long ip_allocated; }; static inline enum btree_iter_type @@ -340,7 +345,6 @@ struct btree_insert_entry { enum btree_id btree_id:8; u8 level; unsigned trans_triggers_run:1; - unsigned is_extent:1; struct bkey_i *k; struct btree_iter *iter; }; @@ -376,7 +380,6 @@ struct btree_trans { int srcu_idx; u8 nr_updates; - u8 nr_updates2; unsigned used_mempool:1; unsigned error:1; unsigned in_traverse_all:1; @@ -391,7 +394,6 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; - struct btree_insert_entry *updates2; /* update path: */ struct btree_trans_commit_hook *hooks; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 56131ac5..cbfc8544 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -140,9 +140,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_update2(_trans, _i) \ - for ((_i) = (_trans)->updates2; \ - (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ - (_i)++) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 0d566be7..482d583e 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -32,7 +32,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates2 && + return i != trans->updates && iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; } @@ -222,7 +222,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); } @@ -400,7 +400,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, h = h->next; } - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -458,10 +458,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (bch2_inject_invalid_keys) - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) i->k->k.version = MAX_VERSION; } @@ -476,7 +476,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); err: if (marking) { @@ -504,7 +504,7 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree BUG_ON(iter->level); - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { if (iter_l(i->iter)->b != b) continue; @@ -535,7 +535,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_iter *iter; int ret; - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { struct btree *b; BUG_ON(!btree_node_intent_locked(i->iter, i->level)); @@ -552,7 +552,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) BUG_ON(!btree_node_intent_locked(i->iter, i->level)); ret = bch2_journal_preres_get(&c->journal, @@ -592,7 +592,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { const char *invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type); if (invalid) { @@ -606,14 +606,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } bch2_btree_trans_verify_locks(trans); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(c, iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, i->iter); @@ -775,132 +775,117 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static void __bch2_trans_update2(struct btree_trans *trans, - struct btree_insert_entry n) -{ - struct btree_insert_entry *i; - - btree_insert_entry_checks(trans, &n); - - EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); - - n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - - trans_for_each_update2(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) - break; - - if (i < trans->updates2 + trans->nr_updates2 && - !btree_insert_entry_cmp(&n, i)) - *i = n; - else - array_insert_item(trans->updates2, trans->nr_updates2, - i - trans->updates2, n); -} - -static void bch2_trans_update2(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - __bch2_trans_update2(trans, (struct btree_insert_entry) { - .bkey_type = __btree_node_type(iter->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->level, - .iter = iter, - .k = insert, - }); -} - -static int extent_update_to_keys(struct btree_trans *trans, - struct btree_insert_entry n) +static int __btree_delete_at(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned trigger_flags) { + struct btree_iter *iter; + struct bkey_i *update; int ret; - ret = bch2_extent_can_insert(trans, n.iter, n.k); - if (ret) + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) return ret; - if (bkey_deleted(&n.k->k)) - return 0; + bkey_init(&update->k); + update->k.p = pos; - n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - n.is_extent = false; - - __bch2_trans_update2(trans, n); - bch2_trans_iter_put(trans, n.iter); + iter = bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + bch2_trans_update(trans, iter, update, trigger_flags); + bch2_trans_iter_put(trans, iter); return 0; } static int extent_handle_overwrites(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_i *insert) + struct btree_insert_entry *i) { + struct bch_fs *c = trans->c; struct btree_iter *iter, *update_iter; - struct bpos start = bkey_start_pos(&insert->k); + struct bpos start = bkey_start_pos(&i->k->k); struct bkey_i *update; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, btree_id, start, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_with_updates(iter); + iter = bch2_trans_get_iter(trans, i->btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek(iter); + if (!k.k || (ret = bkey_err(k))) + goto out; - while (k.k && !(ret = bkey_err(k))) { - if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) - break; + if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { + struct bpos l_pos = k.k->p; + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto out; + + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { + ret = __btree_delete_at(trans, i->btree_id, l_pos, + i->trigger_flags); + if (ret) + goto out; + + i->k = update; + goto next; + } + } + + if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) + goto next; + + while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - break; + goto out; bkey_reassemble(update, k); bch2_cut_back(start, update); - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); + bch2_trans_update(trans, update_iter, update, i->trigger_flags); bch2_trans_iter_put(trans, update_iter); } - if (bkey_cmp(k.k->p, insert->k.p) < 0 || - (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); - if ((ret = PTR_ERR_OR_ZERO(update))) - break; - - bkey_init(&update->k); - update->k.p = k.k->p; - - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); - bch2_trans_iter_put(trans, update_iter); + if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { + ret = __btree_delete_at(trans, i->btree_id, k.k->p, + i->trigger_flags); + if (ret) + goto out; } - if (bkey_cmp(k.k->p, insert->k.p) > 0) { + if (bkey_cmp(k.k->p, i->k->k.p) > 0) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - break; + goto out; bkey_reassemble(update, k); - bch2_cut_front(insert->k.p, update); + bch2_cut_front(i->k->k.p, update); - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); + bch2_trans_update(trans, update_iter, update, + i->trigger_flags); bch2_trans_iter_put(trans, update_iter); - break; + goto out; } - - k = bch2_btree_iter_next_with_updates(iter); +next: + k = bch2_btree_iter_next(iter); + if (!k.k || (ret = bkey_err(k))) + goto out; } + + bch2_bkey_merge(c, bkey_i_to_s(i->k), k); +out: bch2_trans_iter_put(trans, iter); return ret; @@ -966,23 +951,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } } while (trans_trigger_run); - /* Turn extents updates into keys: */ - trans_for_each_update(trans, i) - if (i->is_extent) { - ret = extent_handle_overwrites(trans, i->btree_id, i->k); - if (unlikely(ret)) - goto out; - } - trans_for_each_update(trans, i) { - ret = i->is_extent - ? extent_update_to_keys(trans, *i) - : (__bch2_trans_update2(trans, *i), 0); - if (unlikely(ret)) - goto out; - } - - trans_for_each_update2(trans, i) { ret = bch2_btree_iter_traverse(i->iter); if (unlikely(ret)) { trace_trans_restart_traverse(trans->ip, _RET_IP_, @@ -1051,117 +1020,66 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .bkey_type = __btree_node_type(iter->level, iter->btree_id), .btree_id = iter->btree_id, .level = iter->level, - .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, .iter = iter, .k = k }; + bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; + int ret = 0; BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); #ifdef CONFIG_BCACHEFS_DEBUG BUG_ON(bkey_cmp(iter->pos, - n.is_extent ? bkey_start_pos(&k->k) : k->k.p)); + is_extent ? bkey_start_pos(&k->k) : k->k.p)); trans_for_each_update(trans, i) { - BUG_ON(bkey_cmp(i->iter->pos, - i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); + BUG_ON(bkey_cmp(i->iter->pos, i->k->k.p)); BUG_ON(i != trans->updates && btree_insert_entry_cmp(i - 1, i) >= 0); } #endif - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + if (is_extent) { + ret = bch2_extent_can_insert(trans, n.iter, n.k); + if (ret) + return ret; + + ret = extent_handle_overwrites(trans, &n); + if (ret) + return ret; - if (n.is_extent) { iter->pos_after_commit = k->k.p; iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + + if (bkey_deleted(&n.k->k)) + return 0; + + n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_put(trans, n.iter); } + BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); + + n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + /* * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - if (!n.is_extent) { - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) - break; - - if (i < trans->updates + trans->nr_updates && - !btree_insert_entry_cmp(&n, i)) - *i = n; - else - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - } else { - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) < 0) - break; - - while (i > trans->updates && - i[-1].btree_id == n.btree_id && - bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i[-1].k->k)) <= 0) { - --i; - array_remove_item(trans->updates, trans->nr_updates, - i - trans->updates); - } - - if (i > trans->updates && - i[-1].btree_id == n.btree_id && - bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) - bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); - - if (i < trans->updates + trans->nr_updates && - i->btree_id == n.btree_id && - bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { - if (bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i->k->k)) > 0) { - struct btree_insert_entry split = *i; - int ret; - - BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX); - - split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k)); - ret = PTR_ERR_OR_ZERO(split.k); - if (ret) - return ret; - - bkey_copy(split.k, i->k); - bch2_cut_back(bkey_start_pos(&n.k->k), split.k); - - split.iter = bch2_trans_get_iter(trans, split.btree_id, - bkey_start_pos(&split.k->k), - BTREE_ITER_INTENT); - split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, split.iter); - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, split); - i++; - } - - /* - * When we have an extent that overwrites the start of another - * update, trimming that extent will mean the iterator's - * position has to change since the iterator position has to - * match the extent's start pos - but we don't want to change - * the iterator pos if some other code is using it, so we may - * need to clone it: - */ - if (btree_iter_live(trans, i->iter)) { - i->iter = bch2_trans_copy_iter(trans, i->iter); - - i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, i->iter); - } - - bch2_cut_front(n.k->k.p, i->k); - bch2_btree_iter_set_pos(i->iter, n.k->k.p); - } + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) + break; + if (i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->trans_triggers_run); + *i = n; + } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - } return 0; } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 76d15a5d..20862a4a 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1079,32 +1079,6 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } -static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 p_start, u64 p_end, - u64 v_start, u64 v_end) -{ - if (p_start == p_end) - return false; - - p_start += le64_to_cpu(p.v->idx); - p_end += le64_to_cpu(p.v->idx); - - if (p_end <= v_start) - return false; - if (p_start >= v_end) - return false; - return true; -} - -static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 start, u64 end, - struct bkey_s_c k) -{ - return __reflink_p_frag_references(p, start, end, - bkey_start_offset(k.k), - k.k->p.offset); -} - static int __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, @@ -1115,7 +1089,6 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - int frags_referenced; while (1) { if (*r_idx >= c->reflink_gc_nr) @@ -1128,20 +1101,6 @@ static int __bch2_mark_reflink_p(struct bch_fs *c, (*r_idx)++; } - frags_referenced = - __reflink_p_frag_references(p, 0, front_frag, - r->offset - r->size, r->offset) + - __reflink_p_frag_references(p, back_frag, p.k->size, - r->offset - r->size, r->offset); - - if (frags_referenced == 2) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); - add = -add; - } else if (frags_referenced == 1) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); - add = 0; - } - BUG_ON((s64) r->refcount + add < 0); r->refcount += add; @@ -1515,29 +1474,6 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans, return NULL; } -static int trans_get_key(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct btree_iter **iter, - struct bkey_s_c *k) -{ - unsigned flags = btree_id != BTREE_ID_alloc - ? BTREE_ITER_SLOTS - : BTREE_ITER_CACHED; - int ret; - - *iter = trans_get_update(trans, btree_id, pos, k); - if (*iter) - return 1; - - *iter = bch2_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_INTENT); - *k = __bch2_btree_iter_peek(*iter, flags); - ret = bkey_err(*k); - if (ret) - bch2_trans_iter_put(trans, *iter); - return ret; -} - static struct bkey_alloc_buf * bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, const struct bch_extent_ptr *ptr, @@ -1617,9 +1553,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); - if (ret < 0) - return ret; + iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; if (k.k->type != KEY_TYPE_stripe) { bch2_fs_inconsistent(c, @@ -1627,7 +1567,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, (u64) p.ec.idx); bch2_inconsistent_error(c); ret = -EIO; - goto out; + goto err; } if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { @@ -1635,13 +1575,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = -EIO; - goto out; + goto err; } s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(s); if (ret) - goto out; + goto err; bkey_reassemble(&s->k_i, k); stripe_blockcount_set(&s->v, p.ec.block, @@ -1652,7 +1592,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; update_replicas_list(trans, &r.e, sectors); -out: +err: bch2_trans_iter_put(trans, iter); return ret; } @@ -1821,8 +1761,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, - unsigned front_frag, - unsigned back_frag, unsigned flags) { struct bch_fs *c = trans->c; @@ -1831,28 +1769,18 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - int frags_referenced; s64 ret; - ret = trans_get_key(trans, BTREE_ID_reflink, - POS(0, idx), &iter, &k); - if (ret < 0) - return ret; + iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; sectors = min_t(u64, sectors, k.k->p.offset - idx); - frags_referenced = - reflink_p_frag_references(p, 0, front_frag, k) + - reflink_p_frag_references(p, back_frag, p.k->size, k); - - if (frags_referenced == 2) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); - add = -add; - } else if (frags_referenced == 1) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); - goto out; - } - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1882,7 +1810,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ret = bch2_trans_update(trans, iter, n, 0); if (ret) goto err; -out: + ret = sectors; err: bch2_trans_iter_put(trans, iter); @@ -1894,20 +1822,15 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, s64 sectors, unsigned flags) { u64 idx = le64_to_cpu(p.v->idx) + offset; - unsigned front_frag, back_frag; s64 ret = 0; if (sectors < 0) sectors = -sectors; - BUG_ON(offset + sectors > p.k->size); - - front_frag = offset; - back_frag = offset + sectors; + BUG_ON(offset || sectors != p.k->size); while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, - front_frag, back_frag, flags); + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); if (ret < 0) return ret; @@ -1990,86 +1913,27 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - if (!btree_node_type_is_extents(iter->btree_id)) { - if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - old = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(old); - if (ret) - return ret; - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; - - BUG_ON(!ck->valid); - old = bkey_i_to_s_c(ck->k); - } - - if (old.k->type == new->k.type) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_OVERWRITE|flags); - } - } else { - struct btree_iter *copy; - struct bkey _old; - - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - - bkey_init(&_old); - old = (struct bkey_s_c) { &_old, NULL }; - - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - 0, new->k.size, - BTREE_TRIGGER_INSERT); + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + old = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(old); if (ret) return ret; + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - copy = bch2_trans_copy_iter(trans, iter); + BUG_ON(!ck->valid); + old = bkey_i_to_s_c(ck->k); + } - for_each_btree_key_continue(copy, 0, old, ret) { - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - break; - - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - offset, sectors, flags); - if (ret) - break; - } - bch2_trans_iter_put(trans, copy); + if (old.k->type == new->k.type && + !btree_node_type_is_extents(iter->btree_id)) { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, + BTREE_TRIGGER_INSERT|flags) ?: + bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, -((s64) old.k->size), + BTREE_TRIGGER_OVERWRITE|flags); } return ret; diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index db6e4f6c..48f9232e 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -392,7 +392,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { - unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, DIV_ROUND_UP(bytes, PAGE_SIZE)); unsigned b = min_t(size_t, bytes - offset, nr_iovecs << PAGE_SHIFT); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index b07d3955..3968f1fd 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -230,112 +230,134 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; - - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; - - crc_l = bch2_extent_crc_unpack(l.k, NULL); - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; + return false; - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; - - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); - - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; - - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; - - if (crc_is_compressed(crc_l)) - return BCH_MERGE_NOMERGE; - - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; - - if (crc_l.uncompressed_size + crc_r.uncompressed_size > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; - - break; - default: - return BCH_MERGE_NOMERGE; - } + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); - if (!extent_entry_is_crc(en_l)) - continue; + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.has_ec != rp.has_ec) + return false; - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset ) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > c->sb.encoded_extent_max) + return false; + + if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset ) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* KEY_TYPE_reservation: */ @@ -363,25 +385,17 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, r.v->nr_replicas); } -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); if (l.v->generation != r.v->generation || l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* Extent checksum entries: */ diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 9999805f..3f6224f7 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -394,8 +394,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_extent_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent (struct bkey_ops) { \ .key_invalid = bch2_extent_invalid, \ @@ -409,8 +408,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reservation_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation (struct bkey_ops) { \ .key_invalid = bch2_reservation_invalid, \ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index ef289955..4ec3360b 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -893,7 +893,7 @@ void bch2_readahead(struct readahead_control *ractl) unsigned n = min_t(unsigned, readpages_iter.nr_pages - readpages_iter.idx, - BIO_MAX_PAGES); + BIO_MAX_VECS); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), opts); @@ -1102,8 +1102,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, { struct bch_write_op *op; - w->io = container_of(bio_alloc_bioset(GFP_NOFS, - BIO_MAX_PAGES, + w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); @@ -1226,7 +1225,7 @@ do_io: (w->io->op.res.nr_replicas != nr_replicas_this_write || bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= - (BIO_MAX_PAGES * PAGE_SIZE) || + (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1690,7 +1689,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) iter->count -= shorten; bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -1725,7 +1724,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->bio_read); bio->bi_end_io = bch2_direct_IO_read_split_endio; start: @@ -2029,7 +2028,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); init_completion(&dio->done); @@ -2271,7 +2270,8 @@ static int bch2_extend(struct bch_inode_info *inode, return ret; truncate_setsize(&inode->v, iattr->ia_size); - setattr_copy(&inode->v, iattr); + /* ATTR_MODE will never be set here, ns argument isn't needed: */ + setattr_copy(NULL, &inode->v, iattr); mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, inode->v.i_size, @@ -2389,7 +2389,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (unlikely(ret)) goto err; - setattr_copy(&inode->v, iattr); + /* ATTR_MODE will never be set here, ns argument isn't needed: */ + setattr_copy(NULL, &inode->v, iattr); mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index d8cc32e0..91a0e761 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -81,7 +81,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(&inode->v)) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ret = -EACCES; goto setflags_out; } @@ -152,7 +152,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(&inode->v)) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ret = -EACCES; goto err; } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c567e176..a95358dd 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -244,11 +244,11 @@ static int inum_test(struct inode *inode, void *p) } static struct bch_inode_info * -__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, +__bch2_create(struct user_namespace *mnt_userns, + struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, bool tmpfile) { struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct user_namespace *ns = dir->v.i_sb->s_user_ns; struct btree_trans trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; @@ -284,8 +284,8 @@ retry: ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, !tmpfile ? &dentry->d_name : NULL, - from_kuid(ns, current_fsuid()), - from_kgid(ns, current_fsgid()), + from_kuid(mnt_userns, current_fsuid()), + from_kgid(mnt_userns, current_fsgid()), mode, rdev, default_acl, acl) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -382,11 +382,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, return d_splice_alias(vinode, dentry); } -static int bch2_mknod(struct inode *vdir, struct dentry *dentry, +static int bch2_mknod(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -395,10 +396,11 @@ static int bch2_mknod(struct inode *vdir, struct dentry *dentry, return 0; } -static int bch2_create(struct inode *vdir, struct dentry *dentry, +static int bch2_create(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { - return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); + return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, @@ -488,14 +490,15 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return ret; } -static int bch2_symlink(struct inode *vdir, struct dentry *dentry, +static int bch2_symlink(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, const char *symname) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -523,12 +526,14 @@ err: return ret; } -static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_mkdir(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); + return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); } -static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, +static int bch2_rename2(struct user_namespace *mnt_userns, + struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { @@ -642,7 +647,8 @@ err: return ret; } -void bch2_setattr_copy(struct bch_inode_info *inode, +void bch2_setattr_copy(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct iattr *attr) { @@ -650,9 +656,9 @@ void bch2_setattr_copy(struct bch_inode_info *inode, unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); + bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); + bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); @@ -668,13 +674,14 @@ void bch2_setattr_copy(struct bch_inode_info *inode, : inode->v.i_gid; if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) + !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) mode &= ~S_ISGID; bi->bi_mode = mode; } } -static int bch2_setattr_nonsize(struct bch_inode_info *inode, +static int bch2_setattr_nonsize(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -712,7 +719,7 @@ retry: if (ret) goto btree_err; - bch2_setattr_copy(inode, &inode_u, attr); + bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); @@ -745,7 +752,8 @@ err: return ret; } -static int bch2_getattr(const struct path *path, struct kstat *stat, +static int bch2_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned query_flags) { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); @@ -785,26 +793,28 @@ static int bch2_getattr(const struct path *path, struct kstat *stat, return 0; } -static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) +static int bch2_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); int ret; lockdep_assert_held(&inode->v.i_rwsem); - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(mnt_userns, dentry, iattr); if (ret) return ret; return iattr->ia_valid & ATTR_SIZE ? bch2_truncate(inode, iattr) - : bch2_setattr_nonsize(inode, iattr); + : bch2_setattr_nonsize(mnt_userns, inode, iattr); } -static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_tmpfile(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index a4207292..ba700810 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -42,24 +42,22 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); } -enum merge_result bch2_reflink_p_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); - struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); + + /* + * Disabled for now, the triggers code needs to be reworked for merging + * of reflink pointers to work: + */ + return false; if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, _r); - return BCH_MERGE_PARTIAL; - } + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* indirect extents */ @@ -84,6 +82,14 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + /* indirect inline data */ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, @@ -138,7 +144,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, /* rewind iter to start of hole, if necessary: */ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; @@ -159,12 +165,6 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (ret) goto err; - r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); - if (IS_ERR(r_p)) { - ret = PTR_ERR(r_p); - goto err; - } - orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index bfc78561..68c5cb5a 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -5,8 +5,7 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reflink_p_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index e7b40b3c..8bd7553b 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -323,6 +323,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, } static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *vinode, const char *name, const void *value, size_t size, int flags) @@ -455,6 +456,7 @@ static int inode_opt_set_fn(struct bch_inode_info *inode, } static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *vinode, const char *name, const void *value, size_t size, int flags)