From 72a408f84846fe702b8db4f158b678ee20bbf835 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 21 Aug 2019 13:17:42 -0400
Subject: [PATCH] Update bcachefs sources to ece184f718 bcachefs: Reflink

---
 .bcachefs_revision                  |   2 +-
 include/linux/sched/signal.h        |   0
 libbcachefs/alloc_background.c      |   5 +-
 libbcachefs/bcachefs.h              |   4 +
 libbcachefs/bcachefs_format.h       |  26 +-
 libbcachefs/bkey.h                  |   4 +-
 libbcachefs/bkey_methods.c          |   8 +-
 libbcachefs/bkey_methods.h          |   2 +-
 libbcachefs/bset.c                  |  78 ++-
 libbcachefs/bset.h                  |   8 +-
 libbcachefs/btree_gc.c              |   5 +-
 libbcachefs/btree_iter.c            | 120 ++--
 libbcachefs/btree_iter.h            |   2 +-
 libbcachefs/btree_types.h           |   9 +-
 libbcachefs/btree_update_interior.c |  21 +-
 libbcachefs/btree_update_leaf.c     |  13 +-
 libbcachefs/buckets.c               | 246 +++++---
 libbcachefs/buckets.h               |  16 +-
 libbcachefs/ec.c                    |  48 +-
 libbcachefs/extents.c               | 317 +++++++----
 libbcachefs/extents.h               | 126 ++---
 libbcachefs/fs-io.c                 | 847 ++++++++++++++++++----------
 libbcachefs/fs-io.h                 |  19 +
 libbcachefs/fs.c                    |  88 ++-
 libbcachefs/io.c                    | 213 ++++---
 libbcachefs/io.h                    |  15 +-
 libbcachefs/io_types.h              |   2 +
 libbcachefs/migrate.c               |  16 +-
 libbcachefs/move.c                  | 122 ++--
 libbcachefs/move.h                  |   3 +-
 libbcachefs/movinggc.c              |  27 +-
 libbcachefs/rebalance.c             |   6 +-
 libbcachefs/recovery.c              |  30 +-
 libbcachefs/reflink.c               | 300 ++++++++++
 libbcachefs/reflink.h               |  32 ++
 libbcachefs/replicas.c              |   1 +
 36 files changed, 1894 insertions(+), 887 deletions(-)
 create mode 100644 include/linux/sched/signal.h
 create mode 100644 libbcachefs/reflink.c
 create mode 100644 libbcachefs/reflink.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 31858ae6..fd1cda2a 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-22776fe9902b0b06d6aa18cd4c7f0c5ad35a95fa
+ece184f718c2b678738bc2c42906e90eeb8ba7dc
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
new file mode 100644
index 00000000..e69de29b
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 43dc2f27..4cf728ce 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -232,7 +232,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
-		bch2_mark_key(c, k, 0, NULL, 0,
+		bch2_mark_key(c, k, 0, 0, NULL, 0,
 			      BCH_BUCKET_MARK_ALLOC_READ|
 			      BCH_BUCKET_MARK_NOATOMIC);
 
@@ -244,7 +244,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 
 	for_each_journal_key(*journal_keys, j)
 		if (j->btree_id == BTREE_ID_ALLOC)
-			bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0,
+			bch2_mark_key(c, bkey_i_to_s_c(j->k),
+				      0, 0, NULL, 0,
 				      BCH_BUCKET_MARK_ALLOC_READ|
 				      BCH_BUCKET_MARK_NOATOMIC);
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 907d1b60..1e601e7b 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -359,6 +359,7 @@ enum gc_phase {
 	GC_PHASE_BTREE_XATTRS,
 	GC_PHASE_BTREE_ALLOC,
 	GC_PHASE_BTREE_QUOTAS,
+	GC_PHASE_BTREE_REFLINK,
 
 	GC_PHASE_PENDING_DELETE,
 	GC_PHASE_ALLOC,
@@ -746,6 +747,9 @@ struct bch_fs {
 	struct work_struct	ec_stripe_delete_work;
 	struct llist_head	ec_stripe_delete_list;
 
+	/* REFLINK */
+	u64			reflink_hint;
+
 	/* VFS IO PATH - fs-io.c */
 	struct bio_set		writepage_bioset;
 	struct bio_set		dio_write_bioset;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 13285936..667170b5 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -336,7 +336,9 @@ static inline void bkey_init(struct bkey
*k) x(xattr, 11) \ x(alloc, 12) \ x(quota, 13) \ - x(stripe, 14) + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -891,6 +893,24 @@ struct bch_stripe { struct bch_extent_ptr ptrs[0]; } __attribute__((packed, aligned(8))); +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + + __le32 reservation_generation; + __u8 nr_replicas; + __u8 pad[3]; +}; + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1293,6 +1313,7 @@ enum bch_sb_features { BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NR, }; @@ -1480,7 +1501,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") + x(EC, 6, "erasure_coding") \ + x(REFLINK, 7, "reflink") enum btree_id { #define x(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 1acff9d0..5ef66aed 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -50,7 +50,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); } -#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k)) +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) @@ -552,6 +552,8 @@ BKEY_VAL_ACCESSORS(xattr); BKEY_VAL_ACCESSORS(alloc); BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); +BKEY_VAL_ACCESSORS(reflink_p); +BKEY_VAL_ACCESSORS(reflink_v); /* byte order helpers */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 27f196ef..6fa6ac1f 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -10,9 +10,10 @@ #include "extents.h" #include "inode.h" #include "quota.h" +#include "reflink.h" #include "xattr.h" -const char * const bch_bkey_types[] = { +const char * const bch2_bkey_types[] = { #define x(name, nr) #name, BCH_BKEY_TYPES() #undef x @@ -159,7 +160,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { - pr_buf(out, "u64s %u type %u ", k->u64s, k->type); + pr_buf(out, "u64s %u type %s ", k->u64s, + bch2_bkey_types[k->type]); bch2_bpos_to_text(out, k->p); @@ -174,8 +176,6 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, if (likely(ops->val_to_text)) ops->val_to_text(out, c, k); - else - pr_buf(out, " %s", bch_bkey_types[k.k->type]); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 08b97663..e6e97cda 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -9,7 +9,7 @@ struct btree; struct bkey; enum btree_node_type; -extern const char * const bch_bkey_types[]; +extern const char * const bch2_bkey_types[]; enum merge_result { BCH_MERGE_NOMERGE, diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index ef10e77e..32436ed5 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -24,6 +24,16 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) +{ 
+ unsigned n = ARRAY_SIZE(iter->data); + + while (n && __btree_node_iter_set_end(iter, n - 1)) + --n; + + return n; +} + struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); @@ -110,7 +120,8 @@ void bch2_dump_btree_node_iter(struct btree *b, { struct btree_node_iter_set *set; - printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); btree_node_iter_for_each(iter, set) { struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); @@ -119,8 +130,8 @@ void bch2_dump_btree_node_iter(struct btree *b, char buf[100]; bch2_bkey_to_text(&PBUF(buf), &uk); - printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, - k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); + printk(KERN_ERR "set %zu key %u: %s\n", + t - b->set, set->k, buf); } } @@ -182,8 +193,12 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) { struct btree_node_iter_set *set, *s2; + struct bkey_packed *k, *p; struct bset_tree *t; + if (bch2_btree_node_iter_end(iter)) + return; + /* Verify no duplicates: */ btree_node_iter_for_each(iter, set) btree_node_iter_for_each(iter, s2) @@ -204,6 +219,18 @@ found: btree_node_iter_for_each(iter, set) BUG_ON(set != iter->data && btree_node_iter_cmp(b, set[-1], set[0]) > 0); + + k = bch2_btree_node_iter_peek_all(iter, b); + + for_each_bset(b, t) { + if (iter->data[0].end == t->end_offset) + continue; + + p = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + + BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); + } } void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, @@ -1669,25 +1696,13 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, __bch2_btree_node_iter_advance(iter, b); } -static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -{ - unsigned n = ARRAY_SIZE(iter->data); - - while (n && __btree_node_iter_set_end(iter, n - 1)) - --n; - - return n; -} - /* * Expensive: */ -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree *b) { struct bkey_packed *k, *prev = NULL; - struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; unsigned end = 0; @@ -1695,9 +1710,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { - k = bch2_bkey_prev_filter(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t), - min_key_type); + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); if (k && (!prev || bkey_iter_cmp(b, k, prev) > 0)) { prev = k; @@ -1706,7 +1720,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite } if (!prev) - goto out; + return NULL; /* * We're manually memmoving instead of just calling sort() to ensure the @@ -1727,18 +1741,20 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; -out: - if (btree_keys_expensive_checks(b)) { - struct btree_node_iter iter2 = *iter; - if (prev) - __bch2_btree_node_iter_advance(&iter2, b); + bch2_btree_node_iter_verify(iter, b); + return prev; +} - while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { - BUG_ON(k->type >= min_key_type); - 
__bch2_btree_node_iter_advance(&iter2, b); - } - } +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + struct bkey_packed *prev; + + do { + prev = bch2_btree_node_iter_prev_all(iter, b); + } while (prev && prev->type < min_key_type); return prev; } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 17c23994..643bd9e8 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -528,15 +528,11 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) return ret; } +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, struct btree *, unsigned); -static inline struct bkey_packed * -bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) -{ - return bch2_btree_node_iter_prev_filter(iter, b, 0); -} - static inline struct bkey_packed * bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) { diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index a458cfe0..e43d48b8 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -171,7 +171,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, k.k->size, NULL, 0, flags); + bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -418,7 +418,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 8955555d..a28d2dd7 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -86,7 +86,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; unsigned readers = 0; - EBUG_ON(btree_node_read_locked(iter, b->level)); + EBUG_ON(!btree_node_intent_locked(iter, b->level)); trans_for_each_iter(iter->trans, linked) if (linked->l[b->level].b == b && @@ -496,6 +496,23 @@ static inline void __bch2_btree_iter_verify(struct btree_iter *iter, #endif +static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(iter, b); + return; + } + + bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); +} + static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, @@ -527,7 +544,8 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bch2_btree_node_iter_peek_all(node_iter, b), &iter->k); } - return; + + goto iter_current_key_not_modified; found: set->end = t->end_offset; @@ -569,60 +587,42 @@ found: bkey_disassemble(l->b, k, &iter->k); } iter_current_key_not_modified: - /* - * Interior nodes are special because iterators for interior nodes don't - * obey the usual invariants regarding the iterator position: - * - * We may have whiteouts that compare greater than the iterator - * position, and logically should be in the iterator, but that we - * skipped past to 
find the first live key greater than the iterator - * position. This becomes an issue when we insert a new key that is - * greater than the current iterator position, but smaller than the - * whiteouts we've already skipped past - this happens in the course of - * a btree split. - * - * We have to rewind the iterator past to before those whiteouts here, - * else bkey_node_iter_prev() is not going to work and who knows what - * else would happen. And we have to do it manually, because here we've - * already done the insert and the iterator is currently inconsistent: - * - * We've got multiple competing invariants, here - we have to be careful - * about rewinding iterators for interior nodes, because they should - * always point to the key for the child node the btree iterator points - * to. + * When a new key is added, and the node iterator now points to that + * key, the iterator might have skipped past deleted keys that should + * come after the key the iterator now points to. We have to rewind to + * before those deleted keys - otherwise bch2_btree_node_iter_prev_all() + * breaks: */ - if (b->level && new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { + if (!bch2_btree_node_iter_end(node_iter) && + (b->level || + (iter->flags & BTREE_ITER_IS_EXTENTS))) { struct bset_tree *t; - struct bkey_packed *k; + struct bkey_packed *k, *k2, *p; + + k = bch2_btree_node_iter_peek_all(node_iter, b); for_each_bset(b, t) { - if (bch2_bkey_to_bset(b, where) == t) + bool set_pos = false; + + if (node_iter->data[0].end == t->end_offset) continue; - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(node_iter, b, t)); - if (k && - bkey_iter_cmp(b, k, where) > 0) { - struct btree_node_iter_set *set; - unsigned offset = - __btree_node_key_to_offset(b, bkey_next(k)); + k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); - btree_node_iter_for_each(node_iter, set) - if (set->k == offset) { - set->k = __btree_node_key_to_offset(b, k); - bch2_btree_node_iter_sort(node_iter, b); - goto next_bset; - } - - bch2_btree_node_iter_push(node_iter, b, k, - btree_bkey_last(b, t)); + while ((p = bch2_bkey_prev_all(b, t, k2)) && + bkey_iter_cmp(b, k, p) < 0) { + k2 = p; + set_pos = true; } -next_bset: - t = t; + + if (set_pos) + btree_node_iter_set_set_pos(node_iter, + b, t, k2); } } + + bch2_btree_node_iter_verify(node_iter, b); } void bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -1436,8 +1436,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) recheck: while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_deleted(k.k) && - bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) + bkey_cmp(k.k->p, iter->pos) <= 0) bch2_btree_node_iter_advance(&l->iter, l->b); /* @@ -1477,6 +1476,8 @@ recheck: EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); EBUG_ON(bkey_deleted(k.k)); iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); return k; } @@ -1507,6 +1508,8 @@ recheck: iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1539,19 +1542,18 @@ recheck: goto recheck; } - if (k.k && - !bkey_deleted(k.k) && - !bkey_cmp(iter->pos, k.k->p)) { - iter->uptodate = BTREE_ITER_UPTODATE; - return k; - } else { + if (!k.k || + bkey_deleted(k.k) || + bkey_cmp(iter->pos, k.k->p)) { /* hole */ bkey_init(&iter->k); iter->k.p = iter->pos; - - iter->uptodate = BTREE_ITER_UPTODATE; - return (struct bkey_s_c) { &iter->k, NULL }; + k = (struct bkey_s_c) { &iter->k, NULL }; } + + iter->uptodate = 
BTREE_ITER_UPTODATE; + __bch2_btree_iter_verify(iter, l->b); + return k; } struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) @@ -1779,6 +1781,12 @@ found: iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + + if ((iter->flags & BTREE_ITER_INTENT) && + !bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return ERR_PTR(-EINTR); + } } BUG_ON(iter->btree_id != btree_id); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 9483ec89..249df21b 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -242,7 +242,7 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, (_start), (_flags))) ?: \ PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_peek(_iter, _flags)).k); \ - !ret && (_k).k; \ + !_ret && (_k).k; \ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_next(_iter, _flags)).k)) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 91aa30a6..f4e1bfe1 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -461,7 +461,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return type == BKEY_TYPE_EXTENTS; + switch (type) { + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_REFLINK: + return true; + default: + return false; + } } static inline bool btree_node_is_extents(struct btree *b) @@ -477,6 +483,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: case BKEY_TYPE_EC: + case BKEY_TYPE_REFLINK: return true; default: return false; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 92941377..6813eddd 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -194,7 +194,7 @@ found: : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE| BCH_BUCKET_MARK_GC); } @@ -266,11 +266,12 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, { BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE| BCH_BUCKET_MARK_GC); } @@ -1077,11 +1078,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, fs_usage, 0, + 0, 0, fs_usage, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); @@ -1175,12 +1176,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, fs_usage, 0, + 0, 0, fs_usage, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT| 
BCH_BUCKET_MARK_GC); @@ -2003,11 +2004,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - 0, fs_usage, 0, + 0, 0, fs_usage, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT|| BCH_BUCKET_MARK_GC); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 4f12108b..906e4999 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -400,8 +400,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->iter->level); BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !bch2_extent_is_atomic(i->k, i->iter)); - + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && !(trans->flags & BTREE_INSERT_ATOMIC)); } @@ -522,7 +521,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans, { return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES); + i->iter->btree_id == BTREE_ID_INODES || + i->iter->btree_id == BTREE_ID_REFLINK); } static inline bool update_has_triggers(struct btree_trans *trans, @@ -923,8 +923,6 @@ out_noupdates: bch2_trans_unlink_iters(trans, ~trans->iters_touched| trans->iters_unlink_on_commit); trans->iters_touched = 0; - } else { - bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); } trans->nr_updates = 0; trans->mem_top = 0; @@ -1033,7 +1031,10 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - bch2_extent_trim_atomic(&delete, iter); + + ret = bch2_extent_trim_atomic(&delete, iter); + if (ret) + break; } bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 81c3c313..d6dcbf91 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -405,7 +405,8 @@ int bch2_fs_usage_apply(struct bch_fs *c, */ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased without a reservation")) { + "disk usage increased by %lli without a reservation", + should_not_have_added)) { atomic64_sub(should_not_have_added, &c->sectors_available); added -= should_not_have_added; ret = -1; @@ -810,23 +811,24 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, } static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - s64 delta) + unsigned offset, s64 delta, + unsigned flags) { - if (delta > 0) { - /* - * marking a new extent, which _will have size_ @delta - * - * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE - * case, we haven't actually created the key we'll be inserting - * yet (for the split) - so we don't want to be using - * k->size/crc.live_size here: - */ - return __ptr_disk_sectors(p, delta); - } else { - BUG_ON(-delta > p.crc.live_size); + if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { + BUG_ON(offset + -delta > p.crc.live_size); - return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) - - (s64) ptr_disk_sectors(p); + return -((s64) ptr_disk_sectors(p)) + + __ptr_disk_sectors(p, offset) + + __ptr_disk_sectors(p, p.crc.live_size - + offset + delta); + } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { + BUG_ON(offset + -delta > p.crc.live_size); + + return -((s64) ptr_disk_sectors(p)) + + __ptr_disk_sectors(p, p.crc.live_size + + delta); + } else { + return ptr_disk_sectors(p); } } @@ -970,7 +972,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); - return -1; + return -EIO; } BUG_ON(m->r.e.data_type != data_type); @@ -1005,7 +1007,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, } static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type, + unsigned offset, s64 sectors, + enum bch_data_type data_type, struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { @@ -1026,7 +1029,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? sectors - : ptr_disk_sectors_delta(p, sectors); + : ptr_disk_sectors_delta(p, offset, sectors, flags); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, fs_usage, journal_seq, flags); @@ -1115,7 +1118,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, } int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, s64 sectors, + struct bkey_s_c k, + unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -1136,11 +1140,12 @@ int bch2_mark_key_locked(struct bch_fs *c, ? 
c->opts.btree_node_size : -c->opts.btree_node_size; - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE, + ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + case KEY_TYPE_reflink_v: + ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: @@ -1171,14 +1176,14 @@ int bch2_mark_key_locked(struct bch_fs *c, } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, + unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, sectors, + ret = bch2_mark_key_locked(c, k, offset, sectors, fs_usage, journal_seq, flags); percpu_up_read(&c->mark_lock); @@ -1194,8 +1199,11 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree *b = iter->l[0].b; + unsigned offset = 0; s64 sectors = 0; + flags |= BCH_BUCKET_MARK_OVERWRITE; + if (btree_node_is_extents(b) ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) @@ -1204,35 +1212,33 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: + offset = 0; sectors = -((s64) old.k->size); break; case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); sectors = bkey_start_offset(&new->k) - old.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; sectors = bkey_start_offset(old.k) - new->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = old.k->p.offset - new->k.p.offset; - BUG_ON(sectors <= 0); - - bch2_mark_key_locked(c, old, sectors, - fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); - - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); + flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); } - return bch2_mark_key_locked(c, old, sectors, fs_usage, - trans->journal_res.seq, - BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1; + return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; } int bch2_mark_update(struct btree_trans *trans, @@ -1252,8 +1258,7 @@ int bch2_mark_update(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), + 0, insert->k->k.size, fs_usage, trans->journal_res.seq, BCH_BUCKET_MARK_INSERT|flags); @@ -1300,7 +1305,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, xchg(&warned_disk_usage, 1)) return; - pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + bch_err(c, "disk usage increased more than %llu sectors reserved", + disk_res_sectors); trans_for_each_update_iter(trans, i) { struct btree_iter *iter = i->iter; @@ -1315,7 +1321,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, node_iter = iter->l[0].iter; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; @@ -1341,15 +1347,20 @@ static int trans_get_key(struct btree_trans *trans, struct btree_iter **iter, struct bkey_s_c *k) { - unsigned 
i; + struct btree_insert_entry *i; int ret; - for (i = 0; i < trans->nr_updates; i++) - if (!trans->updates[i].deferred && - trans->updates[i].iter->btree_id == btree_id && - !bkey_cmp(pos, trans->updates[i].iter->pos)) { - *iter = trans->updates[i].iter; - *k = bkey_i_to_s_c(trans->updates[i].k); + for (i = trans->updates; + i < trans->updates + trans->nr_updates; + i++) + if (!i->deferred && + i->iter->btree_id == btree_id && + (btree_node_type_is_extents(btree_id) + ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && + bkey_cmp(pos, i->k->k.p) < 0 + : !bkey_cmp(pos, i->iter->pos))) { + *iter = i->iter; + *k = bkey_i_to_s_c(i->k); return 0; } @@ -1358,6 +1369,8 @@ static int trans_get_key(struct btree_trans *trans, if (IS_ERR(*iter)) return PTR_ERR(*iter); + bch2_trans_iter_free_on_commit(trans, *iter); + *k = bch2_btree_iter_peek_slot(*iter); ret = bkey_err(*k); if (ret) @@ -1460,6 +1473,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, s64 sectors, enum bch_data_type data_type) { + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; @@ -1476,10 +1490,10 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, return ret; if (k.k->type != KEY_TYPE_stripe) { - bch_err_ratelimited(trans->c, - "pointer to nonexistent stripe %llu", - (u64) p.idx); - ret = -1; + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -EIO; goto out; } @@ -1511,8 +1525,9 @@ out: } static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type) + struct bkey_s_c k, unsigned offset, + s64 sectors, unsigned flags, + enum bch_data_type data_type) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1532,7 +1547,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? sectors - : ptr_disk_sectors_delta(p, sectors); + : ptr_disk_sectors_delta(p, offset, sectors, flags); ret = bch2_trans_mark_pointer(trans, p, disk_sectors, data_type); @@ -1566,8 +1581,86 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, +static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_REFLINK, + POS(0, idx), &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_v) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) + goto out; + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); + if (ret) + goto err; + + bkey_reassemble(new_k, k); + r_v = bkey_i_to_reflink_v(new_k); + + le64_add_cpu(&r_v->v.refcount, + !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 
1 : -1); + + if (!r_v->v.refcount) { + r_v->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&r_v->k, 0); + } +out: + ret = k.k->p.offset - idx; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, unsigned offset, s64 sectors, unsigned flags) +{ + u64 idx = le64_to_cpu(p.v->idx) + offset; + s64 ret = 0; + + sectors = abs(sectors); + BUG_ON(offset + sectors > p.k->size); + + while (sectors) { + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + if (ret < 0) + break; + + idx += ret; + sectors = max_t(s64, 0LL, sectors - ret); + ret = 0; + } + + return ret; +} + +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + unsigned offset, s64 sectors, unsigned flags) { struct replicas_delta_list *d; struct bch_fs *c = trans->c; @@ -1578,11 +1671,12 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ? c->opts.btree_node_size : -c->opts.btree_node_size; - return bch2_trans_mark_extent(trans, k, sectors, - BCH_DATA_BTREE); + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_BTREE); case KEY_TYPE_extent: - return bch2_trans_mark_extent(trans, k, sectors, - BCH_DATA_USER); + case KEY_TYPE_reflink_v: + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_USER); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); @@ -1604,6 +1698,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d->fs_usage.persistent_reserved[replicas - 1] += sectors; return 0; } + case KEY_TYPE_reflink_p: + return bch2_trans_mark_reflink_p(trans, + bkey_s_c_to_reflink_p(k), + offset, sectors, flags); default: return 0; } @@ -1621,11 +1719,8 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(insert), - bpos_min(insert->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k), - BCH_BUCKET_MARK_INSERT); + ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), + 0, insert->k.size, BCH_BUCKET_MARK_INSERT); if (ret) return ret; @@ -1633,7 +1728,9 @@ int bch2_trans_mark_update(struct btree_trans *trans, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; + unsigned offset = 0; s64 sectors = 0; + unsigned flags = BCH_BUCKET_MARK_OVERWRITE; k = bkey_disassemble(b, _k, &unpacked); @@ -1645,35 +1742,32 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&insert->k, k.k)) { case BCH_EXTENT_OVERLAP_ALL: + offset = 0; sectors = -((s64) k.k->size); break; case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); sectors = bkey_start_offset(&insert->k) - k.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; sectors = bkey_start_offset(k.k) - insert->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = k.k->p.offset - insert->k.p.offset; - BUG_ON(sectors <= 0); - - ret = bch2_trans_mark_key(trans, k, sectors, - BCH_BUCKET_MARK_INSERT); - if (ret) - return ret; - - sectors = bkey_start_offset(&insert->k) - - k.k->p.offset; + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + sectors = -((s64) insert->k.size); + flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); } - ret = bch2_trans_mark_key(trans, k, sectors, - BCH_BUCKET_MARK_OVERWRITE); + ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); if (ret) return ret; diff 
--git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 5ab6f3d3..799bfb3c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -251,14 +251,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_INSERT (1 << 0) #define BCH_BUCKET_MARK_OVERWRITE (1 << 1) -#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2) -#define BCH_BUCKET_MARK_GC (1 << 3) -#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 5) +#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) +#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) +#define BCH_BUCKET_MARK_GC (1 << 4) +#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) -int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64, +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, unsigned); @@ -272,7 +273,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, void bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 848f5dcb..bdb18c2a 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -162,19 +162,20 @@ static int extent_matches_stripe(struct bch_fs *c, struct bch_stripe *v, struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - int idx; - if (!bkey_extent_is_data(k.k)) - return -1; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + int idx; - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr(e, ptr) { - idx = ptr_matches_stripe(c, v, ptr); - if (idx >= 0) - return idx; + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + break; + } } return -1; @@ -182,19 +183,20 @@ static int extent_matches_stripe(struct bch_fs *c, static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) { - struct bkey_s_c_extent e; - const union bch_extent_entry *entry; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; - if (!bkey_extent_is_data(k.k)) - return false; + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; - e = bkey_s_c_to_extent(k); - - extent_for_each_entry(e, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; + break; + } + } return false; } @@ -1310,7 +1312,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) break; } - bch2_mark_key(c, k, 0, NULL, 0, + bch2_mark_key(c, k, 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); } diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index e286048b..5c6bae55 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -250,6 
+250,33 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + /* extent specific utility code */ const struct bch_extent_ptr * @@ -280,20 +307,6 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group return NULL; } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return ptr; - - return NULL; -} - unsigned bch2_extent_is_compressed(struct bkey_s_c k) { unsigned ret = 0; @@ -314,16 +327,17 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) return ret; } -bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_extent_ptr m, u64 offset) +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev == m.dev && p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == (s64) m.offset - offset) return true; @@ -390,16 +404,17 @@ static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, bch2_csum_type_is_encryption(n.csum_type); } -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; - extent_for_each_crc(e, crc, i) + bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; @@ -415,9 +430,9 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; @@ -425,7 +440,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { - extent_for_each_crc(extent_i_to_s(e), u, i) + bkey_for_each_crc(&k->k, ptrs, u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { @@ -437,15 +452,15 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, found: BUG_ON(n.compression_type); 
BUG_ON(n.offset); - BUG_ON(n.live_size != e->k.size); + BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: - extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; - bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(k, &p); ret = true; goto restart_narrow_pointers; } @@ -708,44 +723,48 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, /* Extents */ -bool __bch2_cut_front(struct bpos where, struct bkey_s k) +void __bch2_cut_front(struct bpos where, struct bkey_s k) { - u64 len = 0; + u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; + return; EBUG_ON(bkey_cmp(where, k.k->p) > 0); - len = k.k->p.offset - where.offset; + sub = where.offset - bkey_start_offset(k.k); - BUG_ON(len > k.k->size); + k.k->size -= sub; - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) + if (!k.k->size) k.k->type = KEY_TYPE_deleted; - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); + + switch (k.k->type) { + case KEY_TYPE_deleted: + case KEY_TYPE_discard: + case KEY_TYPE_error: + case KEY_TYPE_cookie: + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; - extent_for_each_entry(e, entry) { + bkey_extent_entry_for_each(ptrs, entry) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: if (!seen_crc) - entry->ptr.offset += e.k->size - len; + entry->ptr.offset += sub; break; case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += e.k->size - len; + entry->crc32.offset += sub; break; case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += e.k->size - len; + entry->crc64.offset += sub; break; case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += e.k->size - len; + entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: break; @@ -754,11 +773,20 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k) if (extent_entry_is_crc(entry)) seen_crc = true; } + + break; } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - k.k->size = len; - - return true; + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_reservation: + break; + default: + BUG(); + } } bool bch2_cut_back(struct bpos where, struct bkey *k) @@ -772,8 +800,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) len = where.offset - bkey_start_offset(k); - BUG_ON(len > k->size); - k->p = where; k->size = len; @@ -897,6 +923,16 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + node_iter = l->iter; + while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) + l->iter = node_iter; + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); bch2_bset_insert(l->b, &l->iter, k, insert, 0); @@ -921,47 +957,131 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return ret; } -static inline struct bpos 
-bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter) +static int __bch2_extent_atomic_end(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters) +{ + int ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + return 0; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = end->offset - bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1; + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } + } + + return ret; +} + +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) { struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; - unsigned nr_alloc_ptrs = + unsigned nr_iters = bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); + int ret = 0; BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + *end = bpos_min(insert->k.p, b->key.k.p); + + ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert), + 0, end, &nr_iters, 10); + if (ret) + return ret; + + while (nr_iters < 20 && + (_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; - if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) break; - nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k); + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); - if (nr_alloc_ptrs > 20) { - BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0); - return bpos_min(insert->k.p, k.k->p); - } + ret = __bch2_extent_atomic_end(trans, k, offset, + end, &nr_iters, 20); + if (ret) + return ret; + + if (nr_iters >= 20) + break; bch2_btree_node_iter_advance(&node_iter, b); } - return bpos_min(insert->k.p, b->key.k.p); + return 0; } -void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { - bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k); + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, &k->k); + return 0; } -bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) { - return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p); + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter->trans, iter, k, &end); + if (ret) + return ret; + + return !bkey_cmp(end, k->k.p); } enum btree_insert_ret @@ -1185,19 +1305,6 @@ next: overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - - /* - * may have skipped past some deleted 
extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - { - struct btree_node_iter node_iter = l->iter; - - while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0) - l->iter = node_iter; - } } /** @@ -1394,9 +1501,12 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, #undef set_common_fields } -static void bch2_extent_crc_init(union bch_extent_crc *crc, - struct bch_extent_crc_unpacked new) +static void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + if (bch_crc_bytes[new.csum_type] <= 4 && new.uncompressed_size - 1 <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) @@ -1413,54 +1523,53 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc, BUG(); bch2_extent_crc_pack(crc, new); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -void bch2_extent_crc_append(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked new) -{ - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); -} - -static inline void __extent_entry_insert(struct bkey_i_extent *e, +static inline void __extent_entry_insert(struct bkey_i *k, union bch_extent_entry *dst, union bch_extent_entry *new) { - union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), dst, (u64 *) end - (u64 *) dst); - e->k.u64s += extent_entry_u64s(new); + k->k.u64s += extent_entry_u64s(new); memcpy(dst, new, extent_entry_bytes(new)); } -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, +void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { - struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = e->v.start; + pos = ptrs.start; goto found; } - extent_for_each_crc(extent_i_to_s(e), crc, pos) + bkey_for_each_crc(&k->k, ptrs, crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = extent_entry_next(pos); goto found; } - bch2_extent_crc_append(e, p->crc); - pos = extent_entry_last(extent_i_to_s(e)); + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ptr)); + __extent_entry_insert(k, pos, to_entry(&p->ptr)); for (i = 0; i < p->ec_nr; i++) { p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + __extent_entry_insert(k, pos, to_entry(&p->ec[i])); } } @@ -1487,17 +1596,17 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return false; } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - 
nr_desired_replicas; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && @@ -1508,7 +1617,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index fe927373..6fddbace 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -12,7 +12,8 @@ struct btree_insert_entry; /* extent entries: */ -#define extent_entry_last(_e) bkey_val_end(_e) +#define extent_entry_last(_e) \ + ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) #define entry_to_ptr(_entry) \ ({ \ @@ -258,6 +259,27 @@ out: \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) +#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +({ \ + __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_iter)); \ + break; \ + } \ + \ + (_iter) < (_end); \ +}) + +#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ + (_iter) = (_start); \ + bkey_crc_next(_k, _start, _end, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define bkey_for_each_crc(_k, _p, _crc, _iter) \ + __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) + /* utility code common to all keys with pointers: */ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) @@ -267,7 +289,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), - to_entry(bkey_val_end(e)) + to_entry(extent_entry_last(e)) }; } case KEY_TYPE_extent: { @@ -284,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) to_entry(&s.v->ptrs[s.v->nr_blocks]), }; } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -337,18 +367,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); @@ -359,6 +377,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); @@ -410,8 +433,10 @@ enum merge_result bch2_reservation_merge(struct bch_fs 
*, .key_merge = bch2_reservation_merge, \ } -void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, @@ -419,52 +444,46 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, void bch2_insert_fixup_extent(struct btree_trans *, struct btree_insert_entry *); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); unsigned bch2_extent_is_compressed(struct bkey_s_c); -bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, - struct bch_extent_ptr, u64); +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); static inline bool bkey_extent_is_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; } } +/* + * Should extent be counted under inode->i_sectors? + */ static inline bool bkey_extent_is_allocation(const struct bkey *k) { switch (k->type) { case KEY_TYPE_extent: case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; } } -static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) -{ - return bkey_extent_is_allocation(k.k) && - !bch2_extent_is_compressed(k); -} - -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -void bch2_bkey_drop_device(struct bkey_s, unsigned); - /* Extent entry iteration: */ #define extent_for_each_entry_from(_e, _entry, _start) \ @@ -480,45 +499,16 @@ void bch2_bkey_drop_device(struct bkey_s, unsigned); #define extent_for_each_ptr(_e, _ptr) \ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -#define extent_crc_next(_e, _crc, _iter) \ -({ \ - extent_for_each_entry_from(_e, _iter, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ - break; \ - } \ - \ - (_iter) < extent_entry_last(_e); \ -}) - -#define extent_for_each_crc(_e, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_iter) = (_e).v->start; \ - extent_crc_next(_e, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) - #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ extent_entry_last(_e), _ptr, _entry) -void bch2_extent_crc_append(struct bkey_i_extent *, - struct bch_extent_crc_unpacked); -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, +void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -static inline void __extent_entry_push(struct bkey_i_extent *e) -{ - union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); - - EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) 
> - BKEY_EXTENT_VAL_U64s_MAX); - - e->k.u64s += extent_entry_u64s(entry); -} - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, struct bch_extent_crc_unpacked); -bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -540,11 +530,11 @@ do { \ } \ } while (0) -bool __bch2_cut_front(struct bpos, struct bkey_s); +void __bch2_cut_front(struct bpos, struct bkey_s); -static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k) +static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { - return __bch2_cut_front(where, bkey_i_to_s(k)); + __bch2_cut_front(where, bkey_i_to_s(k)); } bool bch2_cut_back(struct bpos, struct bkey *); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 5d0c2b69..d8113b29 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -16,6 +16,7 @@ #include "io.h" #include "keylist.h" #include "quota.h" +#include "reflink.h" #include #include @@ -193,9 +194,9 @@ static int inode_set_size(struct bch_inode_info *inode, return 0; } -static int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { struct inode_new_size s = { .new_size = new_size, @@ -277,16 +278,16 @@ static int sum_sector_overwrites(struct btree_trans *trans, return 0; } -static int bch2_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) +int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) { struct bch_fs *c = trans->c; struct btree_iter *inode_iter = NULL; @@ -298,13 +299,13 @@ static int bch2_extent_update(struct btree_trans *trans, s64 i_sectors_delta; int ret; - bch2_trans_begin_updates(trans); - ret = bch2_btree_iter_traverse(extent_iter); if (ret) return ret; - bch2_extent_trim_atomic(k, extent_iter); + ret = bch2_extent_trim_atomic(k, extent_iter); + if (ret) + return ret; ret = sum_sector_overwrites(trans, extent_iter, k, &allocating, @@ -448,6 +449,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) bkey_copy(&tmp.k, bch2_keylist_front(keys)); + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &wop->res, quota_res, iter, &tmp.k, @@ -511,13 +514,14 @@ struct bch_page_sector { /* i_sectors: */ enum { SECTOR_UNALLOCATED, - SECTOR_QUOTA_RESERVED, + SECTOR_RESERVED, SECTOR_DIRTY, SECTOR_ALLOCATED, } state:2; }; struct bch_page_state { + atomic_t write_count; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -588,31 +592,6 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } -static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page) -{ - struct bch_page_state *s = bch2_page_state(page); - struct disk_reservation 
disk_res = { 0 }; - struct quota_res quota_res = { 0 }; - unsigned i; - - if (!s) - return; - - for (i = 0; i < ARRAY_SIZE(s->s); i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - if (s->s[i].state == SECTOR_QUOTA_RESERVED) { - quota_res.sectors++; - s->s[i].state = SECTOR_UNALLOCATED; - } - } - - bch2_quota_reservation_put(c, inode, &quota_res); - bch2_disk_reservation_put(c, &disk_res); -} - static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -663,98 +642,134 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c, return 0; } -static int bch2_get_page_quota_reservation(struct bch_fs *c, +struct bch2_page_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static void bch2_page_reservation_init(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) + struct bch2_page_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +static void bch2_page_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_page_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +static int bch2_page_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, + unsigned offset, unsigned len, bool check_enospc) { struct bch_page_state *s = bch2_page_state_create(page, 0); - struct quota_res quota_res = { 0 }; - unsigned i, quota_res_sectors = 0; + unsigned i, disk_sectors = 0, quota_sectors = 0; int ret; if (!s) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(s->s); i++) - quota_res_sectors += s->s[i].state == SECTOR_UNALLOCATED; + for (i = offset / 512; + i < DIV_ROUND_UP(offset + len, 512); + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; + } - if (!quota_res_sectors) - return 0; + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, + disk_sectors, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + } - ret = bch2_quota_reservation_add(c, inode, &quota_res, - quota_res_sectors, - check_enospc); - if (unlikely(ret)) - return ret; + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, + check_enospc); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; - for (i = 0; i < ARRAY_SIZE(s->s); i++) - if (s->s[i].state == SECTOR_UNALLOCATED) - s->s[i].state = SECTOR_QUOTA_RESERVED; + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } return 0; } -static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) -{ - return bch2_get_page_disk_reservation(c, inode, page, check_enospc) ?: - bch2_get_page_quota_reservation(c, inode, page, check_enospc); -} - static void bch2_clear_page_bits(struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state *s = bch2_page_state(page); + struct disk_reservation disk_res = { 0 }; int i, dirty_sectors = 0; if (!s) return; for (i = 0; i < ARRAY_SIZE(s->s); i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + if (s->s[i].state == SECTOR_DIRTY) { dirty_sectors++; s->s[i].state = SECTOR_UNALLOCATED; } } + bch2_disk_reservation_put(c, &disk_res); + if (dirty_sectors) i_sectors_acct(c, inode, NULL, -dirty_sectors); - bch2_put_page_reservation(c, inode, page); bch2_page_state_release(page); } -static void __bch2_set_page_dirty(struct page *page) +static void bch2_set_page_dirty(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, + unsigned offset, unsigned len) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state *s = bch2_page_state(page); - struct quota_res quota_res = { 0 }; unsigned i, dirty_sectors = 0; - BUG_ON(!s); + for (i = offset / 512; + i < DIV_ROUND_UP(offset + len, 512); + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); - for (i = 0; i < ARRAY_SIZE(s->s); i++) { - if (s->s[i].state == SECTOR_QUOTA_RESERVED) - quota_res.sectors++; + BUG_ON(sectors > res->disk.sectors); + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED || - s->s[i].state == SECTOR_QUOTA_RESERVED) { - s->s[i].state = SECTOR_DIRTY; + if (s->s[i].state == SECTOR_UNALLOCATED) dirty_sectors++; - } + + s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); } if (dirty_sectors) - i_sectors_acct(c, inode, &quota_res, dirty_sectors); - bch2_quota_reservation_put(c, inode, &quota_res); -} + i_sectors_acct(c, inode, &res->quota, dirty_sectors); -static void bch2_set_page_dirty(struct page *page) -{ - __bch2_set_page_dirty(page); - __set_page_dirty_nobuffers(page); + if (!PageDirty(page)) + __set_page_dirty_nobuffers(page); } vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) @@ -764,8 +779,11 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation res; int ret = VM_FAULT_LOCKED; + bch2_page_reservation_init(c, inode, &res); + sb_start_pagefault(inode->v.i_sb); file_update_time(file); @@ -786,19 +804,22 @@ vm_fault_t
bch2_page_mkwrite(struct vm_fault *vmf) goto out; } - if (bch2_get_page_reservation(c, inode, page, true)) { + if (bch2_page_reservation_get(c, inode, page, &res, + 0, PAGE_SIZE, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; } - if (!PageDirty(page)) - bch2_set_page_dirty(page); + bch2_set_page_dirty(c, inode, page, &res, 0, PAGE_SIZE); wait_for_stable_page(page); out: if (current->pagecache_lock != &mapping->add_lock) pagecache_add_put(&mapping->add_lock); sb_end_pagefault(inode->v.i_sb); + + bch2_page_reservation_put(c, inode, &res); + return ret; } @@ -857,31 +878,6 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, } #endif -/* readpages/writepages: */ - -static bool bio_can_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; - - return bio->bi_vcnt < bio->bi_max_vecs && - bio_end_sector(bio) == offset; -} - -static int bio_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; - - EBUG_ON(!bio->bi_max_vecs); - - if (!bio->bi_vcnt) - bio->bi_iter.bi_sector = offset; - else if (!bio_can_add_page_contig(bio, page)) - return -1; - - BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); - return 0; -} - /* readpage(s): */ static void bch2_readpages_end_io(struct bio *bio) @@ -991,11 +987,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); - - BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k)); - BUG_ON(bio_end_sector(bio) > k.k->p.offset); - + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_allocated(k); + unsigned state = k.k->type == KEY_TYPE_reservation + ? 
SECTOR_RESERVED + : SECTOR_ALLOCATED; bio_for_each_segment(bv, bio, iter) { struct bch_page_state *s = bch2_page_state(bv.bv_page); @@ -1005,16 +1001,17 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) i < (bv.bv_offset + bv.bv_len) >> 9; i++) { s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = SECTOR_ALLOCATED; + s->s[i].state = state; } } } static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, u64 offset, + struct bio *bio, + unsigned sectors_this_extent, bool get_more) { - while (bio_end_sector(bio) < offset && + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; struct page *page = readpage_iter_next(iter); @@ -1062,71 +1059,82 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct bio *bio = &rbio->bio; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + int ret = 0; rbio->c = c; rbio->start_time = local_clock(); - +retry: while (1) { BKEY_PADDED(k) tmp; struct bkey_s_c k; - unsigned bytes; + unsigned bytes, sectors, offset_into_extent; - bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); + bch2_btree_iter_set_pos(iter, + POS(inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); - BUG_ON(!k.k); - - if (IS_ERR(k.k)) { - int ret = btree_iter_err(iter); - BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); - return; - } + ret = bkey_err(k); + if (ret) + break; bkey_reassemble(&tmp.k, k); - bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(trans); + if (readpages_iter) { bool want_full_extent = false; if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *i; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, i) + bkey_for_each_ptr_decode(k.k, ptrs, p, i) want_full_extent |= ((p.crc.csum_type != 0) | (p.crc.compression_type != 0)); } - readpage_bio_extend(readpages_iter, - bio, k.k->p.offset, - want_full_extent); + readpage_bio_extend(readpages_iter, &rbio->bio, + sectors, want_full_extent); } - bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - - bio->bi_iter.bi_sector) << 9; - swap(bio->bi_iter.bi_size, bytes); + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); - if (bytes == bio->bi_iter.bi_size) + if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(bio, k); + bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(c, rbio, k, flags); + bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) return; - swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, bytes); + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } + + if (ret == -EINTR) + goto retry; + + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); } int bch2_readpages(struct file *file, struct address_space *mapping, @@ -1191,7 +1199,9 @@ static void __bchfs_readpage(struct 
bch_fs *c, struct bch_read_bio *rbio, bch2_page_state_create(page, __GFP_NOFAIL); bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); - bio_add_page_contig(&rbio->bio, page); + rbio->bio.bi_iter.bi_sector = + (sector_t) page->index << PAGE_SECTOR_SHIFT; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, @@ -1277,12 +1287,20 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio *bio = &io->op.op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i; + unsigned i, j; if (io->op.op.error) { bio_for_each_segment_all(bvec, bio, i, iter) { + struct bch_page_state *s; + SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); + + lock_page(bvec->bv_page); + s = bch2_page_state(bvec->bv_page); + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + unlock_page(bvec->bv_page); } } @@ -1307,8 +1325,12 @@ static void bch2_writepage_io_done(struct closure *cl) i_sectors_acct(c, io->op.inode, NULL, io->op.sectors_added - (s64) io->new_sectors); - bio_for_each_segment_all(bvec, bio, i, iter) - end_page_writeback(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter) { + struct bch_page_state *s = __bch2_page_state(bvec->bv_page); + + if (atomic_dec_and_test(&s->write_count)) + end_page_writeback(bvec->bv_page); + } closure_return_with_destructor(&io->cl, bch2_writepage_io_free); } @@ -1329,11 +1351,10 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) static void bch2_writepage_io_alloc(struct bch_fs *c, struct bch_writepage_state *w, struct bch_inode_info *inode, - struct page *page, + u64 sector, unsigned nr_replicas) { struct bch_write_op *op; - u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT; w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, @@ -1347,8 +1368,8 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->pos = POS(inode->v.i_ino, offset); - op->wbio.bio.bi_iter.bi_sector = offset; + op->pos = POS(inode->v.i_ino, sector); + op->wbio.bio.bi_iter.bi_sector = sector; } static int __bch2_writepage(struct page *page, @@ -1358,12 +1379,10 @@ static int __bch2_writepage(struct page *page, struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state *s; - unsigned offset, nr_replicas_this_write = U32_MAX; - unsigned dirty_sectors = 0, reserved_sectors = 0; + struct bch_page_state *s, orig; + unsigned i, offset, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned i; int ret; EBUG_ON(!PageUptodate(page)); @@ -1398,48 +1417,90 @@ do_io: return 0; } - for (i = 0; i < PAGE_SECTORS; i++) + /* Before unlocking the page, get copy of reservations: */ + orig = *s; + + for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state < SECTOR_DIRTY) + continue; + nr_replicas_this_write = min_t(unsigned, nr_replicas_this_write, s->s[i].nr_replicas + s->s[i].replicas_reserved); - - /* Before unlocking the page, transfer reservation to w->io: */ + } for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state < SECTOR_DIRTY) + continue; + s->s[i].nr_replicas = w->opts.compression ? 
0 : nr_replicas_this_write; - reserved_sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - - dirty_sectors += s->s[i].state == SECTOR_DIRTY; s->s[i].state = SECTOR_ALLOCATED; } + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + BUG_ON(PageWriteback(page)); set_page_writeback(page); + unlock_page(page); - if (w->io && - (w->io->op.op.res.nr_replicas != nr_replicas_this_write || - !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) - bch2_writepage_do_io(w); + offset = 0; + while (1) { + unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; - if (!w->io) - bch2_writepage_io_alloc(c, w, inode, page, - nr_replicas_this_write); + while (offset < PAGE_SECTORS && + orig.s[offset].state < SECTOR_DIRTY) + offset++; - w->io->new_sectors += dirty_sectors; + if (offset == PAGE_SECTORS) + break; - BUG_ON(inode != w->io->op.inode); - BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); + sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; - w->io->op.op.res.sectors += reserved_sectors; - w->io->op.new_i_size = i_size; + while (offset + sectors < PAGE_SECTORS && + orig.s[offset + sectors].state >= SECTOR_DIRTY) + sectors++; - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + for (i = offset; i < offset + sectors; i++) { + reserved_sectors += orig.s[i].replicas_reserved; + dirty_sectors += orig.s[i].state == SECTOR_DIRTY; + } + + if (w->io && + (w->io->op.op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.op.wbio.bio) || + bio_end_sector(&w->io->op.op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, w, inode, sector, + nr_replicas_this_write); + + w->io->new_sectors += dirty_sectors; + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->op.inode); + BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, + sectors << 9, offset << 9)); + + w->io->op.op.res.sectors += reserved_sectors; + w->io->op.new_i_size = i_size; + + if (wbc->sync_mode == WB_SYNC_ALL) + w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + end_page_writeback(page); return 0; } @@ -1482,12 +1543,18 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation *res; pgoff_t index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); struct page *page; int ret = -ENOMEM; - BUG_ON(inode_unhashed(&inode->v)); + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_page_reservation_init(c, inode, res); + *fsdata = res; /* Not strictly necessary - same reason as mkwrite(): */ pagecache_add_get(&mapping->add_lock); @@ -1519,7 +1586,8 @@ readpage: if (ret) goto err; out: - ret = bch2_get_page_reservation(c, inode, page, true); + ret = bch2_page_reservation_get(c, inode, page, res, + offset, len, true); if (ret) { if (!PageUptodate(page)) { /* @@ -1542,6 +1610,8 @@ err: *pagep = NULL; err_unlock: pagecache_add_put(&mapping->add_lock); + kfree(res); + *fsdata = NULL; return ret; } @@ -1551,6 +1621,8 @@ int bch2_write_end(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation *res = fsdata; + unsigned offset = pos & (PAGE_SIZE - 1); lockdep_assert_held(&inode->v.i_rwsem); @@ -1573,18 +1645,19 @@ int 
bch2_write_end(struct file *file, struct address_space *mapping, if (copied) { if (!PageUptodate(page)) SetPageUptodate(page); - if (!PageDirty(page)) - bch2_set_page_dirty(page); + + bch2_set_page_dirty(c, inode, page, res, offset, copied); inode->ei_last_dirtied = (unsigned long) current; - } else { - bch2_put_page_reservation(c, inode, page); } unlock_page(page); put_page(page); pagecache_add_put(&mapping->add_lock); + bch2_page_reservation_put(c, inode, res); + kfree(res); + return copied; } @@ -1597,15 +1670,19 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct page *pages[WRITE_BATCH_PAGES]; + struct bch2_page_reservation res; unsigned long index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - unsigned i, copied = 0, nr_pages_copied = 0; + unsigned i, reserved = 0, set_dirty = 0; + unsigned copied = 0, nr_pages_copied = 0; int ret = 0; BUG_ON(!len); BUG_ON(nr_pages > ARRAY_SIZE(pages)); + bch2_page_reservation_init(c, inode, &res); + for (i = 0; i < nr_pages; i++) { pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); if (!pages[i]) { @@ -1632,19 +1709,25 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } } - for (i = 0; i < nr_pages; i++) { - ret = bch2_get_page_reservation(c, inode, pages[i], true); + while (reserved < len) { + struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, len - reserved, + PAGE_SIZE - pg_offset); +retry_reservation: + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); - if (ret && !PageUptodate(pages[i])) { - ret = bch2_read_single_page(pages[i], mapping); - if (ret) - goto out; - - ret = bch2_get_page_reservation(c, inode, pages[i], true); + if (ret && !PageUptodate(page)) { + ret = bch2_read_single_page(page, mapping); + if (!ret) + goto retry_reservation; } if (ret) goto out; + + reserved += pg_len; } if (mapping_writably_mapped(mapping)) @@ -1654,10 +1737,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, while (copied < len) { struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_bytes = min_t(unsigned, len - copied, - PAGE_SIZE - pg_offset); + unsigned pg_len = min_t(unsigned, len - copied, + PAGE_SIZE - pg_offset); unsigned pg_copied = iov_iter_copy_from_user_atomic(page, - iter, pg_offset, pg_bytes); + iter, pg_offset, pg_len); if (!pg_copied) break; @@ -1687,22 +1770,29 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, copied -= (offset + copied) & (PAGE_SIZE - 1); } } + + while (set_dirty < copied) { + struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, copied - set_dirty, + PAGE_SIZE - pg_offset); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); + unlock_page(page); + put_page(page); + + set_dirty += pg_len; + } out: - for (i = 0; i < nr_pages_copied; i++) { - if (!PageUptodate(pages[i])) - SetPageUptodate(pages[i]); - if (!PageDirty(pages[i])) - bch2_set_page_dirty(pages[i]); + for (i = nr_pages_copied; i < nr_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); } - for (i = nr_pages_copied; i < nr_pages; i++) { - if (!PageDirty(pages[i])) - 
bch2_put_page_reservation(c, inode, pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); - } + bch2_page_reservation_put(c, inode, &res); return copied ?: ret; } @@ -2186,29 +2276,25 @@ out: /* truncate: */ -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset, u64 *journal_seq) +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bch_inode_info *inode, + u64 new_i_size) { - struct bpos start = POS(inode->v.i_ino, start_offset); - struct bpos end = POS(inode->v.i_ino, end_offset); + struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, - BTREE_ITER_INTENT); + int ret = 0, ret2 = 0; while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto btree_err; + bkey_init(&delete.k); delete.k.p = iter->pos; @@ -2216,21 +2302,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - ret = bch2_extent_update(&trans, inode, - &disk_res, NULL, iter, &delete, - 0, true, true, NULL); - bch2_disk_reservation_put(c, &disk_res); + bch2_trans_begin_updates(trans); - if (ret == -EINTR) + ret = bch2_extent_update(trans, inode, + &disk_res, NULL, iter, &delete, + new_i_size, false, true, NULL); + bch2_disk_reservation_put(c, &disk_res); +btree_err: + if (ret == -EINTR) { + ret2 = ret; ret = 0; + } if (ret) break; - - bch2_trans_cond_resched(&trans); } + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); + } + + return ret ?: ret2; +} + +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, start_offset), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, + POS(inode->v.i_ino, end_offset), + inode, 0); + bch2_trans_exit(&trans); + if (ret == -EINTR) + ret = 0; + return ret; } @@ -2263,8 +2379,10 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bch_page_state *s; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned i; struct page *page; int ret = 0; @@ -2296,31 +2414,42 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, } } + s = bch2_page_state_create(page, 0); + if (!s) { + ret = -ENOMEM; + goto unlock; + } + if (!PageUptodate(page)) { ret = bch2_read_single_page(page, mapping); if (ret) goto unlock; } + if (index != start >> PAGE_SHIFT) + start_offset = 0; + if (index != end >> PAGE_SHIFT) + end_offset = PAGE_SIZE; + + for (i = round_up(start_offset, block_bytes(c)) >> 9; + i < round_down(end_offset, block_bytes(c)) >> 9; + i++) { + s->s[i].nr_replicas = 0; + s->s[i].state = SECTOR_UNALLOCATED; + } + + zero_user_segment(page, start_offset, end_offset); + /* * Bit of a hack - we 
don't want truncate to fail due to -ENOSPC. * * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - ret = bch2_get_page_reservation(c, inode, page, false); + ret = bch2_get_page_disk_reservation(c, inode, page, false); BUG_ON(ret); - if (index == start >> PAGE_SHIFT && - index == end >> PAGE_SHIFT) - zero_user_segment(page, start_offset, end_offset); - else if (index == start >> PAGE_SHIFT) - zero_user_segment(page, start_offset, PAGE_SIZE); - else if (index == end >> PAGE_SHIFT) - zero_user_segment(page, 0, end_offset); - - if (!PageDirty(page)) - bch2_set_page_dirty(page); + __set_page_dirty_nobuffers(page); unlock: unlock_page(page); put_page(page); @@ -2331,7 +2460,7 @@ out: static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) { return __bch2_truncate_page(inode, from >> PAGE_SHIFT, - from, from + PAGE_SIZE); + from, round_up(from, PAGE_SIZE)); } static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) @@ -2422,13 +2551,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) truncate_setsize(&inode->v, iattr->ia_size); - /* - * XXX: need a comment explaining why PAGE_SIZE and not block_bytes() - * here: - */ ret = __bch2_fpunch(c, inode, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - U64_MAX, &inode->ei_journal_seq); + round_up(iattr->ia_size, block_bytes(c)) >> 9, + U64_MAX); if (unlikely(ret)) goto err; @@ -2449,8 +2574,8 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; - u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; + u64 discard_start = round_up(offset, block_bytes(c)) >> 9; + u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; int ret = 0; inode_lock(&inode->v); @@ -2475,8 +2600,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, discard_start, discard_end); err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); @@ -2535,7 +2659,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, while (bkey_cmp(dst->pos, POS(inode->v.i_ino, - round_up(new_size, PAGE_SIZE) >> 9)) < 0) { + round_up(new_size, block_bytes(c)) >> 9)) < 0) { struct disk_reservation disk_res; ret = bch2_btree_iter_traverse(dst); @@ -2554,7 +2678,9 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_cut_front(src->pos, &copy.k); copy.k.k.p.offset -= len >> 9; - bch2_extent_trim_atomic(&copy.k, dst); + ret = bch2_extent_trim_atomic(&copy.k, dst); + if (ret) + goto bkey_err; BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k))); @@ -2563,6 +2689,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &disk_res, NULL, dst, &copy.k, @@ -2584,7 +2712,7 @@ bkey_err: ret = __bch2_fpunch(c, inode, round_up(new_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (ret) goto err; @@ -2608,8 +2736,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, struct btree_trans trans; struct btree_iter *iter; struct bpos end_pos; - loff_t block_start, block_end; - loff_t end = offset + len; + loff_t end = offset + len; + loff_t block_start = round_down(offset, block_bytes(c)); + loff_t block_end = round_up(end, block_bytes(c)); unsigned sectors; unsigned replicas = io_opts(c, inode).data_replicas; int ret; @@ -2641,12 +2770,6 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, goto err; truncate_pagecache_range(&inode->v, offset, end - 1); - - block_start = round_up(offset, PAGE_SIZE); - block_end = round_down(end, PAGE_SIZE); - } else { - block_start = round_down(offset, PAGE_SIZE); - block_end = round_up(end, PAGE_SIZE); } iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -2706,6 +2829,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &disk_res, &quota_res, iter, &reservation.k_i, @@ -2770,42 +2895,148 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return -EOPNOTSUPP; } +static void mark_range_unallocated(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SHIFT; + struct pagevec pvec; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct bch_page_state *s; + + lock_page(page); + s = bch2_page_state(page); + + if (s) + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + loff_t ret = 0; + loff_t aligned_len; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + __pagecache_block_get(&src->v.i_mapping->add_lock); + __pagecache_block_get(&dst->v.i_mapping->add_lock); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + aligned_len = round_up(len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + aligned_len); + if (ret) + goto out_unlock; + + mark_range_unallocated(src, pos_src, pos_src + aligned_len); + + ret = bch2_remap_range(c, dst, + POS(dst->v.i_ino, pos_dst >> 9), + POS(src->v.i_ino, pos_src >> 9), + aligned_len >> 9, + pos_dst + len); + if (ret > 0) + ret = min(ret << 9, len); + +out_unlock: + __pagecache_block_put(&dst->v.i_mapping->add_lock); + __pagecache_block_put(&src->v.i_mapping->add_lock); + + bch2_unlock_inodes(INODE_LOCK, src, dst); + + return ret; +} + /* fseek: */ -static bool page_is_data(struct page *page) +static int page_data_offset(struct page *page, unsigned offset) { struct bch_page_state *s = bch2_page_state(page); unsigned i; - if
(!s) - return false; + if (s) + for (i = offset >> 9; i < PAGE_SECTORS; i++) + if (s->s[i].state >= SECTOR_DIRTY) + return i << 9; - for (i = 0; i < PAGE_SECTORS; i++) - if (s->s[i].state >= SECTOR_DIRTY) - return true; - - return false; + return -1; } -static loff_t bch2_next_pagecache_data(struct inode *vinode, +static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; struct page *page; - pgoff_t index; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + loff_t ret; + int offset; - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) { - if (find_get_pages(mapping, &index, 1, &page)) { + while (index <= end_index) { + if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { lock_page(page); - if (page_is_data(page)) - end_offset = - min(end_offset, - max(start_offset, - ((loff_t) index) << PAGE_SHIFT)); + offset = page_data_offset(page, + page->index == start_index + ? start_offset & (PAGE_SIZE - 1) + : 0); + if (offset >= 0) { + ret = clamp(((loff_t) page->index << PAGE_SHIFT) + + offset, + start_offset, end_offset); + unlock_page(page); + put_page(page); + return ret; + } + unlock_page(page); put_page(page); } else { @@ -2848,43 +3079,65 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return ret; if (next_data > offset) - next_data = bch2_next_pagecache_data(&inode->v, + next_data = bch2_seek_pagecache_data(&inode->v, offset, next_data); - if (next_data > isize) + if (next_data >= isize) return -ENXIO; return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) +static int __page_hole_offset(struct page *page, unsigned offset) { + struct bch_page_state *s = bch2_page_state(page); + unsigned i; + + if (!s) + return 0; + + for (i = offset >> 9; i < PAGE_SECTORS; i++) + if (s->s[i].state < SECTOR_DIRTY) + return i << 9; + + return -1; +} + +static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) +{ + pgoff_t index = offset >> PAGE_SHIFT; struct page *page; - bool ret; + int pg_offset; + loff_t ret = -1; page = find_lock_entry(mapping, index); if (!page || xa_is_value(page)) - return false; + return offset; + + pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); + if (pg_offset >= 0) + ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; - ret = page_is_data(page); unlock_page(page); return ret; } -static loff_t bch2_next_pagecache_hole(struct inode *vinode, +static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t start_offset, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; - pgoff_t index; + loff_t offset = start_offset, hole; - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) - if (!page_slot_is_data(mapping, index)) - end_offset = max(start_offset, - ((loff_t) index) << PAGE_SHIFT); + while (offset < end_offset) { + hole = page_hole_offset(mapping, offset); + if (hole >= 0 && hole <= end_offset) + return max(start_offset, hole); + + offset += PAGE_SIZE; + offset &= PAGE_MASK; + } return end_offset; } @@ -2909,11 +3162,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_next_pagecache_hole(&inode->v, + next_hole = bch2_seek_pagecache_hole(&inode->v, 
offset, MAX_LFS_FILESIZE); break; } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_next_pagecache_hole(&inode->v, + next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), k.k->p.offset << 9); diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 2b3ac496..a3573232 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -9,6 +9,22 @@ #include +struct quota_res; + +int bch2_extent_update(struct btree_trans *, + struct bch_inode_info *, + struct disk_reservation *, + struct quota_res *, + struct btree_iter *, + struct bkey_i *, + u64, bool, bool, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_inode_info *, u64); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); @@ -30,6 +46,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + loff_t bch2_llseek(struct file *, loff_t, int); vm_fault_t bch2_page_mkwrite(struct vm_fault *); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index b1d23e3f..a35f34eb 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1068,16 +1068,20 @@ static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) return 0; } -static int bch2_fill_extent(struct fiemap_extent_info *info, - const struct bkey_i *k, unsigned flags) +static int bch2_fill_extent(struct bch_fs *c, + struct fiemap_extent_info *info, + struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(&k->k)) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + if (bkey_extent_is_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; int ret; - extent_for_each_ptr_decode(e, p, entry) { + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; @@ -1086,23 +1090,23 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, else offset += p.crc.offset; - if ((offset & (PAGE_SECTORS - 1)) || - (e.k->size & (PAGE_SECTORS - 1))) + if ((offset & (c->opts.block_size - 1)) || + (k.k->size & (c->opts.block_size - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, + bkey_start_offset(k.k) << 9, offset << 9, - e.k->size << 9, flags|flags2); + k.k->size << 9, flags|flags2); if (ret) return ret; } return 0; - } else if (k->k.type == KEY_TYPE_reservation) { + } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, - bkey_start_offset(&k->k) << 9, - 0, k->k.size << 9, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, flags| FIEMAP_EXTENT_DELALLOC| FIEMAP_EXTENT_UNWRITTEN); @@ -1119,7 +1123,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) tmp; + BKEY_PADDED(k) cur, prev; + unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1128,27 +1133,58 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, 
iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k, ret) + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), + BTREE_ITER_SLOTS); + + while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) { + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bkey_reassemble(&cur.k, k); + k = bkey_i_to_s_c(&cur.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &cur.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + offset_into_extent), + &cur.k); + bch2_key_resize(&cur.k.k, sectors); + cur.k.k.p.offset = iter->pos.offset + cur.k.k.size; + if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(ei->v.i_ino, (start + len) >> 9)) >= 0) - break; - if (have_extent) { - ret = bch2_fill_extent(info, &tmp.k, 0); + ret = bch2_fill_extent(c, info, + bkey_i_to_s_c(&prev.k), 0); if (ret) break; } - bkey_reassemble(&tmp.k, k); + bkey_copy(&prev.k, &cur.k); have_extent = true; } - if (!ret && have_extent) - ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, + iter->pos.offset + sectors)); + } + if (!ret && have_extent) + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), + FIEMAP_EXTENT_LAST); +err: ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; } @@ -1196,6 +1232,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1712,9 +1749,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, goto out; } - /* XXX: blocksize */ - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_blocksize = block_bytes(c); + sb->s_blocksize_bits = ilog2(block_bytes(c)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; sb->s_export_op = &bch_export_ops; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 4d81b6e6..c5d9a0c5 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -259,6 +259,8 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_verify_keylist_sorted(keys); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); +retry: + bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -269,7 +271,9 @@ int bch2_write_index_default(struct bch_write_op *op) bkey_copy(&split.k, bch2_keylist_front(keys)); - bch2_extent_trim_atomic(&split.k, iter); + ret = bch2_extent_trim_atomic(&split.k, iter); + if (ret) + break; bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); @@ -286,6 +290,11 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); + if (ret == -EINTR) { + ret = 0; + goto retry; + } + bch2_trans_exit(&trans); return ret; @@ -426,7 +435,7 @@ static void init_append_extent(struct bch_write_op *op, p.ptr.cached = !ca->mi.durability || (op->flags & BCH_WRITE_CACHED) != 0; p.ptr.offset += ca->mi.bucket_size - ob->sectors_free; - bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(&e->k_i, &p); BUG_ON(crc.compressed_size > ob->sectors_free); 
ob->sectors_free -= crc.compressed_size; @@ -954,17 +963,13 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts opts, unsigned flags) { - if (!bkey_extent_is_data(k.k)) - return false; - if (!(flags & BCH_READ_MAY_PROMOTE)) return false; if (!opts.promote_target) return false; - if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), - opts.promote_target)) + if (bch2_bkey_has_target(c, k, opts.promote_target)) return false; if (bch2_target_congested(c, opts.promote_target)) { @@ -1028,6 +1033,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1084,6 +1090,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct data_opts) { .target = opts.promote_target }, + btree_id, bkey_s_c_null); BUG_ON(ret); @@ -1121,7 +1128,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_REFLINK + : BTREE_ID_EXTENTS, + pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1222,17 +1233,16 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset)) { + if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1255,26 +1265,40 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + + bch2_trans_init(&trans, c, 0, 0); retry: + bch2_trans_begin(&trans); + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes; + unsigned bytes, sectors, offset_into_extent; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + bch2_trans_unlock(&trans); - bytes = min_t(unsigned, bvec_iter.bi_size, - (k.k->p.offset - bvec_iter.bi_sector) << 9); + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, + offset_into_extent, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1355,7 +1379,6 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_i_extent *e; BKEY_PADDED(k) new; struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; @@ -1374,34 +1397,30 @@ retry: if 
(IS_ERR_OR_NULL(k.k)) goto out; - if (!bkey_extent_is_data(k.k)) - goto out; - bkey_reassemble(&new.k, k); - e = bkey_i_to_extent(&new.k); + k = bkey_i_to_s_c(&new.k); - if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, data_offset) || - bversion_cmp(e->k.version, rbio->version)) + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; /* Extent was merged? */ - if (bkey_start_offset(&e->k) < data_offset || - e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(&e->k) - data_offset, e->k.size, + bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); goto out; } - if (!bch2_extent_narrow_crcs(e, new_crc)) + if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) goto out; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k)); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -1412,15 +1431,6 @@ out: bch2_trans_exit(&trans); } -static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_ptr_decoded *pick, - unsigned flags) -{ - return !(flags & BCH_READ_IN_RETRY) && - bkey_extent_is_data(k.k) && - bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); -} - /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -1455,7 +1465,7 @@ static void __bch2_read_endio(struct work_struct *work) goto nodecode; /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc.compression_type != BCH_COMPRESSION_NONE) { @@ -1564,8 +1574,51 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } +int bch2_read_indirect_extent(struct btree_trans *trans, + struct btree_iter *extent_iter, + unsigned *offset_into_extent, + struct bkey_i *orig_k) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + if (orig_k->k.type != KEY_TYPE_reflink_p) + return 0; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + *offset_into_extent; + + iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS, 1); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v) { + __bcache_io_error(trans->c, + "pointer to nonexistent indirect extent"); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + bkey_reassemble(orig_k, k); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, + unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { struct extent_ptr_decoded pick; @@ -1598,7 +1651,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) goto hole; - 
iter.bi_sector = pos.offset; iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } @@ -1607,13 +1659,13 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_MUST_CLONE; - narrow_crcs = should_narrow_crcs(k, &pick, flags); + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || - k.k->p.offset < bvec_iter_end_sector(iter)); + BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (pick.crc.compression_type != BCH_COMPRESSION_NONE || (pick.crc.csum_type != BCH_CSUM_NONE && @@ -1634,15 +1686,17 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || pick.crc.offset || - iter.bi_sector != pos.offset)); + offset_into_extent)); + pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + - (iter.bi_sector - pos.offset); + offset_into_extent; + offset_into_extent = 0; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); - pos.offset = iter.bi_sector; + offset_into_extent = 0; } if (rbio) { @@ -1697,6 +1751,7 @@ noclone: else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; @@ -1815,45 +1870,67 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + + while (1) { BKEY_PADDED(k) tmp; - unsigned bytes; + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, + POS(inode, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + goto err; + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); /* * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, - (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(c, rbio, k, flags); + bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } - - /* - * If we get here, it better have been because there was an 
error - * reading a btree node - */ - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - +out: bch2_trans_exit(&trans); + return; +err: + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); + goto out; } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 1e8470af..7db3bd0e 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -95,9 +95,8 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c, struct bch_io_failures *, unsigned); -void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); +int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *, + unsigned *, struct bkey_i *); enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, @@ -112,14 +111,22 @@ enum bch_read_flags { BCH_READ_IN_RETRY = 1 << 7, }; +int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, + struct bvec_iter, struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, + offset_into_extent, NULL, flags); } +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); + static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) { diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 04f6d9a7..2d397e5e 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -38,6 +38,8 @@ struct bch_read_bio { */ struct bvec_iter bvec_iter; + unsigned offset_into_extent; + u16 flags; union { struct { diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index ad41f5e3..dc3b03d6 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, + enum btree_id btree_id) { struct btree_trans trans; struct btree_iter *iter; @@ -44,13 +45,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { - if (!bkey_extent_is_data(k.k) || - !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { + if (!bch2_bkey_has_device(k, dev_idx)) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; @@ -99,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); +} + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index e7e58afe..9595ba79 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -64,13 +64,14 @@ static int bch2_migrate_index_update(struct 
bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - struct bkey_i_extent *insert, *new = + struct bkey_i *insert; + struct bkey_i_extent *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; @@ -83,32 +84,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op) break; if (bversion_cmp(k.k->version, new->k.version) || - !bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k), - m->ptr, m->offset)) + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; if (m->data_cmd == DATA_REWRITE && - !bch2_extent_has_device(bkey_s_c_to_extent(k), - m->data_opts.rewrite_dev)) + !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) goto nomatch; bkey_reassemble(&_insert.k, k); - insert = bkey_i_to_extent(&_insert.k); + insert = &_insert.k; bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter->pos, &insert->k_i); + bch2_cut_front(iter->pos, insert); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(extent_i_to_s(insert).s, + bch2_bkey_drop_device(bkey_i_to_s(insert), m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -124,18 +122,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op) if (!did_work) goto nomatch; - bch2_extent_narrow_crcs(insert, + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, extent_i_to_s(insert).s); - bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), + op->opts.background_target, + op->opts.data_replicas); /* * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && @@ -151,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) } bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &insert->k_i)); + BTREE_INSERT_ENTRY(iter, insert)); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), @@ -216,10 +214,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, struct bch_io_opts io_opts, enum data_cmd data_cmd, struct data_opts data_opts, + enum btree_id btree_id, struct bkey_s_c k) { int ret; + m->btree_id = btree_id; m->data_cmd = data_cmd; m->data_opts = data_opts; m->nr_ptrs_reserved = 0; @@ -267,11 +267,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, break; } case DATA_REWRITE: { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned compressed_sectors = 0; - extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry) + 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && p.crc.compression_type != BCH_COMPRESSION_NONE && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) @@ -395,14 +396,16 @@ static int bch2_move_extent(struct bch_fs *c, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, - struct bkey_s_c_extent e, + enum btree_id btree_id, + struct bkey_s_c k, enum data_cmd data_cmd, struct data_opts data_opts) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct moving_io *io; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned sectors = e.k->size, pages; + unsigned sectors = k.k->size, pages; int ret = -ENOMEM; move_ctxt_wait_event(ctxt, @@ -414,7 +417,7 @@ static int bch2_move_extent(struct bch_fs *c, SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); @@ -424,8 +427,8 @@ static int bch2_move_extent(struct bch_fs *c, goto err; io->write.ctxt = ctxt; - io->read_sectors = e.k->size; - io->write_sectors = e.k->size; + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); bio_set_prio(&io->write.op.wbio.bio, @@ -442,18 +445,18 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.bio.bi_iter.bi_size = sectors << 9; bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k); + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, e.s_c); + data_cmd, data_opts, btree_id, k); if (ret) goto err_free_pages; atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(e.k->size, &ctxt->stats->sectors_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - trace_move_extent(e.k); + trace_move_extent(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); @@ -463,7 +466,7 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, e.s_c, + bch2_read_extent(c, &io->rbio, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -472,20 +475,21 @@ err_free_pages: err_free: kfree(io); err: - trace_move_alloc_fail(e.k); + trace_move_alloc_fail(k.k); return ret; } -int bch2_move_data(struct bch_fs *c, - struct bch_ratelimit *rate, - struct write_point_specifier wp, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats) +static int __bch2_move_data(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats, + enum btree_id btree_id) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct moving_context ctxt = { .stats = stats }; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; struct btree_trans trans; @@ -496,17 +500,13 @@ int bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - closure_init_stack(&ctxt.cl); - INIT_LIST_HEAD(&ctxt.reads); - init_waitqueue_head(&ctxt.wait); - bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; - stats->btree_id = BTREE_ID_EXTENTS; 
+ stats->btree_id = btree_id; stats->pos = POS_MIN; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); if (rate) @@ -531,7 +531,7 @@ int bch2_move_data(struct bch_fs *c, if (unlikely(freezing(current))) { bch2_trans_unlock(&trans); - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); @@ -582,13 +582,12 @@ peek: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, - bkey_s_c_to_extent(k), + ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); + bch2_move_ctxt_wait_for_io(ctxt); continue; } @@ -606,7 +605,32 @@ next_nondata: bch2_trans_cond_resched(&trans); } out: - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; + + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats) +{ + struct moving_context ctxt = { .stats = stats }; + int ret; + + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_USER; + + ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_EXTENTS) ?: + __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_REFLINK); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 71b3d2b2..0acd1720 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -25,6 +25,7 @@ struct data_opts { }; struct migrate_write { + enum btree_id btree_id; enum data_cmd data_cmd; struct data_opts data_opts; @@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct write_point_specifier, struct bch_io_opts, enum data_cmd, struct data_opts, - struct bkey_s_c); + enum btree_id, struct bkey_s_c); typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index b13af566..71029604 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -69,26 +69,19 @@ static bool __copygc_pred(struct bch_dev *ca, struct bkey_s_c k) { copygc_heap *h = &ca->copygc_heap; + const struct bch_extent_ptr *ptr = + bch2_bkey_has_device(k, ca->dev_idx); - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr = - bch2_extent_has_device(e, ca->dev_idx); + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); - - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); - } - break; - } + return (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen); } return false; diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 6bdd6817..4797d620 100644 --- 
a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -38,9 +38,9 @@ void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bkey_s_c_extent e; if (!bkey_extent_is_data(k.k)) return; @@ -49,9 +49,7 @@ void bch2_rebalance_add_key(struct bch_fs *c, !io_opts->background_compression) return; - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (rebalance_ptr_pred(c, p, io_opts)) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 92867b5c..f2899ba9 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_i *k) { struct btree_trans trans; struct btree_iter *iter, *split_iter; @@ -247,6 +248,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i *split; + struct bpos atomic_end; bool split_compressed = false; int ret; @@ -254,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, btree_id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); @@ -273,9 +275,14 @@ retry: if (ret) goto err; + ret = bch2_extent_atomic_end(&trans, split_iter, + k, &atomic_end); + if (ret) + goto err; + if (!split_compressed && bch2_extent_is_compressed(bkey_i_to_s_c(k)) && - !bch2_extent_is_atomic(k, split_iter)) { + bkey_cmp(atomic_end, k->k.p) < 0) { ret = bch2_disk_reservation_add(c, &disk_res, k->k.size * bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), @@ -287,7 +294,7 @@ retry: bkey_copy(split, k); bch2_cut_front(split_iter->pos, split); - bch2_extent_trim_atomic(split, split_iter); + bch2_cut_back(atomic_end, &split->k); bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); bch2_btree_iter_set_pos(iter, split->k.p); @@ -295,7 +302,7 @@ retry: if (split_compressed) { ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), - -((s64) k->k.size), + 0, -((s64) k->k.size), BCH_BUCKET_MARK_OVERWRITE) ?: bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_ATOMIC| @@ -335,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { replay_now_at(j, keys.journal_seq_base + i->journal_seq); - switch (i->btree_id) { - case BTREE_ID_ALLOC: + if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, i->k); - break; - default: + else if (btree_node_type_is_extents(i->btree_id)) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else ret = bch2_btree_insert(c, i->btree_id, i->k, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_NOMARK); - break; - } if (ret) { bch_err(c, "journal replay: error %d while replaying key", diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c new file mode 100644 index 00000000..dcca9c1d --- /dev/null +++ b/libbcachefs/reflink.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 
+#include "bcachefs.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "fs-io.h" +#include "reflink.h" + +#include + +/* reflink pointers */ + +const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (bkey_val_bytes(p.k) != sizeof(*p.v)) + return "incorrect value size"; + + return NULL; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); +} + +enum merge_result bch2_reflink_p_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, _r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* indirect extents */ + +const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (bkey_val_bytes(r.k) < sizeof(*r.v)) + return "incorrect value size"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +/* + * bch2_remap_range() depends on bch2_extent_update(), which depends on various + * things tied to the linux vfs for inode updates, for now: + */ +#ifndef NO_BCACHEFS_FS + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i_extent *e) +{ + struct bch_fs *c = trans->c; + struct btree_iter *reflink_iter; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + struct bkey_i_reflink_p *r_p; + int ret; + + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if (reflink_iter->pos.inode) { + bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + continue; + } + + if (bkey_deleted(k.k) && e->k.size <= k.k->size) + break; + } + + if (ret) + goto err; + + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + + r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_reflink_v_init(&r_v->k_i); + r_v->k.p = reflink_iter->pos; + bch2_key_resize(&r_v->k, e->k.size); + r_v->k.version = e->k.version; + + set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + + bkey_val_u64s(&e->k)); + r_v->v.refcount = 0; + memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) + return PTR_ERR(r_p); + + e->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(&e->k_i); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); +err: + if (!IS_ERR(reflink_iter)) { + c->reflink_hint = reflink_iter->pos.offset; 
+ bch2_trans_iter_put(trans, reflink_iter); + } + + return ret; +} + +static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +{ + struct bkey_s_c k = bch2_btree_iter_peek(iter); + + while (1) { + if (bkey_err(k)) + return k; + + if (bkey_cmp(iter->pos, end) >= 0) + return bkey_s_c_null; + + if (k.k->type == KEY_TYPE_extent || + k.k->type == KEY_TYPE_reflink_p) + return k; + + k = bch2_btree_iter_next(iter); + } +} + +s64 bch2_remap_range(struct bch_fs *c, + struct bch_inode_info *dst_inode, + struct bpos dst_start, struct bpos src_start, + u64 remap_sectors, u64 new_i_size) +{ + struct btree_trans trans; + struct btree_iter *dst_iter, *src_iter; + struct bkey_s_c src_k; + BKEY_PADDED(k) new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos dst_want, src_want; + u64 src_done, dst_done; + int ret = 0; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + BTREE_ITER_INTENT, 1); + dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + BTREE_ITER_INTENT, 2); + + while (1) { + bch2_trans_begin_updates(&trans); + trans.mem_top = 0; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto err; + } + + src_k = get_next_src(src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + goto btree_err; + + src_done = bpos_min(src_iter->pos, src_end).offset - + src_start.offset; + dst_want = POS(dst_start.inode, dst_start.offset + src_done); + + if (bkey_cmp(dst_iter->pos, dst_want) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, dst_want, + dst_inode, new_i_size); + if (ret) + goto btree_err; + continue; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); + + if (!bkey_cmp(dst_iter->pos, dst_end)) + break; + + if (src_k.k->type == KEY_TYPE_extent) { + bkey_reassemble(&new_src.k, src_k); + src_k = bkey_i_to_s_c(&new_src.k); + + bch2_cut_front(src_iter->pos, &new_src.k); + bch2_cut_back(src_end, &new_src.k.k); + + ret = bch2_make_extent_indirect(&trans, src_iter, + bkey_i_to_extent(&new_src.k)); + if (ret) + goto btree_err; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } + + if (src_k.k->type == KEY_TYPE_reflink_p) { + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(&new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_iter->pos.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); + } else { + BUG(); + } + + new_dst.k.k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k.k, + min(src_k.k->p.offset - src_iter->pos.offset, + dst_end.offset - dst_iter->pos.offset)); + + ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, + dst_iter, &new_dst.k, + new_i_size, false, true, NULL); + if (ret) + goto btree_err; + + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); +btree_err: + if (ret == -EINTR) + ret = 0; + if (ret) + goto err; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); +err: + BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + + dst_done = dst_iter->pos.offset - 
dst_start.offset; + new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + + ret = bch2_trans_exit(&trans) ?: ret; + + mutex_lock(&dst_inode->ei_update_lock); + if (dst_inode->v.i_size < new_i_size) { + i_size_write(&dst_inode->v, new_i_size); + ret = bch2_write_inode_size(c, dst_inode, new_i_size, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&dst_inode->ei_update_lock); + + return dst_done ?: ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h new file mode 100644 index 00000000..327618c3 --- /dev/null +++ b/libbcachefs/reflink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + +const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +enum merge_result bch2_reflink_p_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ +} + +const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + + +#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ +} + +#ifndef NO_BCACHEFS_FS +s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, + struct bpos, struct bpos, u64, u64); +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 4818453c..f84de35c 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -113,6 +113,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, extent_to_replicas(k, e); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break;
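
Illustrative usage (not part of the patch): assuming the fs.c changes in this series route the standard VFS clone path through to bch2_remap_range(), the new reflink machinery can be exercised from userspace with the generic FICLONERANGE ioctl from <linux/fs.h>. On bcachefs the effect is what bch2_make_extent_indirect() implements above: the source extents are rewritten as reflink_v keys in BTREE_ID_REFLINK, and both files then reference them through reflink_p keys. This is a minimal sketch, not bcachefs-specific code; offsets and lengths must be block aligned, and src_length == 0 means "clone to the end of the source file".

/* clone.c - minimal FICLONERANGE example (hypothetical helper, for illustration) */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */

int main(int argc, char **argv)
{
	struct file_clone_range range;
	int src, dst;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}

	src = open(argv[1], O_RDONLY);
	dst = open(argv[2], O_WRONLY|O_CREAT, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	range.src_fd      = src;
	range.src_offset  = 0;
	range.src_length  = 0;	/* 0 = clone to EOF */
	range.dest_offset = 0;

	/*
	 * Share all of src's extents with dst; on a reflink-capable
	 * filesystem no data is copied, only extent references.
	 */
	if (ioctl(dst, FICLONERANGE, &range)) {
		perror("FICLONERANGE");
		return 1;
	}

	return 0;
}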