From 7f3557f57efb6e22aa90fdaca481907f633ceb08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Oct 2019 18:04:36 -0400 Subject: [PATCH] Update bcachefs sources to 7e03c1ab0e bcachefs: Kill bchfs_extent_update() --- .bcachefs_revision | 2 +- include/linux/sched/signal.h | 11 + libbcachefs/bcachefs_format.h | 4 +- libbcachefs/btree_gc.c | 28 +- libbcachefs/btree_iter.c | 8 + libbcachefs/btree_update_leaf.c | 9 + libbcachefs/buckets.c | 161 ++++---- libbcachefs/buckets.h | 6 +- libbcachefs/dirent.c | 26 +- libbcachefs/dirent.h | 2 +- libbcachefs/ec.c | 39 +- libbcachefs/extents.c | 33 +- libbcachefs/extents.h | 5 +- libbcachefs/extents_types.h | 4 +- libbcachefs/fs-common.c | 25 +- libbcachefs/fs-common.h | 3 +- libbcachefs/fs-io.c | 653 +++++++++++--------------------- libbcachefs/fs-io.h | 10 - libbcachefs/fs.c | 28 -- libbcachefs/inode.h | 9 + libbcachefs/io.c | 305 ++++++++++++--- libbcachefs/io.h | 9 + libbcachefs/io_types.h | 2 + libbcachefs/recovery.c | 6 +- libbcachefs/reflink.c | 65 ++-- libbcachefs/reflink.h | 6 +- libbcachefs/replicas.c | 4 +- libbcachefs/super-io.c | 7 +- 28 files changed, 747 insertions(+), 723 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 2ad0374a..695629a4 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -ce9293e9d063f7f1a22209f9cc2f5cb7478e886c +7e03c1ab0ef2e3148ba70656eab67471c85a0419 diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e69de29b..20bdc050 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SIGNAL_H +#define _LINUX_SCHED_SIGNAL_H + +static inline int fatal_signal_pending(struct task_struct *p) +{ + return 0; +} + +#endif /* _LINUX_SCHED_SIGNAL_H */ + diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d4b04ef0..d619e5ca 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1503,14 +1503,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); /* Btree: */ -#define BCH_BTREE_IDS() \ +#define BCH_BTREE_IDS() \ x(EXTENTS, 0, "extents") \ x(INODES, 1, "inodes") \ x(DIRENTS, 2, "dirents") \ x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") \ + x(EC, 6, "stripes") \ x(REFLINK, 7, "reflink") enum btree_id { diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index f4adb07a..8b114d4f 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -216,7 +216,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, : expensive_debug_checks(c) ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale; + u8 max_stale = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -640,12 +640,7 @@ static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca; unsigned i; - - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); + int ret; BUG_ON(c->usage_gc); @@ -673,6 +668,18 @@ static int bch2_gc_start(struct bch_fs *c, } } + ret = bch2_ec_mem_alloc(c, true); + if (ret) + return ret; + + percpu_down_write(&c->mark_lock); + + /* + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too + */ + gc_pos_set(c, gc_phase(GC_PHASE_START)); + for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); struct bucket_array *src = __bucket_array(ca, 0); @@ -697,7 +704,9 @@ static int bch2_gc_start(struct bch_fs *c, } }; - return bch2_ec_mem_alloc(c, true); + percpu_up_write(&c->mark_lock); + + return 0; } /** @@ -730,10 +739,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, down_write(&c->gc_lock); again: - percpu_down_write(&c->mark_lock); ret = bch2_gc_start(c, metadata_only); - percpu_up_write(&c->mark_lock); - if (ret) goto out; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index f7039df9..e7f7820f 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1440,6 +1440,14 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } + if (unlikely(bkey_deleted(&iter->k))) { + /* + * we're currently pointed at a hole, because previously we were + * iterating over slots: + */ + return bch2_btree_iter_peek(iter); + } + do { bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 536ac921..271f9114 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -451,6 +451,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; + struct btree_iter *iter; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; @@ -473,6 +474,14 @@ static inline int do_btree_insert_at(struct btree_trans *trans, goto out_clear_replicas; } + trans_for_each_iter(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { + BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + BUG_ON(trans->iters_live & (1ULL << iter->idx)); + __bch2_btree_iter_unlock(iter); + } + } + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update(trans, i) btree_insert_entry_checks(trans, i); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 72cc11b2..8481c707 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -807,26 +807,42 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } +static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) +{ + return DIV_ROUND_UP(sectors * n, d); +} + +static s64 __ptr_disk_sectors_delta(unsigned old_size, + unsigned offset, s64 delta, + unsigned flags, + unsigned n, unsigned d) +{ + BUG_ON(!n || !d); + + if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { + BUG_ON(offset + -delta > old_size); + + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, offset) + + disk_sectors_scaled(n, d, old_size - offset + delta); + } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { + BUG_ON(offset + -delta > old_size); + + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, old_size + delta); + } else { + return disk_sectors_scaled(n, d, delta); + } +} + static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, unsigned offset, s64 delta, unsigned flags) { - if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { - BUG_ON(offset + -delta > p.crc.live_size); - - return -((s64) ptr_disk_sectors(p)) + - __ptr_disk_sectors(p, offset) + - __ptr_disk_sectors(p, p.crc.live_size - - offset + delta); - } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { - BUG_ON(offset + -delta > p.crc.live_size); - - return -((s64) ptr_disk_sectors(p)) + - __ptr_disk_sectors(p, p.crc.live_size + - delta); - } else { - return ptr_disk_sectors(p); - } + return __ptr_disk_sectors_delta(p.crc.live_size, + offset, delta, flags, + p.crc.compressed_size, + p.crc.uncompressed_size); } static void bucket_set_stripe(struct bch_fs *c, @@ -964,15 +980,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags) + s64 sectors, unsigned flags, + struct bch_replicas_padded *r, + unsigned *nr_data, + unsigned *nr_parity) { bool gc = flags & BCH_BUCKET_MARK_GC; struct stripe *m; - unsigned old, new, nr_data; + unsigned old, new; int blocks_nonempty_delta; - s64 parity_sectors; - - BUG_ON(!sectors); m = genradix_ptr(&c->stripes[gc], p.idx); @@ -987,13 +1003,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, BUG_ON(m->r.e.data_type != data_type); - nr_data = m->nr_blocks - m->nr_redundant; - - parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); - - if (sectors < 0) - parity_sectors = -parity_sectors; - sectors += parity_sectors; + *nr_data = m->nr_blocks - m->nr_redundant; + *nr_parity = m->nr_redundant; + *r = m->r; old = m->block_sectors[p.block]; m->block_sectors[p.block] += sectors; @@ -1011,8 +1023,6 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); - update_replicas(c, fs_usage, &m->r.e, sectors); - return 0; } @@ -1027,7 +1037,6 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct 
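/*
 * (Illustrative aside, not from the commit: a worked example of the
 * disk_sectors_scaled()/__ptr_disk_sectors_delta() helpers added above.
 * The erasure-coding callers pass n = crc.compressed_size * nr_parity
 * and d = crc.uncompressed_size * nr_data; for an uncompressed extent
 * in a 2+1 stripe that ratio reduces to 1:2, so newly writing 8 sectors
 * costs disk_sectors_scaled(1, 2, 8) = DIV_ROUND_UP(8 * 1, 2) = 4
 * parity sectors on top of the 8 data sectors. That is the same number
 * the old open-coded DIV_ROUND_UP(abs(sectors) * nr_redundant, nr_data)
 * produced, but computed as a delta of whole-extent totals so partial
 * overwrites of compressed extents round consistently.)
 */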
extent_ptr_decoded p; struct bch_replicas_padded r; s64 dirty_sectors = 0; - unsigned i; int ret; r.e.data_type = data_type; @@ -1041,29 +1050,46 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags); + fs_usage, journal_seq, flags); if (p.ptr.cached) { if (!stale) update_cached_sectors(c, fs_usage, p.ptr.dev, disk_sectors); - } else if (!p.ec_nr) { + } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_mark_stripe_ptr(c, p.ec[i], - data_type, fs_usage, - disk_sectors, flags); - if (ret) - return ret; - } + struct bch_replicas_padded ec_r; + unsigned nr_data, nr_parity; + s64 parity_sectors; + ret = bch2_mark_stripe_ptr(c, p.ec, data_type, + fs_usage, disk_sectors, flags, + &ec_r, &nr_data, &nr_parity); + if (ret) + return ret; + + parity_sectors = + __ptr_disk_sectors_delta(p.crc.live_size, + offset, sectors, flags, + p.crc.compressed_size * nr_parity, + p.crc.uncompressed_size * nr_data); + + update_replicas(c, fs_usage, &ec_r.e, + disk_sectors + parity_sectors); + + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ r.e.nr_required = 0; } } - update_replicas(c, fs_usage, &r.e, dirty_sectors); + if (r.e.nr_devs) + update_replicas(c, fs_usage, &r.e, dirty_sectors); return 0; } @@ -1501,16 +1527,16 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type) + s64 sectors, enum bch_data_type data_type, + struct bch_replicas_padded *r, + unsigned *nr_data, + unsigned *nr_parity) { struct bch_fs *c = trans->c; - struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; struct bkey_s_c k; struct bkey_s_stripe s; - unsigned nr_data; - s64 parity_sectors; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1533,20 +1559,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bkey_reassemble(new_k, k); s = bkey_i_to_s_stripe(new_k); - nr_data = s.v->nr_blocks - s.v->nr_redundant; - - parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data); - - if (sectors < 0) - parity_sectors = -parity_sectors; - stripe_blockcount_set(s.v, p.block, stripe_blockcount_get(s.v, p.block) + - sectors + parity_sectors); + sectors); - bch2_bkey_to_replicas(&r.e, s.s_c); - - update_replicas_list(trans, &r.e, sectors); + *nr_data = s.v->nr_blocks - s.v->nr_redundant; + *nr_parity = s.v->nr_redundant; + bch2_bkey_to_replicas(&r->e, s.s_c); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1563,7 +1582,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, struct bch_replicas_padded r; s64 dirty_sectors = 0; bool stale; - unsigned i; int ret; r.e.data_type = data_type; @@ -1588,22 +1606,35 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, if (!stale) update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); - } else if (!p.ec_nr) { + } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], - disk_sectors, data_type); - if (ret) - return ret; - } + struct bch_replicas_padded ec_r; + unsigned nr_data, nr_parity; + s64 
parity_sectors; + + ret = bch2_trans_mark_stripe_ptr(trans, p.ec, + disk_sectors, data_type, + &ec_r, &nr_data, &nr_parity); + if (ret) + return ret; + + parity_sectors = + __ptr_disk_sectors_delta(p.crc.live_size, + offset, sectors, flags, + p.crc.compressed_size * nr_parity, + p.crc.uncompressed_size * nr_data); + + update_replicas_list(trans, &ec_r.e, + disk_sectors + parity_sectors); r.e.nr_required = 0; } } - update_replicas_list(trans, &r.e, dirty_sectors); + if (r.e.nr_devs) + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index a4bab66d..8ab18b55 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -137,8 +137,8 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } -static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, - unsigned live_size) +static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, + unsigned live_size) { return live_size && p.crc.compression_type ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, @@ -146,7 +146,7 @@ static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, : live_size; } -static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p) +static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) { return __ptr_disk_sectors(p, p.crc.live_size); } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 304128d7..38017699 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -296,10 +296,10 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, struct btree_iter * __bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, - const struct qstr *name) + const struct qstr *name, unsigned flags) { return bch2_hash_lookup(trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, 0); + hash_info, dir_inum, name, flags); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, @@ -313,7 +313,8 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, bch2_trans_init(&trans, c, 0, 0); - iter = __bch2_dirent_lookup_trans(&trans, dir_inum, hash_info, name); + iter = __bch2_dirent_lookup_trans(&trans, dir_inum, + hash_info, name, 0); if (IS_ERR(iter)) { BUG_ON(PTR_ERR(iter) == -EINTR); goto out; @@ -353,36 +354,31 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; - unsigned len; int ret; bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(inum, ctx->pos), 0, k, ret) { + if (k.k->p.inode > inum) + break; + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); - if (bkey_cmp(k.k->p, POS(inum, ctx->pos)) < 0) - continue; - - if (k.k->p.inode > inum) - break; - - len = bch2_dirent_name_bytes(dirent); - /* * XXX: dir_emit() can fault and block, while we're holding * locks */ - if (!dir_emit(ctx, dirent.v->d_name, len, + ctx->pos = dirent.k->p.offset; + if (!dir_emit(ctx, dirent.v->d_name, + bch2_dirent_name_bytes(dirent), le64_to_cpu(dirent.v->d_inum), dirent.v->d_type)) break; - - ctx->pos = k.k->p.offset + 1; + ctx->pos = dirent.k->p.offset + 1; } ret = bch2_trans_exit(&trans) ?: ret; diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 9a57ad00..e6184dc7 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -55,7 +55,7 @@ int bch2_dirent_rename(struct btree_trans *, struct btree_iter * __bch2_dirent_lookup_trans(struct btree_trans 
*, u64, const struct bch_hash_info *, - const struct qstr *); + const struct qstr *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 740d3ef7..ad92d3b4 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -135,8 +135,6 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); - - bch2_bkey_ptrs_to_text(out, c, k); } static int ptr_matches_stripe(struct bch_fs *c, @@ -433,10 +431,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) closure_init_stack(&cl); - BUG_ON(!rbio->pick.idx || - rbio->pick.idx - 1 >= rbio->pick.ec_nr); + BUG_ON(!rbio->pick.has_ec); - stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + stripe_idx = rbio->pick.ec.idx; buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) @@ -561,7 +558,7 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, size_t idx = iter->pos.offset; int ret = 0; - if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT)) + if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) return ret; bch2_trans_unlock(iter->trans); @@ -1278,7 +1275,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) struct btree_trans trans; struct btree_iter *btree_iter; struct journal_iter journal_iter; - struct bkey_s_c btree_k, journal_k, k; + struct bkey_s_c btree_k, journal_k; int ret; ret = bch2_fs_ec_start(c); @@ -1294,33 +1291,31 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) journal_k = bch2_journal_iter_peek(&journal_iter); while (1) { + bool btree; + if (btree_k.k && journal_k.k) { int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); - if (cmp < 0) { - k = btree_k; + if (!cmp) btree_k = bch2_btree_iter_next(btree_iter); - } else if (cmp == 0) { - btree_k = bch2_btree_iter_next(btree_iter); - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); - } else { - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); - } + btree = cmp < 0; } else if (btree_k.k) { - k = btree_k; - btree_k = bch2_btree_iter_next(btree_iter); + btree = true; } else if (journal_k.k) { - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); + btree = false; } else { break; } - bch2_mark_key(c, k, 0, 0, NULL, 0, + bch2_mark_key(c, btree ? 
btree_k : journal_k, + 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); + + if (btree) + btree_k = bch2_btree_iter_next(btree_iter); + else + journal_k = bch2_journal_iter_next(&journal_iter); } ret = bch2_trans_exit(&trans) ?: ret; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 1b86d27e..dcd70994 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -67,7 +67,7 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) static unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded p) { - unsigned i, durability = 0; + unsigned durability = 0; struct bch_dev *ca; if (p.ptr.cached) @@ -78,16 +78,16 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (ca->mi.state != BCH_MEMBER_STATE_FAILED) durability = max_t(unsigned, durability, ca->mi.durability); - for (i = 0; i < p.ec_nr; i++) { + if (p.has_ec) { struct stripe *s = - genradix_ptr(&c->stripes[0], p.idx); + genradix_ptr(&c->stripes[0], p.ec.idx); if (WARN_ON(!s)) - continue; + goto out; durability = max_t(unsigned, durability, s->nr_redundant); } - +out: return durability; } @@ -206,10 +206,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.idx++; if (force_reconstruct_read(c) && - !p.idx && p.ec_nr) + !p.idx && p.has_ec) p.idx++; - if (p.idx >= p.ec_nr + 1) + if (p.idx >= (unsigned) p.has_ec + 1) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -1011,13 +1011,19 @@ int bch2_extent_atomic_end(struct btree_iter *iter, struct bpos *end) { struct btree_trans *trans = iter->trans; - struct btree *b = iter->l[0].b; - struct btree_node_iter node_iter = iter->l[0].iter; + struct btree *b; + struct btree_node_iter node_iter; struct bkey_packed *_k; unsigned nr_iters = 0; int ret; - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + b = iter->l[0].b; + node_iter = iter->l[0].iter; + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); *end = bpos_min(insert->k.p, b->key.k.p); @@ -1538,7 +1544,6 @@ void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; - unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = ptrs.start; @@ -1557,9 +1562,9 @@ found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; __extent_entry_insert(k, pos, to_entry(&p->ptr)); - for (i = 0; i < p->ec_nr; i++) { - p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec[i])); + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); } } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 613d76af..67abc3c8 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -228,7 +228,7 @@ struct bkey_ptrs { __label__ out; \ \ (_ptr).idx = 0; \ - (_ptr).ec_nr = 0; \ + (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (extent_entry_type(_entry)) { \ @@ -242,7 +242,8 @@ struct bkey_ptrs { entry_to_crc(_entry)); \ break; \ case BCH_EXTENT_ENTRY_stripe_ptr: \ - (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + (_ptr).ec = _entry->stripe_ptr; \ + (_ptr).has_ec = true; \ break; \ } \ out: \ diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index a8dd6952..43d6c341 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -21,10 +21,10 @@ struct bch_extent_crc_unpacked { struct extent_ptr_decoded { 
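	/*
	 * (Illustrative aside, not from the commit: with the ec[4] array
	 * and ec_nr gone, an extent pointer now participates in at most
	 * one stripe, so consumers of the decode macros collapse from
	 *
	 *	for (i = 0; i < p.ec_nr; i++)
	 *		process(&p.ec[i]);
	 *
	 * to a single test
	 *
	 *	if (p.has_ec)
	 *		process(&p.ec);
	 *
	 * where process() is a stand-in for callers such as
	 * bch2_mark_stripe_ptr() in the buckets.c hunks above.)
	 */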
unsigned idx; - unsigned ec_nr; + bool has_ec; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec[4]; + struct bch_extent_stripe_ptr ec; }; struct bch_io_failures { diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index fdd2b9b6..a4497eeb 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -24,8 +24,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, u64 now = bch2_current_time(trans->c); int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, - name ? BTREE_ITER_INTENT : 0); + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); if (IS_ERR(dir_iter)) return PTR_ERR(dir_iter); @@ -76,8 +75,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, return 0; } -int bch2_link_trans(struct btree_trans *trans, - u64 dir_inum, +int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked *inode_u, const struct qstr *name) { @@ -86,19 +84,22 @@ int bch2_link_trans(struct btree_trans *trans, struct bch_hash_info dir_hash; u64 now = bch2_current_time(trans->c); - dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); if (IS_ERR(inode_iter)) return PTR_ERR(inode_iter); - dir_hash = bch2_hash_info_init(trans->c, &dir_u); - inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); + dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + /* XXX: shouldn't we be updating mtime/ctime on the directory? */ + + dir_hash = bch2_hash_info_init(trans->c, &dir_u); + bch2_trans_iter_put(trans, dir_iter); + return bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum, BCH_HASH_SET_MUST_CREATE) ?: @@ -121,8 +122,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(trans->c, dir_u); - dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, - &dir_hash, name); + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, + name, BTREE_ITER_INTENT); if (IS_ERR(dirent_iter)) return PTR_ERR(dirent_iter); diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h index 7adcfcf9..c1621485 100644 --- a/libbcachefs/fs-common.h +++ b/libbcachefs/fs-common.h @@ -12,8 +12,7 @@ int bch2_create_trans(struct btree_trans *, u64, struct posix_acl *, struct posix_acl *); -int bch2_link_trans(struct btree_trans *, - u64, +int bch2_link_trans(struct btree_trans *, u64, u64, struct bch_inode_unpacked *, const struct qstr *); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 6c2832ff..770fed19 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -36,25 +36,14 @@ struct quota_res { u64 sectors; }; -struct bchfs_write_op { +struct bch_writepage_io { + struct closure cl; struct bch_inode_info *inode; - s64 sectors_added; - bool is_dio; - bool unalloc; - u64 new_i_size; /* must be last: */ struct bch_write_op op; }; -struct bch_writepage_io { - struct closure cl; - u64 new_sectors; - - /* must be last: */ - struct bchfs_write_op op; -}; - struct dio_write { struct closure cl; struct kiocb *req; @@ -68,7 +57,7 @@ struct dio_write { struct iovec inline_vecs[2]; /* must be last: */ - struct bchfs_write_op iop; + struct bch_write_op op; }; struct dio_read { @@ -229,220 +218,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, mutex_unlock(&inode->ei_quota_lock); } -/* normal 
i_size/i_sectors update machinery: */ - -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool may_allocate, - s64 *delta) -{ - struct btree_iter *iter; - struct bkey_s_c old; - int ret = 0; - - *delta = 0; - - iter = bch2_trans_copy_iter(trans, extent_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { - if (!may_allocate && - bch2_bkey_nr_ptrs_allocated(old) < - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { - ret = -ENOSPC; - break; - } - - *delta += (min(new->k.p.offset, - old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k))) * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - if (bkey_cmp(old.k->p, new->k.p) >= 0) - break; - } - - bch2_trans_iter_put(trans, iter); - return ret; -} - -int bch2_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter *inode_iter = NULL; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - bool extended = false; - s64 i_sectors_delta; - int ret; - - ret = bch2_btree_iter_traverse(extent_iter); - if (ret) - return ret; - - ret = bch2_extent_trim_atomic(k, extent_iter); - if (ret) - return ret; - - ret = sum_sector_overwrites(trans, extent_iter, k, - may_allocate, &i_sectors_delta); - if (ret) - return ret; - - bch2_trans_update(trans, extent_iter, k); - - new_i_size = min(k->k.p.offset << 9, new_i_size); - - /* XXX: inode->i_size locking */ - if (i_sectors_delta || - new_i_size > inode->ei_inode.bi_size) { - inode_iter = bch2_inode_peek(trans, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); - - inode_u.bi_sectors += i_sectors_delta; - - /* - * XXX: can BCH_INODE_I_SIZE_DIRTY be true here? i.e. can we - * race with truncate? - */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } - - if (i_sectors_delta || extended) { - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i); - } - } - - ret = bch2_trans_commit(trans, disk_res, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_USE_RESERVE); - if (ret) - goto err; - - if (i_sectors_delta || extended) { - inode->ei_inode.bi_sectors = inode_u.bi_sectors; - inode->ei_inode.bi_size = inode_u.bi_size; - } - - if (direct) - i_sectors_acct(c, inode, quota_res, i_sectors_delta); - if (direct && extended) { - spin_lock(&inode->v.i_lock); - if (new_i_size > inode->v.i_size) - i_size_write(&inode->v, new_i_size); - spin_unlock(&inode->v.i_lock); - } - - if (total_delta) - *total_delta += i_sectors_delta; -err: - if (!IS_ERR_OR_NULL(inode_iter)) - bch2_trans_iter_put(trans, inode_iter); - return ret; -} - -static int bchfs_write_index_update(struct bch_write_op *wop) -{ - struct bch_fs *c = wop->c; - struct bchfs_write_op *op = container_of(wop, - struct bchfs_write_op, op); - struct quota_res *quota_res = op->is_dio - ? 
&container_of(op, struct dio_write, iop)->quota_res - : NULL; - struct bch_inode_info *inode = op->inode; - struct keylist *keys = &op->op.insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - BUG_ON(k->k.p.inode != inode->v.i_ino); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - do { - BKEY_PADDED(k) tmp; - - bkey_copy(&tmp.k, bch2_keylist_front(keys)); - - bch2_trans_begin_updates(&trans); - - ret = bch2_extent_update(&trans, inode, - &wop->res, quota_res, - iter, &tmp.k, - op->new_i_size, - !op->unalloc, - op->is_dio, - &op->sectors_added); - if (ret == -EINTR) - continue; - if (ret) - break; - - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else - bch2_keylist_pop_front(keys); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_exit(&trans); - - return ret; -} - -static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, - struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_io_opts opts, - bool is_dio) -{ - op->inode = inode; - op->sectors_added = 0; - op->is_dio = is_dio; - op->unalloc = false; - op->new_i_size = U64_MAX; - - bch2_write_op_init(&op->op, c, opts); - op->op.target = opts.foreground_target; - op->op.index_update_fn = bchfs_write_index_update; - op_journal_seq_set(&op->op, &inode->ei_journal_seq); -} - -static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode) -{ - struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - - bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode)); - return opts; -} - /* page state: */ /* stored in page->private: */ @@ -464,6 +239,7 @@ struct bch_page_sector { }; struct bch_page_state { + spinlock_t lock; atomic_t write_count; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -519,6 +295,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, if (!s) return NULL; + spin_lock_init(&s->lock); /* * migrate_page_move_mapping() assumes that pages with private data * have their count elevated by 1. 
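Illustrative aside, not from the commit: the spinlock added to struct bch_page_state above exists so per-sector state can be changed without holding the page lock - the writeback error path in bch2_writepage_io_done() below runs from bio completion, where taking a sleeping page lock is undesirable. A minimal sketch of the pattern this enables, with a hypothetical helper name:

	static void hypothetical_clear_page_replicas(struct bch_page_state *s)
	{
		unsigned i;

		/* a spinlock never sleeps: usable where lock_page() is not */
		spin_lock(&s->lock);
		for (i = 0; i < PAGE_SECTORS; i++)
			s->s[i].nr_replicas = 0;
		spin_unlock(&s->lock);
	}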
@@ -666,6 +443,9 @@ static void bch2_clear_page_bits(struct page *page) if (!s) return; + EBUG_ON(!PageLocked(page)); + EBUG_ON(PageWriteback(page)); + for (i = 0; i < ARRAY_SIZE(s->s); i++) { disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; @@ -695,6 +475,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, WARN_ON((u64) page_offset(page) + offset + len > round_up((u64) i_size_read(&inode->v), block_bytes(c))); + spin_lock(&s->lock); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -711,6 +493,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); } + spin_unlock(&s->lock); + if (dirty_sectors) i_sectors_acct(c, inode, &res->quota, dirty_sectors); @@ -776,9 +560,6 @@ out: void bch2_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); - if (offset || length < PAGE_SIZE) return; @@ -787,10 +568,6 @@ void bch2_invalidatepage(struct page *page, unsigned int offset, int bch2_releasepage(struct page *page, gfp_t gfp_mask) { - /* XXX: this can't take locks that are held while we allocate memory */ - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); - if (PageDirty(page)) return 0; @@ -1091,7 +868,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct page *page; @@ -1165,7 +942,7 @@ int bch2_readpage(struct file *file, struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct bch_read_bio *rbio; rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); @@ -1190,7 +967,7 @@ static int bch2_read_single_page(struct page *page, DECLARE_COMPLETION_ONSTACK(done); rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), - io_opts(c, inode)); + io_opts(c, &inode->ei_inode)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1217,7 +994,9 @@ struct bch_writepage_state { static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, struct bch_inode_info *inode) { - return (struct bch_writepage_state) { .opts = io_opts(c, inode) }; + return (struct bch_writepage_state) { + .opts = io_opts(c, &inode->ei_inode) + }; } static void bch2_writepage_io_free(struct closure *cl) @@ -1225,31 +1004,31 @@ static void bch2_writepage_io_free(struct closure *cl) struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - bio_put(&io->op.op.wbio.bio); + bio_put(&io->op.wbio.bio); } static void bch2_writepage_io_done(struct closure *cl) { struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - struct bch_fs *c = io->op.op.c; - struct bio *bio = &io->op.op.wbio.bio; + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bvec; unsigned i; - if (io->op.op.error) { + if (io->op.error) { bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); - 
lock_page(bvec->bv_page); - s = bch2_page_state(bvec->bv_page); + s = __bch2_page_state(bvec->bv_page); + spin_lock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) s->s[i].nr_replicas = 0; - unlock_page(bvec->bv_page); + spin_unlock(&s->lock); } } @@ -1257,22 +1036,20 @@ static void bch2_writepage_io_done(struct closure *cl) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - BUG_ON(io->op.sectors_added > (s64) io->new_sectors); + BUG_ON(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up * slightly) * XXX wtf? - BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS); + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); */ /* * PageWriteback is effectively our ref on the inode - fixup i_blocks * before calling end_page_writeback: */ - if (io->op.sectors_added != io->new_sectors) - i_sectors_acct(c, io->op.inode, NULL, - io->op.sectors_added - (s64) io->new_sectors); + i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); @@ -1289,7 +1066,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) struct bch_writepage_io *io = w->io; w->io = NULL; - closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); + closure_call(&io->op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); } @@ -1308,12 +1085,15 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &c->writepage_bioset), - struct bch_writepage_io, op.op.wbio.bio); + struct bch_writepage_io, op.wbio.bio); closure_init(&w->io->cl, NULL); - w->io->new_sectors = 0; - bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false); - op = &w->io->op.op; + w->io->inode = inode; + + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); @@ -1422,32 +1202,31 @@ do_io: } if (w->io && - (w->io->op.op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.op.wbio.bio) || - bio_end_sector(&w->io->op.op.wbio.bio) != sector)) + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); if (!w->io) bch2_writepage_io_alloc(c, w, inode, sector, nr_replicas_this_write); - w->io->new_sectors += dirty_sectors; - atomic_inc(&s->write_count); - BUG_ON(inode != w->io->op.inode); - BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, sectors << 9, offset << 9)); /* Check for writing past i_size: */ - WARN_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > + WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > round_up(i_size, block_bytes(c))); - w->io->op.op.res.sectors += reserved_sectors; + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + w->io->op.wbio.bio.bi_opf |= REQ_SYNC; offset += sectors; } @@ -1857,7 +1636,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode 
= file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_read *dio; struct bio *bio; loff_t offset = req->ki_pos; @@ -1952,14 +1731,15 @@ static void bch2_dio_write_loop_async(struct closure *); static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct bch_fs *c = dio->iop.op.c; + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; - struct bch_inode_info *inode = dio->iop.inode; - struct bio *bio = &dio->iop.op.wbio.bio; + struct bch_inode_info *inode = file_bch_inode(req->ki_filp); + struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; unsigned unaligned; + u64 new_i_size; loff_t offset; bool sync; long ret; @@ -1971,7 +1751,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) __pagecache_block_get(&mapping->add_lock); /* Write and invalidate pagecache range that we're writing to: */ - offset = req->ki_pos + (dio->iop.op.written << 9); + offset = req->ki_pos + (dio->op.written << 9); ret = write_invalidate_inode_pages_range(mapping, offset, offset + iov_iter_count(&dio->iter) - 1); @@ -1979,7 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto err; while (1) { - offset = req->ki_pos + (dio->iop.op.written << 9); + offset = req->ki_pos + (dio->op.written << 9); BUG_ON(current->pagecache_lock); current->pagecache_lock = &mapping->add_lock; @@ -2017,11 +1797,11 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (unlikely(ret)) goto err; - dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9); + dio->op.pos = POS(inode->v.i_ino, offset >> 9); task_io_account_write(bio->bi_iter.bi_size); - closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + closure_call(&dio->op.cl, bch2_write, NULL, &dio->cl); if (!dio->sync && !dio->loop && dio->iter.count) { struct iovec *iov = dio->inline_vecs; @@ -2030,7 +1810,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), GFP_KERNEL); if (unlikely(!iov)) { - dio->iop.op.error = -ENOMEM; + dio->op.error = -ENOMEM; goto err_wait_io; } @@ -2050,17 +1830,28 @@ err_wait_io: closure_sync(&dio->cl); loop: + i_sectors_acct(c, inode, &dio->quota_res, + dio->op.i_sectors_delta); + dio->op.i_sectors_delta = 0; + + new_i_size = req->ki_pos + ((u64) dio->op.written << 9); + + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->iop.op.error) + if (!dio->iter.count || dio->op.error) break; bio_reset(bio); } - ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); + ret = dio->op.error ?: ((long) dio->op.written << 9); err: __pagecache_block_put(&mapping->add_lock); - bch2_disk_reservation_put(c, &dio->iop.op.res); + bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) @@ -2095,6 +1886,7 @@ static int bch2_direct_IO_write(struct kiocb *req, struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_write *dio; struct bio *bio; ssize_t ret; @@ -2110,7 +1902,7 @@ static int bch2_direct_IO_write(struct kiocb *req, bio = 
bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, iop.op.wbio.bio); + dio = container_of(bio, struct dio_write, op.wbio.bio); closure_init(&dio->cl, NULL); dio->req = req; dio->mm = current->mm; @@ -2120,36 +1912,36 @@ static int bch2_direct_IO_write(struct kiocb *req, dio->free_iov = false; dio->quota_res.sectors = 0; dio->iter = *iter; - bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); - dio->iop.op.write_point = writepoint_hashed((unsigned long) current); - dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; + + bch2_write_op_init(&dio->op, c, opts); + dio->op.target = opts.foreground_target; + op_journal_seq_set(&dio->op, &inode->ei_journal_seq); + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION; if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) - dio->iop.op.flags |= BCH_WRITE_FLUSH; + dio->op.flags |= BCH_WRITE_FLUSH; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) goto err; - dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas; + dio->op.nr_replicas = dio->op.opts.data_replicas; - ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, - dio->iop.op.opts.data_replicas, 0); - if (unlikely(ret)) { - if (!bch2_check_range_allocated(c, POS(inode->v.i_ino, - req->ki_pos >> 9), - iter->count >> 9, - dio->iop.op.opts.data_replicas)) - goto err; - - dio->iop.unalloc = true; - } + ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9, + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_check_range_allocated(c, POS(inode->v.i_ino, + req->ki_pos >> 9), + iter->count >> 9, + dio->op.opts.data_replicas)) + goto err; return bch2_dio_write_loop(dio); err: - bch2_disk_reservation_put(c, &dio->iop.op.res); + bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); closure_debug_destroy(&dio->cl); bio_put(bio); @@ -2250,80 +2042,6 @@ out: /* truncate: */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bch_inode_info *inode, - u64 new_i_size) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - - while ((k = bch2_btree_iter_peek(iter)).k && - bkey_cmp(iter->pos, end) < 0) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto btree_err; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); - - bch2_trans_begin_updates(trans); - - ret = bch2_extent_update(trans, inode, - &disk_res, NULL, iter, &delete, - new_i_size, false, true, NULL); - bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; - } - - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); - } - - return ret ?: ret2; -} - -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset) -{ - struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, start_offset), - 
BTREE_ITER_INTENT); - - ret = bch2_fpunch_at(&trans, iter, - POS(inode->v.i_ino, end_offset), - inode, 0); - - bch2_trans_exit(&trans); - - if (ret == -EINTR) - ret = 0; - - return ret; -} - static inline int range_has_data(struct bch_fs *c, struct bpos start, struct bpos end) @@ -2437,14 +2155,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } -static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) +static int bch2_extend(struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; int ret; - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + /* + * sync appends: + * + * this has to be done _before_ extending i_size: + */ + ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); if (ret) return ret; @@ -2484,19 +2208,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bch_inode_unpacked inode_u; + struct btree_trans trans; + struct btree_iter *iter; u64 new_i_size = iattr->ia_size; - bool shrink; + s64 i_sectors_delta = 0; int ret = 0; inode_dio_wait(&inode->v); pagecache_block_get(&mapping->add_lock); - BUG_ON(inode->v.i_size < inode->ei_inode.bi_size); + /* + * fetch current on disk i_size: inode is locked, i_size can only + * increase underneath us: + */ + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_exit(&trans); - shrink = iattr->ia_size <= inode->v.i_size; + if (ret) + goto err; - if (!shrink) { - ret = bch2_extend(inode, iattr); + BUG_ON(inode->v.i_size < inode_u.bi_size); + + if (iattr->ia_size > inode->v.i_size) { + ret = bch2_extend(inode, &inode_u, iattr); goto err; } @@ -2514,9 +2251,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) * userspace has to redirty it and call .mkwrite -> set_page_dirty * again to allocate the part of the page that was extended. 
*/ - if (iattr->ia_size > inode->ei_inode.bi_size) + if (iattr->ia_size > inode_u.bi_size) ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, + inode_u.bi_size, iattr->ia_size - 1); else if (iattr->ia_size & (PAGE_SIZE - 1)) ret = filemap_write_and_wait_range(mapping, @@ -2535,9 +2272,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) truncate_setsize(&inode->v, iattr->ia_size); - ret = __bch2_fpunch(c, inode, + ret = bch2_fpunch(c, inode->v.i_ino, round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX); + U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (unlikely(ret)) goto err; @@ -2554,7 +2293,7 @@ err: /* fallocate: */ -static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; @@ -2583,8 +2322,15 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); - if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end); + if (discard_start < discard_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode->v.i_ino, + discard_start, discard_end, + &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + } err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); @@ -2592,7 +2338,7 @@ err: return ret; } -static long bch2_fcollapse_finsert(struct bch_inode_info *inode, +static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { @@ -2652,8 +2398,14 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); } else { - ret = __bch2_fpunch(c, inode, offset >> 9, - (offset + len) >> 9); + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode->v.i_ino, + offset >> 9, (offset + len) >> 9, + &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (ret) goto err; } @@ -2705,10 +2457,6 @@ reassemble: copy.k.k.p.offset += shift >> 9; bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); - ret = bch2_btree_iter_traverse(dst); - if (ret) - goto bkey_err; - ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); if (ret) goto bkey_err; @@ -2807,8 +2555,8 @@ err: return ret; } -static long bch2_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) { struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -2819,7 +2567,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, loff_t block_start = round_down(offset, block_bytes(c)); loff_t block_end = round_up(end, block_bytes(c)); unsigned sectors; - unsigned replicas = io_opts(c, inode).data_replicas; + unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -2857,6 +2605,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, end_pos = POS(inode->v.i_ino, block_end >> 9); while (bkey_cmp(iter->pos, end_pos) < 0) { + s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; struct bkey_i_reservation 
reservation; @@ -2910,10 +2659,10 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, inode, - &disk_res, "a_res, - iter, &reservation.k_i, - 0, true, true, NULL); + ret = bch2_extent_update(&trans, iter, &reservation.k_i, + &disk_res, &inode->ei_journal_seq, + 0, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); @@ -2922,33 +2671,49 @@ bkey_err: if (ret) goto err; } - bch2_trans_unlock(&trans); - if (!(mode & FALLOC_FL_KEEP_SIZE) && - end > inode->v.i_size) { - i_size_write(&inode->v, end); + /* + * Do we need to extend the file? + * + * If we zeroed up to the end of the file, we dropped whatever writes + * were going to write out the current i_size, so we have to extend + * manually even if FL_KEEP_SIZE was set: + */ + if (end >= inode->v.i_size && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_ZERO_RANGE))) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); - mutex_unlock(&inode->ei_update_lock); - } + do { + bch2_trans_begin(&trans); + inode_iter = bch2_inode_peek(&trans, &inode_u, + inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(inode_iter); + } while (ret == -EINTR); + + bch2_trans_unlock(&trans); - /* blech */ - if ((mode & FALLOC_FL_KEEP_SIZE) && - (mode & FALLOC_FL_ZERO_RANGE) && - inode->ei_inode.bi_size != inode->v.i_size) { - /* sync appends.. */ - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); if (ret) goto err; - if (inode->ei_inode.bi_size != inode->v.i_size) { - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, - inode->v.i_size, 0); - mutex_unlock(&inode->ei_update_lock); - } + /* + * Sync existing appends before extending i_size, + * as in bch2_extend(): + */ + ret = filemap_write_and_wait_range(mapping, + inode_u.bi_size, S64_MAX); + if (ret) + goto err; + + if (mode & FALLOC_FL_KEEP_SIZE) + end = inode->v.i_size; + else + i_size_write(&inode->v, end); + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, end, 0); + mutex_unlock(&inode->ei_update_lock); } err: bch2_trans_exit(&trans); @@ -2963,16 +2728,16 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_inode_info *inode = file_bch_inode(file); if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bch2_fallocate(inode, mode, offset, len); + return bchfs_fallocate(inode, mode, offset, len); if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bch2_fpunch(inode, offset, len); + return bchfs_fpunch(inode, offset, len); if (mode == FALLOC_FL_INSERT_RANGE) - return bch2_fcollapse_finsert(inode, offset, len, true); + return bchfs_fcollapse_finsert(inode, offset, len, true); if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch2_fcollapse_finsert(inode, offset, len, false); + return bchfs_fcollapse_finsert(inode, offset, len, false); return -EOPNOTSUPP; } @@ -3001,9 +2766,12 @@ static void mark_range_unallocated(struct bch_inode_info *inode, lock_page(page); s = bch2_page_state(page); - if (s) + if (s) { + spin_lock(&s->lock); for (j = 0; j < PAGE_SECTORS; j++) s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } unlock_page(page); } @@ -3018,6 +2786,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *src = 
file_bch_inode(file_src); struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; + s64 i_sectors_delta = 0; loff_t ret = 0; loff_t aligned_len; @@ -3037,6 +2806,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_lock_inodes(INODE_LOCK, src, dst); + file_update_time(file_dst); + inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -3047,26 +2818,40 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, file_dst, pos_dst, &len, remap_flags); if (ret < 0 || len == 0) - goto out_unlock; + goto err; aligned_len = round_up(len, block_bytes(c)); ret = write_invalidate_inode_pages_range(dst->v.i_mapping, pos_dst, pos_dst + aligned_len); if (ret) - goto out_unlock; + goto err; mark_range_unallocated(src, pos_src, pos_src + aligned_len); - ret = bch2_remap_range(c, dst, + ret = bch2_remap_range(c, POS(dst->v.i_ino, pos_dst >> 9), POS(src->v.i_ino, pos_src >> 9), aligned_len >> 9, - pos_dst + len); - if (ret > 0) - ret = min(ret << 9, len); + &dst->ei_journal_seq, + pos_dst + len, &i_sectors_delta); + if (ret < 0) + goto err; -out_unlock: + ret <<= 9; + /* + * due to alignment, we might have remapped slightly more than requested + */ + ret = min(ret, len); + + /* XXX get a quota reservation */ + i_sectors_acct(c, dst, NULL, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + len > dst->v.i_size) + i_size_write(&dst->v, pos_dst + len); + spin_unlock(&dst->v.i_lock); +err: __pagecache_block_put(&dst->v.i_mapping->add_lock); __pagecache_block_put(&src->v.i_mapping->add_lock); @@ -3299,13 +3084,13 @@ int bch2_fs_fsio_init(struct bch_fs *c) pr_verbose_init(c->opts, ""); if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.op.wbio.bio), + 4, offsetof(struct bch_writepage_io, op.wbio.bio), BIOSET_NEED_BVECS) || bioset_init(&c->dio_read_bioset, 4, offsetof(struct dio_read, rbio.bio), BIOSET_NEED_BVECS) || bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, iop.op.wbio.bio), + 4, offsetof(struct dio_write, op.wbio.bio), BIOSET_NEED_BVECS)) ret = -ENOMEM; diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index a3573232..ae171a29 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -11,16 +11,6 @@ struct quota_res; -int bch2_extent_update(struct btree_trans *, - struct bch_inode_info *, - struct disk_reservation *, - struct quota_res *, - struct btree_iter *, - struct bkey_i *, - u64, bool, bool, s64 *); -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_inode_info *, u64); - int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, loff_t, unsigned); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 1a52a750..7209db7b 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -49,34 +49,6 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } -/* - * I_SIZE_DIRTY requires special handling: - * - * To the recovery code, the flag means that there is stale data past i_size - * that needs to be deleted; it's used for implementing atomic appends and - * truncates. - * - * On append, we set I_SIZE_DIRTY before doing the write, then after the write - * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size - * that exposes the data we just wrote.
- * - * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting - * i_size to the new smaller size, then we delete the data that we just made - * invisible, and then we clear I_SIZE_DIRTY. - * - * Because there can be multiple appends in flight at a time, we need a refcount - * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero - * refcount means I_SIZE_DIRTY is set, zero means it's cleared. - * - * Because write_inode() can be called at any time, i_size_dirty_count means - * something different to the runtime code - it means to write_inode() "don't - * update i_size yet". - * - * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when - * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must - * be set explicitly. - */ - void bch2_inode_update_after_write(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index b32c0a47..bb759a46 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -111,6 +111,15 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +static inline struct bch_io_opts +io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); + + bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); + return opts; +} + static inline u8 mode_to_type(umode_t mode) { return (mode >> 12) & 15; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index c6724a2f..79003dff 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -19,6 +19,7 @@ #include "ec.h" #include "error.h" #include "extents.h" +#include "inode.h" #include "io.h" #include "journal.h" #include "keylist.h" @@ -168,6 +169,258 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, mutex_unlock(&c->bio_bounce_pages_lock); } +/* Extent update path: */ + +static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool may_allocate, + bool *maybe_extending, + s64 *delta) +{ + struct btree_iter *iter; + struct bkey_s_c old; + int ret = 0; + + *maybe_extending = true; + *delta = 0; + + iter = bch2_trans_copy_iter(trans, extent_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + if (!may_allocate && + bch2_bkey_nr_ptrs_allocated(old) < + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) { + ret = -ENOSPC; + break; + } + + *delta += (min(new->k.p.offset, + old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k))) * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* + * Check if there's already data above where we're + * going to be writing to - this means we're definitely + * not extending the file: + * + * Note that it's not sufficient to check if there's + * data up to the sector offset we're going to be + * writing to, because i_size could be up to one block + * less: + */ + if (!bkey_cmp(old.k->p, new->k.p)) + old = bch2_btree_iter_next(iter); + + if (old.k && !bkey_err(old) && + old.k->p.inode == extent_iter->pos.inode && + bkey_extent_is_data(old.k)) + *maybe_extending = false; + + break; + } + } + + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, + u64 new_i_size, + s64 *i_sectors_delta) 
+{ + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; + bool extending = false; + s64 delta = 0; + int ret; + + ret = bch2_extent_trim_atomic(k, iter); + if (ret) + return ret; + + ret = sum_sector_overwrites(trans, iter, k, + disk_res && disk_res->sectors != 0, + &extending, &delta); + if (ret) + return ret; + + new_i_size = extending + ? min(k->k.p.offset << 9, new_i_size) + : 0; + + if (delta || new_i_size) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; + + inode_iter = bch2_inode_peek(trans, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + /* + * XXX: + * writeback can race a bit with truncate, because truncate + * first updates the inode then truncates the pagecache. This is + * ugly, but lets us preserve the invariant that the in memory + * i_size is always >= the on disk i_size. + * + BUG_ON(new_i_size > inode_u.bi_size && + (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); + */ + BUG_ON(new_i_size > inode_u.bi_size && !extending); + + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; + else + new_i_size = 0; + + inode_u.bi_sectors += delta; + + if (delta || new_i_size) { + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i); + } + + bch2_trans_iter_put(trans, inode_iter); + } + + bch2_trans_update(trans, iter, k); + + ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE); + if (!ret && i_sectors_delta) + *i_sectors_delta += delta; + + return ret; +} + +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u64 *journal_seq, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + + while ((k = bch2_btree_iter_peek(iter)).k && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto btree_err; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + + bch2_trans_begin_updates(trans); + + ret = bch2_extent_update(trans, iter, &delete, + &disk_res, journal_seq, + 0, i_sectors_delta); + bch2_disk_reservation_put(c, &disk_res); +btree_err: + if (ret == -EINTR) { + ret2 = ret; + ret = 0; + } + if (ret) + break; + } + + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + journal_seq, i_sectors_delta); + bch2_trans_exit(&trans); + + if (ret == -EINTR) + ret = 0; + + return ret; +} + +int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_trans_init(&trans, c, 
BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { + BKEY_PADDED(k) tmp; + + bkey_copy(&tmp.k, bch2_keylist_front(keys)); + + bch2_trans_begin_updates(&trans); + + ret = bch2_extent_update(&trans, iter, &tmp.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta); + if (ret == -EINTR) + continue; + if (ret) + break; + + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); + + return ret; +} + /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -246,58 +499,6 @@ static void bch2_write_done(struct closure *cl) closure_return(cl); } -int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter *iter; - struct keylist *keys = &op->insert_keys; - int ret; - - BUG_ON(bch2_keylist_empty(keys)); - bch2_verify_keylist_sorted(keys); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); - - do { - BKEY_PADDED(k) split; - - bkey_copy(&split.k, bch2_keylist_front(keys)); - - ret = bch2_extent_trim_atomic(&split.k, iter); - if (ret) - break; - - bch2_trans_update(&trans, iter, &split.k); - - ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (ret) - break; - - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else - bch2_keylist_pop_front(keys); - } while (!bch2_keylist_empty(keys)); - - if (ret == -EINTR) { - ret = 0; - goto retry; - } - - bch2_trans_exit(&trans); - - return ret; -} - /** * bch_write_index - after a write, update index to point to new data */ diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 80b72dbf..a72c7ccd 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -54,6 +54,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) : op->c->wq; } +int bch2_extent_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct disk_reservation *, + u64 *, u64, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *, s64 *); +int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + int bch2_write_index_default(struct bch_write_op *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, @@ -78,6 +85,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->write_point = (struct write_point_specifier) { 0 }; op->res = (struct disk_reservation) { 0 }; op->journal_seq = 0; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; op->index_update_fn = bch2_write_index_default; } diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 2d397e5e..c2c2cce0 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -132,6 +132,8 @@ struct bch_write_op { u64 *journal_seq_p; u64 journal_seq; }; + u64 new_i_size; + s64 i_sectors_delta; int (*index_update_fn)(struct bch_write_op *); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index e6015bc1..095eef38 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -272,6 +272,8 @@ 
retry: if (ret) goto err; + atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); + split_iter = bch2_trans_copy_iter(&trans, iter); ret = PTR_ERR_OR_ZERO(split_iter); if (ret) @@ -282,10 +284,6 @@ retry: if (ret) goto err; - ret = bch2_extent_atomic_end(split_iter, k, &atomic_end); - if (ret) - goto err; - if (!remark && remark_if_split && bkey_cmp(atomic_end, k->k.p) < 0) { diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index ad526d28..4a4b17f9 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -2,8 +2,8 @@ #include "bcachefs.h" #include "btree_update.h" #include "extents.h" -#include "fs.h" -#include "fs-io.h" +#include "inode.h" +#include "io.h" #include "reflink.h" #include <linux/sched/signal.h> @@ -70,12 +70,6 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -/* - * bch2_remap_range() depends on bch2_extent_update(), which depends on various - * things tied to the linux vfs for inode updates, for now: - */ -#ifndef NO_BCACHEFS_FS - static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i_extent *e) @@ -144,26 +138,24 @@ err: static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) { struct bkey_s_c k = bch2_btree_iter_peek(iter); + int ret; - while (1) { - if (bkey_err(k)) - return k; - + for_each_btree_key_continue(iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) return bkey_s_c_null; if (k.k->type == KEY_TYPE_extent || k.k->type == KEY_TYPE_reflink_p) - return k; - - k = bch2_btree_iter_next(iter); + break; } + + return k; } s64 bch2_remap_range(struct bch_fs *c, - struct bch_inode_info *dst_inode, struct bpos dst_start, struct bpos src_start, - u64 remap_sectors, u64 new_i_size) + u64 remap_sectors, u64 *journal_seq, + u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; @@ -172,7 +164,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; - int ret = 0; + int ret = 0, ret2 = 0; if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { mutex_lock(&c->sb_lock); @@ -215,7 +207,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ret = bch2_fpunch_at(&trans, dst_iter, dst_want, - dst_inode, new_i_size); + journal_seq, i_sectors_delta); if (ret) goto btree_err; continue; @@ -261,9 +253,9 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, - dst_iter, &new_dst.k, - new_i_size, false, true, NULL); + ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, + NULL, journal_seq, + new_i_size, i_sectors_delta); if (ret) goto btree_err; @@ -284,17 +276,24 @@ err: dst_done = dst_iter->pos.offset - dst_start.offset; new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + bch2_trans_begin(&trans); + + do { + struct bch_inode_unpacked inode_u; + struct btree_iter *inode_iter; + + inode_iter = bch2_inode_peek(&trans, &inode_u, + dst_start.inode, BTREE_ITER_INTENT); + ret2 = PTR_ERR_OR_ZERO(inode_iter); + + if (!ret2 && + inode_u.bi_size < new_i_size) + ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, journal_seq, + BTREE_INSERT_ATOMIC); + } while (ret2 == -EINTR); + ret = bch2_trans_exit(&trans) ?: ret; - mutex_lock(&dst_inode->ei_update_lock); - if (dst_inode->v.i_size < new_i_size) { -
i_size_write(&dst_inode->v, new_i_size); - ret = bch2_write_inode_size(c, dst_inode, new_i_size, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&dst_inode->ei_update_lock); - - return dst_done ?: ret; + return dst_done ?: ret ?: ret2; } - -#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index 327618c3..ac23b855 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -24,9 +24,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_reflink_v_to_text, \ } -#ifndef NO_BCACHEFS_FS -s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, - struct bpos, struct bpos, u64, u64); -#endif /* NO_BCACHEFS_FS */ +s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, + u64, u64 *, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index bb9da2bb..cb5ebb87 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -84,10 +84,8 @@ static void extent_to_replicas(struct bkey_s_c k, if (p.ptr.cached) continue; - if (p.ec_nr) { + if (p.has_ec) r->nr_required = 0; - break; - } r->devs[r->nr_devs++] = p.ptr.dev; } diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 3043def8..550a140d 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1030,9 +1030,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, - sizeof(u64)) - 1; + int u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, + sizeof(u64)) - 1; + memset(u, 0, u64s * sizeof(u64)); + u->entry.u64s = u64s; u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); memcpy(&u->r, e, replicas_entry_bytes(e));
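
The sketches below model, in freestanding C, a few of the mechanisms this patch leans on; every name and constant in them is an illustrative stand-in rather than the bcachefs API, except where the comments quote identifiers from the hunks above.

First, the i_sectors accounting now done inside bch2_extent_update() by sum_sector_overwrites(): every key the new extent overlaps contributes the overlap length times the change in allocated-ness, so overwriting data with data is net zero while filling a hole adds sectors. A minimal sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for an extent: [start, end) in 512-byte sectors. */
struct ext {
	uint64_t	start;
	uint64_t	end;
	int		allocated;	/* 0 for a hole/reservation, 1 for real data */
};

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/*
 * Mirrors the accumulation in sum_sector_overwrites(): each overlapped
 * region contributes its length times the change in allocated-ness, so
 * overwriting data with data contributes 0, and writing data over a
 * hole contributes +length:
 */
static int64_t overlap_delta(const struct ext *new, const struct ext *old)
{
	uint64_t o_start = max_u64(new->start, old->start);
	uint64_t o_end = min_u64(new->end, old->end);

	if (o_end <= o_start)
		return 0;

	return (int64_t) (o_end - o_start) * (new->allocated - old->allocated);
}

int main(void)
{
	struct ext new = { 0, 16, 1 };	/* write 16 sectors of data */
	struct ext old1 = { 0, 8, 1 };	/* first 8 were already allocated */
	struct ext old2 = { 8, 16, 0 };	/* last 8 were a hole */

	int64_t delta = overlap_delta(&new, &old1) + overlap_delta(&new, &old2);

	/* 0 for the overwritten half, +8 for the hole: i_sectors grows by 8 */
	printf("i_sectors_delta = %lld\n", (long long) delta);
	return 0;
}

This is what lets callers such as bchfs_fallocate() and bch2_remap_file_range() apply quota afterwards via i_sectors_acct(): the transaction hands back a signed sector delta instead of reaching into the vfs inode itself.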
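
Second, the maybe_extending logic in sum_sector_overwrites(). As the comment in that hunk notes, data reaching the write's end position does not prove the write can't extend i_size, because extents end on block boundaries while i_size is byte-granular and can sit up to a block lower; only data strictly past the write's end rules extension out. A toy predicate with assumed numbers (4096-byte blocks, 512-byte sectors):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only. With 4096-byte blocks (8 sectors), an extent ending
 * at sector 8 is consistent with any i_size in (0, 4096], so a write whose
 * extent also ends at sector 8 may still raise i_size. Only data strictly
 * past the write's end proves the file can't be getting extended:
 */
static int write_may_extend(uint64_t write_end_sector,
			    uint64_t next_data_end_sector)
{
	return next_data_end_sector <= write_end_sector;
}

int main(void)
{
	uint64_t i_size = 3500;	/* bytes: inside, not at, the first block */

	printf("i_size %llu, data to sector 8, write to sector 8: may extend = %d\n",
	       (unsigned long long) i_size, write_may_extend(8, 8));
	printf("data past the write, to sector 16: may extend = %d\n",
	       write_may_extend(8, 16));
	return 0;
}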
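
Third, the hole-punching loop: bch2_fpunch_at() sizes each whole-range delete key as KEY_SIZE_MAX & (~0 << c->block_bits), the largest possible key size rounded down to block alignment. A one-shot demo of that mask, with an assumed KEY_SIZE_MAX:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed stand-in values; the real KEY_SIZE_MAX differs: */
	uint32_t key_size_max = (1U << 26) - 1;
	unsigned block_bits = 3;	/* 4096-byte blocks = 2^3 sectors */

	/* same shape as KEY_SIZE_MAX & (~0 << c->block_bits) in bch2_fpunch_at() */
	uint32_t max_sectors = key_size_max & (~0U << block_bits);

	printf("largest block-aligned delete: %u sectors (multiple of %u)\n",
	       max_sectors, 1U << block_bits);
	return 0;
}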
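
Fourth, the transaction restart idiom that recurs throughout the patch (the bch2_inode_peek() loop in bchfs_fallocate(), the trailing inode update in bch2_remap_range(), the continue on -EINTR in bch2_write_index_default()): -EINTR from a commit means locks were dropped, so everything from bch2_trans_begin() onward has to be redone. A minimal sketch of the loop's shape, with a stub standing in for bch2_trans_commit():

#include <errno.h>
#include <stdio.h>

/* Stand-ins for the btree transaction API; purely illustrative. */
struct trans { int commits; };

static void trans_begin(struct trans *t)
{
	(void) t;	/* reset per-iteration state here */
}

/* Pretend the first two commits raced with other lock holders: */
static int trans_commit(struct trans *t)
{
	return t->commits++ < 2 ? -EINTR : 0;
}

static int update_inode(struct trans *t)
{
	int ret;

	do {
		trans_begin(t);
		/*
		 * Lookups and updates must be redone on each pass:
		 * -EINTR means locks were dropped, so anything read
		 * before the restart may be stale.
		 */
		ret = trans_commit(t);
	} while (ret == -EINTR);

	return ret;
}

int main(void)
{
	struct trans t = { 0 };
	int ret = update_inode(&t);

	printf("committed on attempt %d (ret %d)\n", t.commits, ret);
	return 0;
}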
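
Last, the super-io.c fix: a jset_entry_data_usage entry's length depends on nr_devs, so the old memset(u, 0, sizeof(*u)) zeroed a fixed amount no matter how many device bytes followed; the fixed code computes the entry's u64s first and sizes the memset from that. A small demo of the arithmetic, using a simplified layout that only approximates the real structs:

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Simplified stand-in for the fixed part of jset_entry_data_usage: */
struct data_usage {
	uint64_t	header;		/* entry u64s/type live here in the real struct */
	uint64_t	v;
	uint8_t		devs[];		/* nr_devs trailing bytes, variable */
};

int main(void)
{
	unsigned nr_devs = 9;

	/*
	 * As in the patch: total size is the fixed part plus nr_devs
	 * trailing bytes, rounded up to whole u64s; the stored u64s
	 * field doesn't count the header word, hence the "- 1":
	 */
	unsigned u64s = DIV_ROUND_UP(sizeof(struct data_usage) + nr_devs,
				     sizeof(uint64_t)) - 1;

	printf("fixed part %zu bytes + %u dev bytes -> u64s = %u\n",
	       sizeof(struct data_usage), nr_devs, u64s);
	printf("old code zeroed %zu bytes, new code zeroes %zu\n",
	       sizeof(struct data_usage), u64s * sizeof(uint64_t));
	return 0;
}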