diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8f42b139..2cf80311 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-00104032654027a8f4406a82d28911b243f19d94
+e1d0fb8c5fbc70df1007ebf5d9ab03018dc05275
diff --git a/include/linux/list.h b/include/linux/list.h
index 4a317090..3639dc99 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -26,7 +26,6 @@
 #define list_for_each_entry(p, h, m)		cds_list_for_each_entry(p, h, m)
 #define list_for_each_entry_reverse(p, h, m)	cds_list_for_each_entry_reverse(p, h, m)
 #define list_for_each_entry_safe(p, n, h, m)	cds_list_for_each_entry_safe(p, n, h, m)
-#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m)
 
 static inline int list_empty_careful(const struct list_head *head)
 {
@@ -54,6 +53,15 @@ static inline void list_splice_init(struct list_head *list,
 #define list_first_entry_or_null(ptr, type, member) \
 	(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
 
+#define list_prev_entry(pos, member) \
+	list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_last_entry(head, typeof(*pos), member),		\
+		n = list_prev_entry(pos, member);			\
+	     &pos->member != (head);					\
+	     pos = n, n = list_prev_entry(n, member))
+
 /* hlists: */
 
 #include
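The wrapper onto cds_list_for_each_entry_safe_reverse is dropped above in favour of an open-coded macro. A minimal usage sketch, not part of the patch (the struct and helper are hypothetical): walking a list backwards while freeing entries, which is exactly what the journal-read code below needs.

	struct item {
		struct list_head	list;
		int			key;
	};

	static void drop_all_reverse(struct list_head *head)
	{
		struct item *i, *n;

		/* n trails one entry behind i, so deleting i is safe: */
		list_for_each_entry_safe_reverse(i, n, head, list) {
			list_del(&i->list);
			kfree(i);
		}
	}
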
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 6d54defc..eb5b4080 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -214,9 +214,11 @@
 	dynamic_fault("bcachefs:meta:write:" name)
 
 #ifdef __KERNEL__
-#define bch2_fmt(_c, fmt)	"bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt(_c, fmt)		"bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt_inum(_c, _inum, fmt)	"bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
 #else
-#define bch2_fmt(_c, fmt)	fmt "\n"
+#define bch2_fmt(_c, fmt)		fmt "\n"
+#define bch2_fmt_inum(_c, _inum, fmt)	"inum %llu: " fmt "\n", (_inum)
 #endif
 
 #define bch_info(c, fmt, ...) \
@@ -229,8 +231,11 @@
 	printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
 	printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
 #define bch_err_ratelimited(c, fmt, ...) \
 	printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
 
 #define bch_verbose(c, fmt, ...)				\
 do {								\
@@ -668,6 +673,7 @@ struct bch_fs {
 	unsigned		bucket_size_max;
 
 	atomic64_t		sectors_available;
+	struct mutex		sectors_available_lock;
 
 	struct bch_fs_pcpu __percpu	*pcpu;
 
@@ -675,7 +681,7 @@ struct bch_fs {
 	seqcount_t			usage_lock;
 	struct bch_fs_usage		*usage_base;
-	struct bch_fs_usage __percpu	*usage[2];
+	struct bch_fs_usage __percpu	*usage[JOURNAL_BUF_NR];
 	struct bch_fs_usage __percpu	*usage_gc;
 
 	/* single element mempool: */
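A quick usage sketch of the new inum-tagged logging variant added above (values hypothetical; -EIO prints as -5):

	/* Emits, ratelimited: "bcachefs (<fs name> inum 4096): read error -5" */
	bch_err_inum_ratelimited(c, 4096ULL, "read error %i", -EIO);
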
"before write " : ""); + btree_pos_to_text(out, c, b); - pr_buf(out, " node offset %u", b->written); + pr_buf(out, "\n node offset %u", b->written); if (i) pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } @@ -1104,6 +1111,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; + char buf[200]; + struct printbuf out; bool can_retry; goto start; @@ -1123,8 +1132,10 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", - bch2_blk_status_to_str(bio->bi_status)); + out = PBUF(buf); + btree_pos_to_text(&out, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1408,7 +1419,7 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); @@ -1488,6 +1499,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!btree_node_may_write(b)) return; + if (old & (1 << BTREE_NODE_never_write)) + return; + if (old & (1 << BTREE_NODE_write_in_flight)) { btree_node_wait_on_io(b); continue; @@ -1534,6 +1548,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } + BUG_ON(b->written && !seq); + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ bytes += 8; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 15af60e9..dc7de271 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -415,6 +415,7 @@ enum btree_flags { BTREE_NODE_fake, BTREE_NODE_old_extent_overwrite, BTREE_NODE_need_rewrite, + BTREE_NODE_never_write, }; BTREE_FLAG(read_in_flight); @@ -429,6 +430,7 @@ BTREE_FLAG(dying); BTREE_FLAG(fake); BTREE_FLAG(old_extent_overwrite); BTREE_FLAG(need_rewrite); +BTREE_FLAG(never_write); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index edc11c22..4a169d36 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -603,17 +603,30 @@ err: list_del(&as->write_blocked_list); - if (!ret && as->b == b) { + /* + * Node might have been freed, recheck under + * btree_interior_update_lock: + */ + if (as->b == b) { struct bset *i = btree_bset_last(b); BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); - i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); + if (!ret) { + i->journal_seq = cpu_to_le64( + max(journal_seq, + le64_to_cpu(i->journal_seq))); - bch2_btree_add_journal_pin(c, b, journal_seq); + bch2_btree_add_journal_pin(c, b, journal_seq); + } else { + /* + * If we didn't get a journal sequence number we + * can't write this btree node, because recovery + * won't know to ignore this write: + */ + set_btree_node_never_write(b); + } } mutex_unlock(&c->btree_interior_update_lock); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 49685523..e7816afe 100644 --- 
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 49685523..e7816afe 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -649,13 +649,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
 	case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
 		bch2_trans_unlock(trans);
 
-		while (bch2_btree_key_cache_must_wait(c)) {
+		do {
 			mutex_lock(&c->journal.reclaim_lock);
-			bch2_journal_reclaim(&c->journal);
+			ret = bch2_journal_reclaim(&c->journal);
 			mutex_unlock(&c->journal.reclaim_lock);
-		}
+		} while (!ret && bch2_btree_key_cache_must_wait(c));
 
-		if (bch2_trans_relock(trans))
+		if (!ret && bch2_trans_relock(trans))
 			return 0;
 
 		trace_trans_restart_journal_reclaim(trans->ip);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 1b1200c5..0000fc76 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -142,8 +142,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
 	percpu_down_write(&c->mark_lock);
 	usage = c->usage_base;
 
-	bch2_fs_usage_acc_to_base(c, 0);
-	bch2_fs_usage_acc_to_base(c, 1);
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+		bch2_fs_usage_acc_to_base(c, i);
 
 	for (i = 0; i < BCH_REPLICAS_MAX; i++)
 		usage->reserved += usage->persistent_reserved[i];
@@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
 {
 	return this_cpu_ptr(gc
 			    ? c->usage_gc
-			    : c->usage[journal_seq & 1]);
+			    : c->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 {
 	ssize_t offset = v - (u64 *) c->usage_base;
-	unsigned seq;
+	unsigned i, seq;
 	u64 ret;
 
 	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
@@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 
 	do {
 		seq = read_seqcount_begin(&c->usage_lock);
-		ret = *v +
-			percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
-			percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+		ret = *v;
+
+		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+			ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
 	} while (read_seqcount_retry(&c->usage_lock, seq));
 
 	return ret;
@@ -232,7 +233,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
 {
 	struct bch_fs_usage *ret;
-	unsigned seq, v, u64s = fs_usage_u64s(c);
+	unsigned seq, i, v, u64s = fs_usage_u64s(c);
retry:
 	ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
 	if (unlikely(!ret))
@@ -251,8 +252,8 @@ retry:
 	do {
 		seq = read_seqcount_begin(&c->usage_lock);
 		memcpy(ret, c->usage_base, u64s * sizeof(u64));
-		acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
-		acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+			acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s);
 	} while (read_seqcount_retry(&c->usage_lock, seq));
 
 	return ret;
@@ -262,7 +263,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 {
 	unsigned u64s = fs_usage_u64s(c);
 
-	BUG_ON(idx >= 2);
+	BUG_ON(idx >= ARRAY_SIZE(c->usage));
 
 	preempt_disable();
 	write_seqcount_begin(&c->usage_lock);
@@ -2031,13 +2032,6 @@ int bch2_trans_mark_update(struct btree_trans *trans,
 
 /* Disk reservations: */
 
-static u64 bch2_recalc_sectors_available(struct bch_fs *c)
-{
-	percpu_u64_set(&c->pcpu->sectors_available, 0);
-
-	return avail_factor(__bch2_fs_usage_read_short(c).free);
-}
-
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
 	percpu_down_read(&c->mark_lock);
@@ -2072,7 +2066,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
 		if (get < sectors) {
 			preempt_enable();
-			percpu_up_read(&c->mark_lock);
 			goto recalculate;
 		}
 	} while ((v = atomic64_cmpxchg(&c->sectors_available,
@@ -2090,9 +2083,10 @@ out:
 	return 0;
 
 recalculate:
-	percpu_down_write(&c->mark_lock);
+	mutex_lock(&c->sectors_available_lock);
 
-	sectors_available = bch2_recalc_sectors_available(c);
+	percpu_u64_set(&c->pcpu->sectors_available, 0);
+	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
 
 	if (sectors <= sectors_available ||
 	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -2106,7 +2100,8 @@ recalculate:
 		ret = -ENOSPC;
 	}
 
-	percpu_up_write(&c->mark_lock);
+	mutex_unlock(&c->sectors_available_lock);
+	percpu_up_read(&c->mark_lock);
 
 	return ret;
 }
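The buckets.c hunks above move the slow-path recalculation from a mark_lock write lock to the new sectors_available_lock mutex, so callers never drop and re-take mark_lock. A caller-side sketch (the wrapper is hypothetical; bch2_disk_reservation_put() is assumed from buckets.h):

	static int example_reserve_and_write(struct bch_fs *c, unsigned sectors)
	{
		struct disk_reservation res = { 0 };
		int ret;

		/*
		 * Fast path carves sectors out of the shared atomic
		 * c->sectors_available; on exhaustion the fallback now
		 * serializes on c->sectors_available_lock while holding
		 * mark_lock only for read:
		 */
		ret = bch2_disk_reservation_add(c, &res, sectors, 0);
		if (ret)
			return ret;

		/* ... issue the write ... */

		bch2_disk_reservation_put(c, &res);
		return 0;
	}
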
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index d7ba0e7f..c409a426 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
 					len << 9);
 
 			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
-				__bcache_io_error(c,
+				bch_err_ratelimited(c,
 					"checksum error while doing reconstruct read (%u:%u)",
 					i, j);
 				clear_bit(i, buf->valid);
@@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
 	unsigned bytes = buf->size << 9;
 
 	if (ec_nr_failed(buf) > v->nr_redundant) {
-		__bcache_io_error(c,
+		bch_err_ratelimited(c,
 			"error doing reconstruct read: unable to read enough blocks");
 		return -1;
 	}
@@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio)
 	struct bch_dev *ca = ec_bio->ca;
 	struct closure *cl = bio->bi_private;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
 			       bio_data_dir(bio) ? "write" : "read",
 			       bch2_blk_status_to_str(bio->bi_status)))
 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
@@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
 			     BTREE_ITER_SLOTS);
 	k = bch2_btree_iter_peek_slot(iter);
 	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
-		__bcache_io_error(c,
+		bch_err_ratelimited(c,
 			"error doing reconstruct read: stripe not found");
 		kfree(buf);
 		return bch2_trans_exit(&trans) ?: -EIO;
@@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
 		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
 		if (ptr_stale(ca, ptr)) {
-			__bcache_io_error(c,
+			bch_err_ratelimited(c,
 				"error doing reconstruct read: stale pointer");
 			clear_bit(i, buf->valid);
 			continue;
@@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
 	closure_sync(&cl);
 
 	if (ec_nr_failed(buf) > v->nr_redundant) {
-		__bcache_io_error(c,
+		bch_err_ratelimited(c,
 			"error doing reconstruct read: unable to read enough blocks");
 		ret = -EIO;
 		goto err;
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 94b53312..0e49fd72 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *);
 /* Logs message and handles the error: */
 #define bch2_dev_io_error(ca, fmt, ...)					\
 do {									\
-	printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,			\
-		"IO error on %s for " fmt),				\
+	printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt,		\
 		(ca)->name, ##__VA_ARGS__);				\
 	bch2_io_error(ca);						\
 } while (0)
 
+#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...)		\
+do {									\
+	printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
+		(ca)->name, (_inum), (_offset), ##__VA_ARGS__);		\
+	bch2_io_error(ca);						\
+} while (0)
+
 #define bch2_dev_io_err_on(cond, ca, ...)				\
 ({									\
 	bool _ret = (cond);						\
@@ -196,16 +202,13 @@ do {									\
 	_ret;								\
 })
 
-/* kill? */
-
-#define __bcache_io_error(c, fmt, ...)					\
-	printk_ratelimited(KERN_ERR bch2_fmt(c,				\
-			"IO error: " fmt), ##__VA_ARGS__)
-
-#define bcache_io_error(c, bio, fmt, ...)				\
-do {									\
-	__bcache_io_error(c, fmt, ##__VA_ARGS__);			\
-	(bio)->bi_status = BLK_STS_IOERR;				\
-} while (0)
+#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...)		\
+({									\
+	bool _ret = (cond);						\
+									\
+	if (_ret)							\
+		bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
+	_ret;								\
+})
 
 #endif /* _BCACHEFS_ERROR_H */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 8e6f7300..8170e93c 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -856,7 +856,9 @@ retry:
 		goto retry;
 
 	if (ret) {
-		bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+		bch_err_inum_ratelimited(c, inum,
+				"read error %i from btree lookup", ret);
+		rbio->bio.bi_status = BLK_STS_IOERR;
 		bio_endio(&rbio->bio);
 	}
 
@@ -1013,6 +1015,8 @@ static void bch2_writepage_io_done(struct closure *cl)
 	unsigned i;
 
 	if (io->op.error) {
+		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
 		bio_for_each_segment_all(bvec, bio, iter) {
 			struct bch_page_state *s;
 
@@ -1902,7 +1906,13 @@ loop:
 		bio_for_each_segment_all(bv, bio, iter)
 			put_page(bv->bv_page);
-		if (!dio->iter.count || dio->op.error)
+
+		if (dio->op.error) {
+			set_bit(EI_INODE_ERROR, &inode->ei_flags);
+			break;
+		}
+
+		if (!dio->iter.count)
 			break;
 
 		bio_reset(bio);
@@ -2290,7 +2300,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 	if (ret)
 		goto err;
 
-	BUG_ON(inode->v.i_size < inode_u.bi_size);
+	WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+		inode->v.i_size < inode_u.bi_size);
 
 	if (iattr->ia_size > inode->v.i_size) {
 		ret = bch2_extend(inode, &inode_u, iattr);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index f3f6fe6c..e3edca4d 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1151,6 +1151,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
 	inode->v.i_generation	= bi->bi_generation;
 	inode->v.i_size		= bi->bi_size;
 
+	inode->ei_flags		= 0;
 	inode->ei_journal_seq	= 0;
 	inode->ei_quota_reserved = 0;
 	inode->ei_str_hash	= bch2_hash_info_init(c, bi);
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 4ee1ac99..3df85ffb 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -33,6 +33,7 @@ void bch2_pagecache_block_get(struct pagecache_lock *);
 
 struct bch_inode_info {
 	struct inode		v;
+	unsigned long		ei_flags;
 
 	struct mutex		ei_update_lock;
 	u64			ei_journal_seq;
@@ -50,6 +51,12 @@ struct bch_inode_info {
 	struct bch_inode_unpacked ei_inode;
 };
 
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR			0
+
 #define to_bch_ei(_inode)					\
 	container_of_or_null(_inode, struct bch_inode_info, v)
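The EI_INODE_ERROR bit in fs.h above is consumed like any other inode flag; a hypothetical helper, just to illustrate the intended use:

	/*
	 * After a write error the vfs inode and btree inode may legitimately
	 * disagree - which is why bch2_truncate() above downgrades its
	 * BUG_ON() to a WARN_ON() when this bit is set:
	 */
	static bool inode_may_be_inconsistent(struct bch_inode_info *inode)
	{
		return test_bit(EI_INODE_ERROR, &inode->ei_flags);
	}
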
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 46856c66..3489605e 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -576,7 +576,8 @@ static void __bch2_write_index(struct bch_write_op *op)
 		op->written += sectors_start - keylist_sectors(keys);
 
 		if (ret) {
-			__bcache_io_error(c, "btree IO error %i", ret);
+			bch_err_inum_ratelimited(c, op->pos.inode,
+				"write error %i from btree update", ret);
 			op->error = ret;
 		}
 	}
@@ -621,7 +622,10 @@ static void bch2_write_endio(struct bio *bio)
 	struct bch_fs *c		= wbio->c;
 	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+			       op->pos.inode,
+			       op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
+			       "data write error: %s",
 			       bch2_blk_status_to_str(bio->bi_status)))
 		set_bit(wbio->dev, op->failed.d);
 
@@ -1279,15 +1283,14 @@ void bch2_write(struct closure *cl)
 	wbio_init(bio)->put_bio = false;
 
 	if (bio_sectors(bio) & (c->opts.block_size - 1)) {
-		__bcache_io_error(c, "misaligned write");
+		bch_err_inum_ratelimited(c, op->pos.inode,
+					 "misaligned write");
 		op->error = -EIO;
 		goto err;
 	}
 
 	if (c->opts.nochanges ||
 	    !percpu_ref_tryget(&c->writes)) {
-		if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
-			__bcache_io_error(c, "read only");
 		op->error = -EROFS;
 		goto err;
 	}
@@ -1716,7 +1719,8 @@ retry:
 	 * reading a btree node
 	 */
 	BUG_ON(!ret);
-	__bcache_io_error(c, "btree IO error: %i", ret);
+	bch_err_inum_ratelimited(c, inode,
+				 "read error %i from btree lookup", ret);
err:
 	rbio->bio.bi_status = BLK_STS_IOERR;
out:
@@ -1920,17 +1924,15 @@ csum_err:
 		return;
 	}
 
-	bch2_dev_io_error(ca,
-		"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
-		rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+	bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
 		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
 		csum.hi, csum.lo, crc.csum_type);
 	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
 	return;
decompression_err:
-	__bcache_io_error(c, "decompression error, inode %llu offset %llu",
-			  rbio->pos.inode,
-			  (u64) rbio->bvec_iter.bi_sector);
+	bch_err_inum_ratelimited(c, rbio->pos.inode,
+				 "decompression error");
 	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
 	return;
 }
@@ -1952,7 +1954,14 @@ static void bch2_read_endio(struct bio *bio)
 	if (!rbio->split)
 		rbio->bio.bi_end_io = rbio->end_io;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+	/*
+	 * XXX: rbio->pos is not what we want here when reading from indirect
+	 * extents
+	 */
+	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+				    rbio->pos.inode,
+				    rbio->pos.offset,
+				    "data read error: %s",
 			       bch2_blk_status_to_str(bio->bi_status))) {
 		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
 		return;
@@ -2002,7 +2011,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
 	if (k.k->type != KEY_TYPE_reflink_v &&
 	    k.k->type != KEY_TYPE_indirect_inline_data) {
-		__bcache_io_error(trans->c,
+		bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
 			"pointer to nonexistent indirect extent");
 		ret = -EIO;
 		goto err;
@@ -2048,7 +2057,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		goto hole;
 
 	if (pick_ret < 0) {
-		__bcache_io_error(c, "no device to read from");
+		bch_err_inum_ratelimited(c, k.k->p.inode,
+					 "no device to read from");
 		goto err;
 	}
 
@@ -2198,7 +2208,8 @@ get_bio:
 
 	if (!rbio->pick.idx) {
 		if (!rbio->have_ioref) {
-			__bcache_io_error(c, "no device to read from");
+			bch_err_inum_ratelimited(c, k.k->p.inode,
+						 "no device to read from");
 			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
 			goto out;
 		}
@@ -2348,7 +2359,9 @@ err:
 	if (ret == -EINTR)
 		goto retry;
 
-	bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
+	bch_err_inum_ratelimited(c, inode,
+				 "read error %i from btree lookup", ret);
+	rbio->bio.bi_status = BLK_STS_IOERR;
 	bch2_rbio_done(rbio);
 	goto out;
 }
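The journal.c changes below replace the single prev_buf_unwritten bit with a ring of JOURNAL_BUF_NR (= 4) buffers delimited by unwritten_idx and idx, where a buffer's slot is its sequence number masked by JOURNAL_BUF_MASK. A worked example of the new last_unwritten_seq() arithmetic (all values hypothetical):

	/*
	 * journal_cur_seq(j) = 103, so s.idx = 103 & 3 = 3. With
	 * s.unwritten_idx = 1, slots 1 and 2 (seqs 101 and 102) are closed
	 * but unwritten:
	 *
	 *   last_unwritten_seq(j) = 103 - ((3 - 1) & 3) = 101
	 *
	 * Wrap-around relies on the same mask: cur seq 104 gives s.idx = 0,
	 * and with s.unwritten_idx = 3, (0 - 3) & 3 = 1, i.e. only seq 103
	 * is still in flight.
	 */
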
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index dd8db8c0..70152103 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -24,7 +24,7 @@ static u64 last_unwritten_seq(struct journal *j)
 
 	lockdep_assert_held(&j->lock);
 
-	return journal_cur_seq(j) - s.prev_buf_unwritten;
+	return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
 }
 
 static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
@@ -52,7 +52,7 @@ journal_seq_to_buf(struct journal *j, u64 seq)
 		j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
 
 	if (journal_seq_unwritten(j, seq)) {
-		buf = j->buf + (seq & 1);
+		buf = j->buf + (seq & JOURNAL_BUF_MASK);
 		EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
 	}
 	return buf;
@@ -80,6 +80,8 @@ static void bch2_journal_buf_init(struct journal *j)
 	struct journal_buf *buf = journal_cur_buf(j);
 
 	bkey_extent_init(&buf->key);
+	buf->noflush	= false;
+	buf->must_flush	= false;
 
 	memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
@@ -109,15 +111,8 @@ void bch2_journal_halt(struct journal *j)
 
 /* journal entry close/open: */
 
-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+void __bch2_journal_buf_put(struct journal *j)
 {
-	if (!need_write_just_set &&
-	    test_bit(JOURNAL_NEED_WRITE, &j->flags))
-		bch2_time_stats_update(j->delay_time,
-				       j->need_write_time);
-
-	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
 	closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
@@ -130,7 +125,6 @@ static bool __journal_entry_close(struct journal *j)
 	struct journal_buf *buf = journal_cur_buf(j);
 	union journal_res_state old, new;
 	u64 v = atomic64_read(&j->reservations.counter);
-	bool set_need_write = false;
 	unsigned sectors;
 
 	lockdep_assert_held(&j->lock);
@@ -149,15 +143,13 @@ static bool __journal_entry_close(struct journal *j)
 		if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
 			set_bit(JOURNAL_NEED_WRITE, &j->flags);
 			j->need_write_time = local_clock();
-			set_need_write = true;
 		}
 
-		if (new.prev_buf_unwritten)
-			return false;
-
 		new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
 		new.idx++;
-		new.prev_buf_unwritten = 1;
+
+		if (new.idx == new.unwritten_idx)
+			return false;
 
 		BUG_ON(journal_state_count(new, new.idx));
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -191,24 +183,44 @@ static bool __journal_entry_close(struct journal *j)
 	 */
 	buf->data->last_seq	= cpu_to_le64(journal_last_seq(j));
 
+	__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
+
 	journal_pin_new_entry(j, 1);
 
 	bch2_journal_buf_init(j);
 
 	cancel_delayed_work(&j->write_work);
+	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
 
 	bch2_journal_space_available(j);
 
-	bch2_journal_buf_put(j, old.idx, set_need_write);
+	bch2_journal_buf_put(j, old.idx);
 	return true;
 }
 
+static bool journal_entry_want_write(struct journal *j)
+{
+	union journal_res_state s = READ_ONCE(j->reservations);
+	bool ret = false;
+
+	/*
+	 * Don't close it yet if we already have a write in flight, but do set
+	 * NEED_WRITE:
+	 */
+	if (s.idx != s.unwritten_idx)
+		set_bit(JOURNAL_NEED_WRITE, &j->flags);
+	else
+		ret = __journal_entry_close(j);
+
+	return ret;
+}
+
 static bool journal_entry_close(struct journal *j)
 {
 	bool ret;
 
 	spin_lock(&j->lock);
-	ret = __journal_entry_close(j);
+	ret = journal_entry_want_write(j);
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -290,8 +302,8 @@ static int journal_entry_open(struct journal *j)
 
 static bool journal_quiesced(struct journal *j)
 {
-	union journal_res_state state = READ_ONCE(j->reservations);
-	bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
+	union journal_res_state s = READ_ONCE(j->reservations);
+	bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
 
 	if (!ret)
 		journal_entry_close(j);
@@ -318,17 +330,29 @@ static void journal_write_work(struct work_struct *work)
 u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 {
 	size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-	u64 seq = 0;
+	union journal_res_state s;
+	unsigned i;
+	u64 seq;
 
-	if (!test_bit(h, j->buf[0].has_inode) &&
-	    !test_bit(h, j->buf[1].has_inode))
-		return 0;
 
 	spin_lock(&j->lock);
-	if (test_bit(h, journal_cur_buf(j)->has_inode))
-		seq = journal_cur_seq(j);
-	else if (test_bit(h, journal_prev_buf(j)->has_inode))
-		seq = journal_cur_seq(j) - 1;
+	seq = journal_cur_seq(j);
+	s = READ_ONCE(j->reservations);
+	i = s.idx;
+
+	while (1) {
+		if (test_bit(h, j->buf[i].has_inode))
+			goto out;
+
+		if (i == s.unwritten_idx)
+			break;
+
+		i = (i - 1) & JOURNAL_BUF_MASK;
+		seq--;
+	}
+
+	seq = 0;
+out:
 	spin_unlock(&j->lock);
 
 	return seq;
@@ -553,7 +577,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 	struct journal_buf *buf;
 	int ret = 0;
 
-	if (seq <= j->seq_ondisk)
+	if (seq <= j->flushed_seq_ondisk)
 		return 1;
 
 	spin_lock(&j->lock);
@@ -564,18 +588,55 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 		goto out;
 	}
 
-	if (seq <= j->seq_ondisk) {
+	if (seq <= j->flushed_seq_ondisk) {
 		ret = 1;
 		goto out;
 	}
 
-	if (parent &&
-	    (buf = journal_seq_to_buf(j, seq)))
-		if (!closure_wait(&buf->wait, parent))
+	/* if seq was written, but not flushed - flush a newer one instead */
+	seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
+	if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+		struct journal_res res = { 0 };
+
+		spin_unlock(&j->lock);
+
+		ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+		if (ret)
+			return ret;
+
+		seq = res.seq;
+		buf = j->buf + (seq & JOURNAL_BUF_MASK);
+		buf->must_flush = true;
+		set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+		if (parent && !closure_wait(&buf->wait, parent))
 			BUG();
 
+		bch2_journal_res_put(j, &res);
+
+		spin_lock(&j->lock);
+		goto want_write;
+	}
+
+	/*
+	 * if write was kicked off without a flush, flush the next sequence
+	 * number instead
+	 */
+	buf = journal_seq_to_buf(j, seq);
+	if (buf->noflush) {
+		seq++;
+		goto recheck_need_open;
+	}
+
+	buf->must_flush = true;
+
+	if (parent && !closure_wait(&buf->wait, parent))
+		BUG();
+want_write:
 	if (seq == journal_cur_seq(j))
-		__journal_entry_close(j);
+		journal_entry_want_write(j);
out:
 	spin_unlock(&j->lock);
 	return ret;
@@ -864,15 +925,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
 {
 	union journal_res_state state;
-	struct journal_buf *w;
-	bool ret;
+	bool ret = false;
+	unsigned i;
 
 	spin_lock(&j->lock);
 	state = READ_ONCE(j->reservations);
-	w = j->buf + !state.idx;
+	i = state.idx;
 
-	ret = state.prev_buf_unwritten &&
-		bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
+	while (i != state.unwritten_idx) {
+		i = (i - 1) & JOURNAL_BUF_MASK;
+		if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+			ret = true;
+	}
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -955,10 +1019,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 	spin_lock(&j->lock);
 
 	set_bit(JOURNAL_STARTED, &j->flags);
+	j->last_flush_write = jiffies;
 
 	journal_pin_new_entry(j, 1);
 
-	j->reservations.idx = journal_cur_seq(j);
+	j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
 
 	bch2_journal_buf_init(j);
 
@@ -1013,8 +1078,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-	kvpfree(j->buf[1].data, j->buf[1].buf_size);
-	kvpfree(j->buf[0].data, j->buf[0].buf_size);
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+		kvpfree(j->buf[i].data, j->buf[i].buf_size);
 	free_fifo(&j->pin);
 }
 
@@ -1022,6 +1089,7 @@
 int bch2_fs_journal_init(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	static struct lock_class_key res_key;
+	unsigned i;
 	int ret = 0;
 
 	pr_verbose_init(c->opts, "");
@@ -1036,8 +1104,6 @@ int bch2_fs_journal_init(struct journal *j)
 
 	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-	j->buf[0].buf_size	= JOURNAL_ENTRY_SIZE_MIN;
-	j->buf[1].buf_size	= JOURNAL_ENTRY_SIZE_MIN;
 	j->write_delay_ms	= 1000;
 	j->reclaim_delay_ms	= 100;
 
@@ -1049,13 +1115,20 @@ int bch2_fs_journal_init(struct journal *j)
 		((union journal_res_state)
 		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
-	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-	    !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
-	    !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
+	for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+		j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+		j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+		if (!j->buf[i].data) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
 	j->pin.front = j->pin.back = 1;
out:
 	pr_verbose_init(c->opts, "ret %i", ret);
@@ -1069,7 +1142,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	union journal_res_state s;
 	struct bch_dev *ca;
-	unsigned iter;
+	unsigned i;
 
 	rcu_read_lock();
 	spin_lock(&j->lock);
@@ -1081,6 +1154,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	       "last_seq:\t\t%llu\n"
 	       "last_seq_ondisk:\t%llu\n"
 	       "prereserved:\t\t%u/%u\n"
+	       "nr flush writes:\t%llu\n"
+	       "nr noflush writes:\t%llu\n"
 	       "nr direct reclaim:\t%llu\n"
 	       "nr background reclaim:\t%llu\n"
 	       "current entry sectors:\t%u\n"
@@ -1092,6 +1167,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	       j->last_seq_ondisk,
 	       j->prereserved.reserved,
 	       j->prereserved.remaining,
+	       j->nr_flush_writes,
+	       j->nr_noflush_writes,
 	       j->nr_direct_reclaim,
 	       j->nr_background_reclaim,
 	       j->cur_entry_sectors,
@@ -1112,16 +1189,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	}
 
 	pr_buf(out,
-	       "current entry refs:\t%u\n"
-	       "prev entry unwritten:\t",
-	       journal_state_count(s, s.idx));
+	       "current entry:\t\tidx %u refcount %u\n",
+	       s.idx, journal_state_count(s, s.idx));
 
-	if (s.prev_buf_unwritten)
-		pr_buf(out, "yes, ref %u sectors %u\n",
-		       journal_state_count(s, !s.idx),
-		       journal_prev_buf(j)->sectors);
-	else
-		pr_buf(out, "no\n");
+	i = s.idx;
+	while (i != s.unwritten_idx) {
+		i = (i - 1) & JOURNAL_BUF_MASK;
+
+		pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
+		       i, journal_state_count(s, i), j->buf[i].sectors);
+	}
 
 	pr_buf(out,
 	       "need write:\t\t%i\n"
@@ -1129,7 +1206,21 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	       test_bit(JOURNAL_NEED_WRITE,	&j->flags),
 	       test_bit(JOURNAL_REPLAY_DONE,	&j->flags));
 
-	for_each_member_device_rcu(ca, c, iter,
+	pr_buf(out, "space:\n");
+	pr_buf(out, "\tdiscarded\t%u:%u\n",
+	       j->space[journal_space_discarded].next_entry,
+	       j->space[journal_space_discarded].total);
+	pr_buf(out, "\tclean ondisk\t%u:%u\n",
+	       j->space[journal_space_clean_ondisk].next_entry,
+	       j->space[journal_space_clean_ondisk].total);
+	pr_buf(out, "\tclean\t\t%u:%u\n",
+	       j->space[journal_space_clean].next_entry,
+	       j->space[journal_space_clean].total);
+	pr_buf(out, "\ttotal\t\t%u:%u\n",
+	       j->space[journal_space_total].next_entry,
+	       j->space[journal_space_total].total);
+
+	for_each_member_device_rcu(ca, c, i,
 				   &c->rw_devs[BCH_DATA_journal]) {
 		struct journal_device *ja = &ca->journal;
 
@@ -1139,12 +1230,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 		pr_buf(out,
 		       "dev %u:\n"
 		       "\tnr\t\t%u\n"
+		       "\tbucket size\t%u\n"
 		       "\tavailable\t%u:%u\n"
-		       "\tdiscard_idx\t\t%u\n"
-		       "\tdirty_idx_ondisk\t%u (seq %llu)\n"
-		       "\tdirty_idx\t\t%u (seq %llu)\n"
+		       "\tdiscard_idx\t%u\n"
+		       "\tdirty_ondisk\t%u (seq %llu)\n"
+		       "\tdirty_idx\t%u (seq %llu)\n"
 		       "\tcur_idx\t\t%u (seq %llu)\n",
-		       iter, ja->nr,
+		       i, ja->nr, ca->mi.bucket_size,
 		       bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
 		       ja->sectors_free,
 		       ja->discard_idx,
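journal.h below keeps one 10-bit refcount per buffer, and a write may only be kicked off from bch2_journal_buf_put() for the oldest unwritten buffer. A sketch of the ordering this preserves (state values hypothetical):

	/*
	 * With s.idx = 2 and s.unwritten_idx = 0, slots 0 and 1 are closed
	 * but unwritten. Dropping the last reference on slot 1 does nothing
	 * yet, because it is not the unwritten_idx slot; only slot 0 may be
	 * written. When slot 0 completes, journal_write_done() advances
	 * unwritten_idx and, if slot 1's refcount is already zero, starts
	 * slot 1's write immediately via closure_call().
	 */
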
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 25c68767..a6ce03a7 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
 	return j->buf + j->reservations.idx;
 }
 
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
-	return j->buf + !j->reservations.idx;
-}
-
 /* Sequence number of oldest dirty journal entry */
 
 static inline u64 journal_last_seq(struct journal *j)
@@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-	BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+	EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
 	return j->pin.back - 1;
 }
@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
 
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
-	return idx == 0 ? s.buf0_count : s.buf1_count;
+	switch (idx) {
+	case 0: return s.buf0_count;
+	case 1: return s.buf1_count;
+	case 2: return s.buf2_count;
+	case 3: return s.buf3_count;
+	}
+	BUG();
 }
 
 static inline void journal_state_inc(union journal_res_state *s)
 {
 	s->buf0_count += s->idx == 0;
 	s->buf1_count += s->idx == 1;
+	s->buf2_count += s->idx == 2;
+	s->buf3_count += s->idx == 3;
 }
 
 static inline void bch2_journal_set_has_inode(struct journal *j,
@@ -255,21 +258,24 @@ static inline bool journal_entry_empty(struct jset *j)
 	return true;
 }
 
-void __bch2_journal_buf_put(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *);
 
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
-					bool need_write_just_set)
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
 {
 	union journal_res_state s;
 
 	s.v = atomic64_sub_return(((union journal_res_state) {
 				    .buf0_count = idx == 0,
 				    .buf1_count = idx == 1,
+				    .buf2_count = idx == 2,
+				    .buf3_count = idx == 3,
 				    }).v, &j->reservations.counter);
-	if (!journal_state_count(s, idx)) {
-		EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
-		__bch2_journal_buf_put(j, need_write_just_set);
-	}
+
+	EBUG_ON(((s.idx - idx) & 3) >
+		((s.idx - s.unwritten_idx) & 3));
+
+	if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
+		__bch2_journal_buf_put(j);
 }
 
 /*
@@ -289,7 +295,7 @@ static inline void bch2_journal_res_put(struct journal *j,
 				       BCH_JSET_ENTRY_btree_keys,
 				       0, 0, NULL, 0);
 
-	bch2_journal_buf_put(j, res->idx, false);
+	bch2_journal_buf_put(j, res->idx);
 
 	res->ref = 0;
 }
@@ -325,11 +331,18 @@ static inline int journal_res_get_fast(struct journal *j,
 		    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
 			return 0;
 
-		if (flags & JOURNAL_RES_GET_CHECK)
-			return 1;
-
 		new.cur_entry_offset += res->u64s;
 		journal_state_inc(&new);
+
+		/*
+		 * If the refcount would overflow, we have to wait:
+		 * XXX - tracepoint this:
+		 */
+		if (!journal_state_count(new, new.idx))
+			return 0;
+
+		if (flags & JOURNAL_RES_GET_CHECK)
+			return 1;
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index d1367cf0..bb9a1936 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -10,10 +10,27 @@
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 
 #include <trace/events/bcachefs.h>
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+	list_del(&i->list);
+	kvpfree(i, offsetof(struct journal_replay, j) +
+		vstruct_bytes(&i->j));
+
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+	i->ignore = true;
+
+	if (!c->opts.read_entire_journal)
+		__journal_replay_free(i);
+}
+
 struct journal_list {
 	struct closure		cl;
 	struct mutex		lock;
@@ -36,28 +53,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 	struct bch_devs_list devs = { .nr = 0 };
 	struct list_head *where;
 	size_t bytes = vstruct_bytes(j);
-	__le64 last_seq;
+	u64 last_seq = 0;
 	int ret;
 
-	last_seq = !list_empty(jlist->head)
-		? list_last_entry(jlist->head, struct journal_replay,
-				  list)->j.last_seq
-		: 0;
-
-	if (!c->opts.read_entire_journal) {
-		/* Is this entry older than the range we need? */
-		if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-			ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-			goto out;
+	list_for_each_entry_reverse(i, jlist->head, list) {
+		if (!JSET_NO_FLUSH(&i->j)) {
+			last_seq = le64_to_cpu(i->j.last_seq);
+			break;
 		}
+	}
 
-		/* Drop entries we don't need anymore */
+	/* Is this entry older than the range we need? */
+	if (!c->opts.read_entire_journal &&
+	    le64_to_cpu(j->seq) < last_seq) {
+		ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+		goto out;
+	}
+
+	/* Drop entries we don't need anymore */
+	if (!JSET_NO_FLUSH(j)) {
 		list_for_each_entry_safe(i, pos, jlist->head, list) {
 			if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
 				break;
-			list_del(&i->list);
-			kvpfree(i, offsetof(struct journal_replay, j) +
-				vstruct_bytes(&i->j));
+			journal_replay_free(c, i);
 		}
 	}
 
@@ -81,9 +99,7 @@ add:
 	if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
 		if (i->bad) {
 			devs = i->devs;
-			list_del(&i->list);
-			kvpfree(i, offsetof(struct journal_replay, j) +
-				vstruct_bytes(&i->j));
+			__journal_replay_free(i);
 		} else if (bad) {
 			goto found;
 		} else {
@@ -105,6 +121,7 @@ add:
 	list_add(&i->list, where);
 	i->devs	= devs;
 	i->bad	= bad;
+	i->ignore = false;
 	memcpy(&i->j, j, bytes);
found:
 	if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -558,7 +575,7 @@ reread:
 			bio_put(bio);
 
 			if (bch2_dev_io_err_on(ret, ca,
-					       "journal read from sector %llu",
+					       "journal read error: sector %llu",
 					       offset) ||
 			    bch2_meta_read_fault("journal"))
 				return -EIO;
@@ -699,14 +716,16 @@ err:
 	goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+		      u64 *blacklist_seq, u64 *start_seq)
 {
 	struct journal_list jlist;
-	struct journal_replay *i;
+	struct journal_replay *i, *t;
 	struct bch_dev *ca;
 	unsigned iter;
 	size_t keys = 0, entries = 0;
 	bool degraded = false;
+	u64 seq, last_seq = 0;
 	int ret = 0;
 
 	closure_init_stack(&jlist.cl);
@@ -735,12 +754,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	if (jlist.ret)
 		return jlist.ret;
 
+	if (list_empty(list)) {
+		bch_info(c, "journal read done, but no entries found");
+		return 0;
+	}
+
+	i = list_last_entry(list, struct journal_replay, list);
+	*start_seq = le64_to_cpu(i->j.seq) + 1;
+
+	/*
+	 * Find most recent flush entry, and ignore newer non flush entries -
+	 * those entries will be blacklisted:
+	 */
+	list_for_each_entry_safe_reverse(i, t, list, list) {
+		if (i->ignore)
+			continue;
+
+		if (!JSET_NO_FLUSH(&i->j)) {
+			last_seq = le64_to_cpu(i->j.last_seq);
+			*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+			break;
+		}
+
+		journal_replay_free(c, i);
+	}
+
+	if (!last_seq) {
+		fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+		return -1;
+	}
+
+	/* Drop blacklisted entries and entries older than last_seq: */
+	list_for_each_entry_safe(i, t, list, list) {
+		if (i->ignore)
+			continue;
+
+		seq = le64_to_cpu(i->j.seq);
+		if (seq < last_seq) {
+			journal_replay_free(c, i);
+			continue;
+		}
+
+		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+				    "found blacklisted journal entry %llu", seq);
+
+			journal_replay_free(c, i);
+		}
+	}
+
+	/* Check for missing entries: */
+	seq = last_seq;
+	list_for_each_entry(i, list, list) {
+		if (i->ignore)
+			continue;
+
+		BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+		while (seq < le64_to_cpu(i->j.seq)) {
+			u64 missing_start, missing_end;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			if (seq == le64_to_cpu(i->j.seq))
+				break;
+
+			missing_start = seq;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       !bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			missing_end = seq - 1;
+			fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				 missing_start, missing_end,
+				 last_seq, *blacklist_seq - 1);
+		}
+
+		seq++;
+	}
+
 	list_for_each_entry(i, list, list) {
 		struct jset_entry *entry;
 		struct bkey_i *k, *_n;
 		struct bch_replicas_padded replicas;
 		char buf[80];
 
+		if (i->ignore)
+			continue;
+
 		ret = jset_validate_entries(c, &i->j, READ);
 		if (ret)
 			goto fsck_err;
@@ -768,12 +872,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		entries++;
 	}
 
-	if (!list_empty(list)) {
-		i = list_last_entry(list, struct journal_replay, list);
+	bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+		 keys, entries, *start_seq);
 
-		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-			 keys, entries, le64_to_cpu(i->j.seq));
-	}
+	if (*start_seq != *blacklist_seq)
+		bch_info(c, "dropped unflushed entries %llu-%llu",
+			 *blacklist_seq, *start_seq - 1);
fsck_err:
 	return ret;
 }
@@ -951,16 +1055,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	buf->buf_size = new_size;
 }
 
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+	return j->buf + j->reservations.unwritten_idx;
+}
+
 static void journal_write_done(struct closure *cl)
 {
 	struct journal *j = container_of(cl, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_prev_buf(j);
+	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_devs_list devs =
 		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 	struct bch_replicas_padded replicas;
+	union journal_res_state old, new;
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
+	u64 v;
 	int err = 0;
 
 	bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -984,8 +1095,12 @@ static void journal_write_done(struct closure *cl)
 	j->seq_ondisk		= seq;
 	if (err && (!j->err_seq || seq < j->err_seq))
 		j->err_seq	= seq;
-	j->last_seq_ondisk	= last_seq;
-	bch2_journal_space_available(j);
+
+	if (!w->noflush) {
+		j->flushed_seq_ondisk = seq;
+		j->last_seq_ondisk = last_seq;
+		bch2_journal_space_available(j);
+	}
 
 	/*
 	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -999,9 +1114,14 @@ static void journal_write_done(struct closure *cl)
 	/* also must come before signalling write completion: */
 	closure_debug_destroy(cl);
 
-	BUG_ON(!j->reservations.prev_buf_unwritten);
-	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
-		     &j->reservations.counter);
+	v = atomic64_read(&j->reservations.counter);
+	do {
+		old.v = new.v = v;
+		BUG_ON(new.idx == new.unwritten_idx);
+
+		new.unwritten_idx++;
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
 
 	closure_wake_up(&w->wait);
 	journal_wake(j);
@@ -1009,6 +1129,10 @@ static void journal_write_done(struct closure *cl)
 	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
 		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
 	spin_unlock(&j->lock);
+
+	if (new.unwritten_idx != new.idx &&
+	    !journal_state_count(new, new.unwritten_idx))
+		closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1016,10 +1140,10 @@ static void journal_write_endio(struct bio *bio)
 	struct bch_dev *ca = bio->bi_private;
 	struct journal *j = &ca->fs->journal;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
 	    bch2_meta_write_fault("journal")) {
-		struct journal_buf *w = journal_prev_buf(j);
+		struct journal_buf *w = journal_last_unwritten_buf(j);
 		unsigned long flags;
 
 		spin_lock_irqsave(&j->err_lock, flags);
@@ -1036,7 +1160,7 @@ void bch2_journal_write(struct closure *cl)
 	struct journal *j = container_of(cl, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
-	struct journal_buf *w = journal_prev_buf(j);
+	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct jset_entry *start, *end;
 	struct jset *jset;
 	struct bio *bio;
@@ -1047,13 +1171,27 @@ void bch2_journal_write(struct closure *cl)
 
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
 
-	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
-
 	journal_buf_realloc(j, w);
 	jset = w->data;
 
 	j->write_start_time = local_clock();
 
+	spin_lock(&j->lock);
+	if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+	    !w->must_flush &&
+	    (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+	    test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+		w->noflush = true;
+		SET_JSET_NO_FLUSH(jset, true);
+		jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+		j->nr_noflush_writes++;
+	} else {
+		j->last_flush_write = jiffies;
+		j->nr_flush_writes++;
+	}
+	spin_unlock(&j->lock);
+
 	/*
 	 * New btree roots are set by journalling them; when the journal entry
 	 * gets written we have to propagate them to c->btree_roots
@@ -1175,8 +1313,9 @@ retry_alloc:
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
 		bio->bi_private		= ca;
-		bio_set_op_attrs(bio, REQ_OP_WRITE,
-				 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+		bio->bi_opf		= REQ_OP_WRITE|REQ_SYNC|REQ_META;
+		if (!JSET_NO_FLUSH(jset))
+			bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
 		bch2_bio_map(bio, jset, sectors << 9);
 
 		trace_journal_write(bio);
@@ -1185,20 +1324,21 @@ retry_alloc:
 		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
 	}
 
-	for_each_rw_member(ca, c, i)
-		if (journal_flushes_device(ca) &&
-		    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-			percpu_ref_get(&ca->io_ref);
-
-			bio = ca->journal.bio;
-			bio_reset(bio);
-			bio_set_dev(bio, ca->disk_sb.bdev);
-			bio->bi_opf		= REQ_OP_FLUSH;
-			bio->bi_end_io		= journal_write_endio;
-			bio->bi_private		= ca;
-			closure_bio_submit(bio, cl);
-		}
+	if (!JSET_NO_FLUSH(jset)) {
+		for_each_rw_member(ca, c, i)
+			if (journal_flushes_device(ca) &&
+			    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+				percpu_ref_get(&ca->io_ref);
+
+				bio = ca->journal.bio;
+				bio_reset(bio);
+				bio_set_dev(bio, ca->disk_sb.bdev);
+				bio->bi_opf		= REQ_OP_FLUSH;
+				bio->bi_end_io		= journal_write_endio;
+				bio->bi_private		= ca;
+				closure_bio_submit(bio, cl);
+			}
+	}
 
no_io:
 	bch2_bucket_seq_cleanup(c);
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index 6958ee0f..6b4c8096 100644
--- a/libbcachefs/journal_io.h
+++ b/libbcachefs/journal_io.h
@@ -11,6 +11,7 @@ struct journal_replay {
 	struct bch_devs_list	devs;
 	/* checksum error, but we may want to try using it anyways: */
 	bool			bad;
+	bool			ignore;
 	/* must be last: */
 	struct jset		j;
 };
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 	for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)	\
 		vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
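A worked example of the read-side rules in bch2_journal_read() above (all sequence numbers hypothetical):

	/*
	 * On-disk entries: seq 10 flush (last_seq 7), 11 noflush,
	 * 12 flush (last_seq 9), 13 noflush, 14 noflush.
	 *
	 * *start_seq = 14 + 1 = 15. Scanning backwards, 14 and 13 are freed -
	 * they were never flushed, so nothing at or after them can be
	 * trusted; 12 is the newest flush entry, giving last_seq = 9 and
	 * *blacklist_seq = 13. The second pass keeps 10, 11 and 12
	 * (seq >= 9), and since *start_seq != *blacklist_seq,
	 * "dropped unflushed entries 13-14" is logged.
	 */
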
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index beaa39f7..9d778306 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -58,81 +58,107 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
 				       old.v, new.v)) != old.v);
 }
 
-static struct journal_space {
-	unsigned	next_entry;
-	unsigned	remaining;
-} __journal_space_available(struct journal *j, unsigned nr_devs_want,
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
+{
+	unsigned sectors = 0;
+
+	while (!sectors && *idx != j->reservations.idx) {
+		sectors = j->buf[*idx].sectors;
+
+		*idx = (*idx + 1) & JOURNAL_BUF_MASK;
+	}
+
+	return sectors;
+}
+
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+			    enum journal_space_from from)
+{
+	struct journal_device *ja = &ca->journal;
+	unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
+
+	if (from == journal_space_total)
+		return (struct journal_space) {
+			.next_entry	= ca->mi.bucket_size,
+			.total		= ca->mi.bucket_size * ja->nr,
+		};
+
+	buckets = bch2_journal_dev_buckets_available(j, ja, from);
+	sectors = ja->sectors_free;
+
+	/*
+	 * Note that we don't allocate the space for a journal entry
+	 * until we write it out - thus, account for it here:
+	 */
+	while ((unwritten = get_unwritten_sectors(j, &idx))) {
+		if (unwritten >= sectors) {
+			if (!buckets) {
+				sectors = 0;
+				break;
+			}
+
+			buckets--;
+			sectors = ca->mi.bucket_size;
+		}
+
+		sectors -= unwritten;
+	}
+
+	if (sectors < ca->mi.bucket_size && buckets) {
+		buckets--;
+		sectors = ca->mi.bucket_size;
+	}
+
+	return (struct journal_space) {
+		.next_entry	= sectors,
+		.total		= sectors + buckets * ca->mi.bucket_size,
+	};
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
 			    enum journal_space_from from)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
-	unsigned sectors_next_entry	= UINT_MAX;
-	unsigned sectors_total		= UINT_MAX;
-	unsigned i, nr_devs = 0;
-	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
-		? journal_prev_buf(j)->sectors
-		: 0;
+	unsigned i, pos, nr_devs = 0;
+	struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
 
 	rcu_read_lock();
 	for_each_member_device_rcu(ca, c, i,
 				   &c->rw_devs[BCH_DATA_journal]) {
-		struct journal_device *ja = &ca->journal;
-		unsigned buckets_this_device, sectors_this_device;
-
-		if (!ja->nr)
+		if (!ca->journal.nr)
 			continue;
 
-		buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
-		sectors_this_device = ja->sectors_free;
-
-		/*
-		 * We that we don't allocate the space for a journal entry
-		 * until we write it out - thus, account for it here:
-		 */
-		if (unwritten_sectors >= sectors_this_device) {
-			if (!buckets_this_device)
-				continue;
-
-			buckets_this_device--;
-			sectors_this_device = ca->mi.bucket_size;
-		}
-
-		sectors_this_device -= unwritten_sectors;
-
-		if (sectors_this_device < ca->mi.bucket_size &&
-		    buckets_this_device) {
-			buckets_this_device--;
-			sectors_this_device = ca->mi.bucket_size;
-		}
-
-		if (!sectors_this_device)
+		space = journal_dev_space_available(j, ca, from);
+		if (!space.next_entry)
 			continue;
 
-		sectors_next_entry = min(sectors_next_entry,
-					 sectors_this_device);
+		for (pos = 0; pos < nr_devs; pos++)
+			if (space.total > dev_space[pos].total)
+				break;
 
-		sectors_total = min(sectors_total,
-			buckets_this_device * ca->mi.bucket_size +
-			sectors_this_device);
-
-		nr_devs++;
+		array_insert_item(dev_space, nr_devs, pos, space);
 	}
 	rcu_read_unlock();
 
 	if (nr_devs < nr_devs_want)
 		return (struct journal_space) { 0, 0 };
 
-	return (struct journal_space) {
-		.next_entry	= sectors_next_entry,
-		.remaining	= max_t(int, 0, sectors_total - sectors_next_entry),
-	};
+	/*
+	 * We sorted largest to smallest, and we want the smallest out of the
+	 * @nr_devs_want largest devices:
	 */
+	return dev_space[nr_devs_want - 1];
 }
 
 void bch2_journal_space_available(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
-	struct journal_space discarded, clean_ondisk, clean;
+	unsigned clean, clean_ondisk, total;
 	unsigned overhead, u64s_remaining = 0;
 	unsigned max_entry_size	= min(j->buf[0].buf_size >> 9,
 				      j->buf[1].buf_size >> 9);
@@ -173,27 +199,33 @@ void bch2_journal_space_available(struct journal *j)
 		goto out;
 	}
 
-	if (!fifo_free(&j->pin)) {
-		ret = cur_entry_journal_pin_full;
-		goto out;
-	}
-
 	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
 
-	discarded	= __journal_space_available(j, nr_devs_want, journal_space_discarded);
-	clean_ondisk	= __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
-	clean		= __journal_space_available(j, nr_devs_want, journal_space_clean);
+	for (i = 0; i < journal_space_nr; i++)
+		j->space[i] = __journal_space_available(j, nr_devs_want, i);
 
-	if (!discarded.next_entry)
+	clean_ondisk	= j->space[journal_space_clean_ondisk].total;
+	clean		= j->space[journal_space_clean].total;
+	total		= j->space[journal_space_total].total;
+
+	if (!j->space[journal_space_discarded].next_entry)
 		ret = cur_entry_journal_full;
+	else if (!fifo_free(&j->pin))
+		ret = cur_entry_journal_pin_full;
 
-	overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
+	if ((clean - clean_ondisk <= total / 8) &&
+	    (clean_ondisk * 2 > clean))
+		set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+	else
+		clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
+	overhead = DIV_ROUND_UP(clean, max_entry_size) *
 		journal_entry_overhead(j);
-	u64s_remaining = clean.remaining << 6;
+	u64s_remaining = clean << 6;
 	u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
 	u64s_remaining /= 4;
out:
-	j->cur_entry_sectors	= !ret ? discarded.next_entry : 0;
+	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0;
 	j->cur_entry_error	= ret;
 	journal_set_remaining(j, u64s_remaining);
 	journal_check_may_get_unreserved(j);
@@ -277,6 +309,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
 		bch2_journal_space_available(j);
 }
 
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+	if (atomic_dec_and_test(&pin_list->count))
+		bch2_journal_reclaim_fast(j);
+}
+
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
 	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
@@ -485,13 +525,14 @@ static u64 journal_seq_to_flush(struct journal *j)
  * 512 journal entries or 25% of all journal buckets, then
  * journal_next_bucket() should not stall.
 */
-static void __bch2_journal_reclaim(struct journal *j, bool direct)
+static int __bch2_journal_reclaim(struct journal *j, bool direct)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	u64 seq_to_flush, nr_flushed = 0;
 	size_t min_nr;
 	unsigned flags;
+	int ret = 0;
 
 	/*
 	 * We can't invoke memory reclaim while holding the reclaim_lock -
@@ -506,6 +547,11 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct)
 		if (kthread && kthread_should_stop())
 			break;
 
+		if (bch2_journal_error(j)) {
+			ret = -EIO;
+			break;
+		}
+
 		bch2_journal_do_discards(j);
 
 		seq_to_flush = journal_seq_to_flush(j);
@@ -547,27 +593,30 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct)
 	}
 
 	memalloc_noreclaim_restore(flags);
+
+	return ret;
 }
 
-void bch2_journal_reclaim(struct journal *j)
+int bch2_journal_reclaim(struct journal *j)
 {
-	__bch2_journal_reclaim(j, true);
+	return __bch2_journal_reclaim(j, true);
 }
 
 static int bch2_journal_reclaim_thread(void *arg)
 {
 	struct journal *j = arg;
 	unsigned long next;
+	int ret = 0;
 
 	set_freezable();
 
 	kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
 
-	while (!kthread_should_stop()) {
+	while (!ret && !kthread_should_stop()) {
 		j->reclaim_kicked = false;
 
 		mutex_lock(&j->reclaim_lock);
-		__bch2_journal_reclaim(j, false);
+		ret = __bch2_journal_reclaim(j, false);
 		mutex_unlock(&j->reclaim_lock);
 
 		next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index e2535504..f02caa3d 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -4,12 +4,6 @@
 
 #define JOURNAL_PIN	(32 * 1024)
 
-enum journal_space_from {
-	journal_space_discarded,
-	journal_space_clean_ondisk,
-	journal_space_clean,
-};
-
 static inline void journal_reclaim_kick(struct journal *j)
 {
 	struct task_struct *p = READ_ONCE(j->reclaim_thread);
@@ -39,6 +33,7 @@ journal_seq_pin(struct journal *j, u64 seq)
 	return &j->pin.data[seq & j->pin.mask];
 }
 
+void __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
@@ -73,7 +68,7 @@ static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
 void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 
 void bch2_journal_do_discards(struct journal *);
-void bch2_journal_reclaim(struct journal *);
+int bch2_journal_reclaim(struct journal *);
 void bch2_journal_reclaim_stop(struct journal *);
 int bch2_journal_reclaim_start(struct journal *);
 int bch2_journal_reclaim_start(struct journal *);
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index d0f1bbf8..e1b63f38 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -118,7 +118,7 @@ out_write_sb:
 out:
 mutex_unlock(&c->sb_lock);
- return ret;
+ return ret ?: bch2_blacklist_table_initialize(c);
 }
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
 struct journal_seq_blacklist_table *t;
 unsigned i, nr = blacklist_nr_entries(bl);
- BUG_ON(c->journal_seq_blacklist_table);
-
 if (!bl)
 return 0;
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
 journal_seq_blacklist_table_cmp,
 NULL);
+ kfree(c->journal_seq_blacklist_table);
 c->journal_seq_blacklist_table = t;
 return 0;
 }
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 4640bb86..308b899b 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -9,11 +9,13 @@
 #include "super_types.h"
 #include "fifo.h"
-struct journal_res;
+#define JOURNAL_BUF_BITS 2
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
 /*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
 */
 struct journal_buf {
 struct jset *data;
@@ -27,6 +29,8 @@ struct journal_buf {
 unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */
 unsigned u64s_reserved;
+ bool noflush; /* write has already been kicked off, and was noflush */
+ bool must_flush; /* something wants a flush */
 /* bloom filter: */
 unsigned long has_inode[1024 / sizeof(unsigned long)];
 };
@@ -81,10 +85,12 @@ union journal_res_state {
 struct {
 u64 cur_entry_offset:20,
- idx:1,
- prev_buf_unwritten:1,
- buf0_count:21,
- buf1_count:21;
+ idx:2,
+ unwritten_idx:2,
+ buf0_count:10,
+ buf1_count:10,
+ buf2_count:10,
+ buf3_count:10;
 };
 };
@@ -116,6 +122,20 @@ union journal_preres_state {
 #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
 #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
+struct journal_space {
+ /* Units of 512 byte sectors: */
+ unsigned next_entry; /* How big the next journal entry can be */
+ unsigned total;
+};
+
+enum journal_space_from {
+ journal_space_discarded,
+ journal_space_clean_ondisk,
+ journal_space_clean,
+ journal_space_total,
+ journal_space_nr,
+};
+
 /*
 * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
 * either because something's waiting on the write to complete or because it's
@@ -128,6 +148,7 @@ enum {
 JOURNAL_RECLAIM_STARTED,
 JOURNAL_NEED_WRITE,
 JOURNAL_MAY_GET_UNRESERVED,
+ JOURNAL_MAY_SKIP_FLUSH,
 };
 /* Embedded in struct bch_fs */
@@ -165,7 +186,7 @@ struct journal {
 * Two journal entries -- one is currently open for new entries, the
 * other is possibly being written out.
 */
- struct journal_buf buf[2];
+ struct journal_buf buf[JOURNAL_BUF_NR];
 spinlock_t lock;
@@ -185,6 +206,7 @@ struct journal {
 /* seq, last_seq from the most recent journal entry successfully written */
 u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
 u64 last_seq_ondisk;
 u64 err_seq;
 u64 last_empty_seq;
@@ -210,6 +232,8 @@ struct journal {
 struct journal_entry_pin_list *data;
 } pin;
+ struct journal_space space[journal_space_nr];
+
 u64 replay_journal_seq;
 u64 replay_journal_seq_end;
@@ -232,11 +256,15 @@ struct journal {
 unsigned write_delay_ms;
 unsigned reclaim_delay_ms;
+ unsigned long last_flush_write;
 u64 res_get_blocked_start;
 u64 need_write_time;
 u64 write_start_time;
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+
 struct time_stats *write_time;
 struct time_stats *delay_time;
 struct time_stats *blocked_time;
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index d24cef2b..ecd51d45 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
- struct journal_replay *p;
+ struct journal_replay *i;
 struct jset_entry *entry;
 struct bkey_i *k, *_n;
 struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 if (list_empty(journal_entries))
 return keys;
- keys.journal_seq_base =
- le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
-
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
 continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ if (!keys.journal_seq_base)
+ keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+ for_each_jset_key(k, _n, entry, &i->j)
 nr_keys++;
 }
-
 keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
 if (!keys.d)
 goto err;
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
 continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+ for_each_jset_key(k, _n, entry, &i->j)
 keys.d[keys.nr++] = (struct journal_key) {
 .btree_id = entry->btree_id,
 .level = entry->level,
 .k = k,
- .journal_seq = le64_to_cpu(p->j.seq) -
+ .journal_seq = le64_to_cpu(i->j.seq) -
 keys.journal_seq_base,
- .journal_offset = k->_data - p->j._data,
+ .journal_offset = k->_data - i->j._data,
 };
 }
@@ -643,46 +643,6 @@ err:
 return ret;
 }
-static bool journal_empty(struct list_head *journal)
-{
- return list_empty(journal) ||
- journal_entry_empty(&list_last_entry(journal,
- struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
- struct list_head *journal)
-{
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
- u64 start_seq = le64_to_cpu(i->j.last_seq);
- u64 end_seq = le64_to_cpu(i->j.seq);
- u64 seq = start_seq;
- int ret = 0;
-
- list_for_each_entry(i, journal, list) {
- if (le64_to_cpu(i->j.seq) < start_seq)
- continue;
-
- fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
(replaying %llu-%llu)", - seq, le64_to_cpu(i->j.seq) - 1, - start_seq, end_seq); - - seq = le64_to_cpu(i->j.seq); - - fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, - "found blacklisted journal entry %llu", seq); - - do { - seq++; - } while (bch2_journal_seq_is_blacklisted(c, seq, false)); - } -fsck_err: - return ret; -} - /* journal replay early: */ static int journal_replay_entry_early(struct bch_fs *c, @@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean, struct list_head *journal) { + struct journal_replay *i; struct jset_entry *entry; int ret; @@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c, return ret; } } else { - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); + list_for_each_entry(i, journal, list) { + if (i->ignore) + continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - list_for_each_entry(i, journal, list) vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) return ret; } + } } bch2_fs_usage_initialize(c); @@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean = *cleanp; int ret = 0; - if (!c->sb.clean || !j) - return 0; - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), @@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; - u64 journal_seq; + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; bool write_sb = false, need_write_alloc = false; int ret; @@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct jset *j; + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } - ret = bch2_journal_read(c, &c->journal_entries); + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct journal_replay *i; + + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, + list_for_each_entry_reverse(i, &c->journal_entries, list) + if (!i->ignore) { + last_journal_entry = &i->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&c->journal_entries)) { - bch_err(c, "no journal entries found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; - goto err; + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, "no journal entries found"); + goto use_clean; } c->journal_keys = journal_keys_sort(&c->journal_entries); @@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - j = &list_last_entry(&c->journal_entries, - struct journal_replay, list)->j; - - ret = verify_superblock_clean(c, &clean, j); - if (ret) + if (c->sb.clean && last_journal_entry) { 
+ ret = verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
+ goto err;
+ }
+ } else {
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
 goto err;
- journal_seq = le64_to_cpu(j->seq) + 1;
- } else {
- journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+ }
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
 }
 if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
 if (ret)
 goto err;
- if (!c->sb.clean) {
+ /*
+ * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
+ */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
+ if (blacklist_seq != journal_seq) {
 ret = bch2_journal_seq_blacklist_add(c,
- journal_seq,
- journal_seq + 4);
+ blacklist_seq, journal_seq);
 if (ret) {
 bch_err(c, "error creating new journal seq blacklist entry");
 goto err;
 }
-
- journal_seq += 4;
-
- /*
- * The superblock needs to be written before we do any btree
- * node writes: it will be in the read_write() path
- */
- }
-
- ret = bch2_blacklist_table_initialize(c);
-
- if (!list_empty(&c->journal_entries)) {
- ret = verify_journal_entries_not_blacklisted_or_missing(c,
- &c->journal_entries);
- if (ret)
- goto err;
 }
 ret = bch2_fs_journal_start(&c->journal, journal_seq,
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 91518c0d..00a197b6 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -275,53 +275,55 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
 static int replicas_table_update(struct bch_fs *c,
 struct bch_replicas_cpu *new_r)
 {
- struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
 struct bch_fs_usage *new_scratch = NULL;
 struct bch_fs_usage __percpu *new_gc = NULL;
 struct bch_fs_usage *new_base = NULL;
- unsigned bytes = sizeof(struct bch_fs_usage) +
+ unsigned i, bytes = sizeof(struct bch_fs_usage) +
 sizeof(u64) * new_r->nr;
- int ret = -ENOMEM;
+ int ret = 0;
+
+ memset(new_usage, 0, sizeof(new_usage));
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+ sizeof(u64), GFP_NOIO)))
+ goto err;
 if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
- !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO)) ||
- !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO)) ||
 !(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
 (c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
- bch_err(c, "error updating replicas table: memory allocation failure");
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
 goto err;
- }
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (c->usage[i])
+ __replicas_table_update_pcpu(new_usage[i], new_r,
+ c->usage[i], &c->replicas);
 if (c->usage_base)
 __replicas_table_update(new_base, new_r,
 c->usage_base, &c->replicas);
- if (c->usage[0])
- __replicas_table_update_pcpu(new_usage[0], new_r,
- c->usage[0], &c->replicas);
- if (c->usage[1])
- __replicas_table_update_pcpu(new_usage[1], new_r,
- c->usage[1], &c->replicas);
 if (c->usage_gc)
 __replicas_table_update_pcpu(new_gc, new_r,
 c->usage_gc, &c->replicas);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ swap(c->usage[i], new_usage[i]);
 swap(c->usage_base, new_base);
- swap(c->usage[0], new_usage[0]);
- swap(c->usage[1], new_usage[1]);
 swap(c->usage_scratch, new_scratch);
 swap(c->usage_gc, new_gc);
 swap(c->replicas, *new_r);
- ret = 0;
-err:
+out:
 free_percpu(new_gc);
 kfree(new_scratch);
- free_percpu(new_usage[1]);
- free_percpu(new_usage[0]);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ free_percpu(new_usage[i]);
 kfree(new_base);
 return ret;
+err:
+ bch_err(c, "error updating replicas table: memory allocation failure");
+ ret = -ENOMEM;
+ goto out;
 }
 static unsigned reserve_journal_replicas(struct bch_fs *c,
@@ -496,9 +498,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 struct bch_replicas_cpu n;
 if (!__replicas_has_entry(&c->replicas_gc, e) &&
- (c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]))) {
+ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
 n = cpu_replicas_add_entry(&c->replicas_gc, e);
 if (!n.entries) {
 ret = -ENOSPC;
@@ -603,9 +603,7 @@ retry:
 cpu_replicas_entry(&c->replicas, i);
 if (e->data_type == BCH_DATA_journal ||
- c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]))
+ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i]))
 memcpy(cpu_replicas_entry(&new, new.nr++),
 e, new.entry_size);
 }
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index cee6cc93..abe46c53 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -636,7 +636,7 @@ static void write_super_endio(struct bio *bio)
 /* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
 bch2_blk_status_to_str(bio->bi_status)))
 ca->sb_write_error = 1;
@@ -995,10 +995,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
 percpu_down_write(&c->mark_lock);
 if (!journal_seq) {
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
 } else {
- bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
 }
 {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index e3bbd0b0..651fbc5d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -475,8 +475,8 @@ static void __bch2_fs_free(struct bch_fs *c)
 bch2_journal_entries_free(&c->journal_entries);
 percpu_free_rwsem(&c->mark_lock);
 kfree(c->usage_scratch);
- free_percpu(c->usage[1]);
- free_percpu(c->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ free_percpu(c->usage[i]);
 kfree(c->usage_base);
 if (c->btree_iters_bufs)
@@ -716,6 +716,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 bch2_fs_btree_cache_init_early(&c->btree_cache);
+ mutex_init(&c->sectors_available_lock);
+
 if (percpu_init_rwsem(&c->mark_lock))
 goto err;
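
Reviewer's sketch, not part of the patch: the two pieces above that are easiest to misread are the device selection in __journal_space_available() and the new JOURNAL_MAY_SKIP_FLUSH condition in bch2_journal_space_available(). The standalone C program below models both under stated assumptions; every identifier in it (space_ex, pick_space, may_skip_flush, MAX_DEVS) is illustrative, not a bcachefs symbol.

/* sketch.c - build with: cc -std=c99 -o sketch sketch.c */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

/* Stand-in for struct journal_space: sector counts for one device. */
struct space_ex {
	unsigned next_entry;	/* sectors usable for the next journal entry */
	unsigned total;		/* total free sectors on this device */
};

#define MAX_DEVS 8

/*
 * Mirrors the selection loop in __journal_space_available(): keep each
 * device's figures in an array sorted largest-to-smallest by total (the
 * array_insert_item() step in the patch), then return the smallest of
 * the nr_devs_want largest devices, since every journal entry must be
 * replicated on that many devices.
 */
static struct space_ex pick_space(const struct space_ex *devs, unsigned nr,
				  unsigned nr_devs_want)
{
	struct space_ex sorted[MAX_DEVS];
	unsigned nr_devs = 0, i, pos;

	assert(nr <= MAX_DEVS);	/* cf. BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)) */

	for (i = 0; i < nr; i++) {
		if (!devs[i].next_entry)	/* no room for even one entry */
			continue;

		for (pos = 0; pos < nr_devs; pos++)
			if (devs[i].total > sorted[pos].total)
				break;

		memmove(&sorted[pos + 1], &sorted[pos],
			(nr_devs - pos) * sizeof(sorted[0]));
		sorted[pos] = devs[i];
		nr_devs++;
	}

	if (nr_devs < nr_devs_want)
		return (struct space_ex) { 0, 0 };

	/* smallest of the nr_devs_want largest: */
	return sorted[nr_devs_want - 1];
}

/*
 * Mirrors the set_bit(JOURNAL_MAY_SKIP_FLUSH, ...) condition: flushes
 * may only be skipped while the space that is clean in memory but not
 * yet recorded clean on disk is at most total/8, and at least half of
 * the clean space is already clean on disk.
 */
static bool may_skip_flush(unsigned clean, unsigned clean_ondisk,
			   unsigned total)
{
	return clean - clean_ondisk <= total / 8 &&
	       clean_ondisk * 2 > clean;
}

int main(void)
{
	struct space_ex devs[] = {
		{ 512, 8192 }, { 512, 1024 }, { 512, 4096 },
	};

	/* Two replicas wanted: space is bounded by the 2nd-largest device. */
	assert(pick_space(devs, 3, 2).total == 4096);

	/* On-disk view fully caught up: noflush writes are allowed. */
	assert(may_skip_flush(4096, 4096, 8192));

	/* On-disk view lags by half the clean space: keep flushing. */
	assert(!may_skip_flush(4096, 2048, 8192));

	return 0;
}

The design point the sketch makes concrete: with nr_devs_want replicas required, usable journal space is bounded by the nr_devs_want-th largest device, and noflush journal writes are only permitted while the on-disk accounting of clean journal space stays close to the in-memory view, which keeps the run of journal sequence numbers that recovery may have to skip and blacklist after an unclean shutdown bounded.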