diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8226b3a6..71e83e28 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-5242db9aec10220b6ee7162ba7bec173417348cf
+bf340e68c74cdb70c692698ef7367b9dc6f6e61f
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 295dcd60..8f10d13b 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -346,6 +346,52 @@ TRACE_EVENT(btree_cache_scan,
 		  __entry->ret)
 );
 
+TRACE_EVENT(btree_node_relock_fail,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos,
+		 unsigned long node,
+		 u32 iter_lock_seq,
+		 u32 node_lock_seq),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+
+	TP_STRUCT__entry(
+		__array(char,		trans_fn, 24	)
+		__array(char,		caller, 32	)
+		__field(u8,		btree_id	)
+		__field(u64,		pos_inode	)
+		__field(u64,		pos_offset	)
+		__field(u32,		pos_snapshot	)
+		__field(unsigned long,	node		)
+		__field(u32,		iter_lock_seq	)
+		__field(u32,		node_lock_seq	)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+		snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
+		__entry->btree_id		= btree_id;
+		__entry->pos_inode		= pos->inode;
+		__entry->pos_offset		= pos->offset;
+		__entry->pos_snapshot		= pos->snapshot;
+		__entry->node			= node;
+		__entry->iter_lock_seq		= iter_lock_seq;
+		__entry->node_lock_seq		= node_lock_seq;
+	),
+
+	TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+		  __entry->trans_fn,
+		  __entry->caller,
+		  __entry->btree_id,
+		  __entry->pos_inode,
+		  __entry->pos_offset,
+		  __entry->pos_snapshot,
+		  __entry->node,
+		  __entry->iter_lock_seq,
+		  __entry->node_lock_seq)
+);
+
 /* Garbage collection */
 
 DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
@@ -621,7 +667,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 
 	TP_STRUCT__entry(
 		__array(char,		trans_fn, 24	)
-		__field(unsigned long,	caller_ip	)
+		__array(char,		caller, 32	)
 		__field(u8,		btree_id	)
 		__field(u64,		pos_inode	)
 		__field(u64,		pos_offset	)
@@ -630,16 +676,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 
 	TP_fast_assign(
 		strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-		__entry->caller_ip		= caller_ip;
+		snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
 		__entry->btree_id		= btree_id;
 		__entry->pos_inode		= pos->inode;
 		__entry->pos_offset		= pos->offset;
 		__entry->pos_snapshot		= pos->snapshot;
 	),
 
-	TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+	TP_printk("%s %s btree %u pos %llu:%llu:%u",
 		  __entry->trans_fn,
-		  (void *) __entry->caller_ip,
+		  __entry->caller,
 		  __entry->btree_id,
 		  __entry->pos_inode,
 		  __entry->pos_offset,
@@ -694,6 +740,54 @@ DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock,
 	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_next_node,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_parent_for_fill,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_after_fill,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_key_cache_fill,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_path,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_path_intent,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 enum btree_id btree_id,
+		 struct bpos *pos),
+	TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
 DEFINE_EVENT(transaction_restart_iter,	trans_restart_traverse,
 	TP_PROTO(const char *trans_fn,
 		 unsigned long caller_ip,
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 688a53b4..7ad16c21 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -9,6 +9,7 @@
 #include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "ec.h"
@@ -463,19 +464,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
+	struct bkey_s_c k;
 	struct bkey_alloc_unpacked u;
 	u64 *time, now;
 	int ret = 0;
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
 			     BTREE_ITER_CACHED|
-			     BTREE_ITER_CACHED_NOFILL|
 			     BTREE_ITER_INTENT);
-	ret = bch2_btree_iter_traverse(&iter);
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
 	if (ret)
 		goto out;
 
-	u = alloc_mem_to_key(c, &iter);
+	u = bch2_alloc_unpack(k);
 
 	time = rw == READ ? &u.read_time : &u.write_time;
 	now = atomic64_read(&c->io_clock[rw].now);
@@ -542,7 +544,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
 static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
 				u64 now, u64 last_seq_ondisk)
 {
-	unsigned used = bucket_sectors_used(m);
+	unsigned used = m.cached_sectors;
 
 	if (used) {
 		/*
@@ -561,8 +563,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
 		 * keys when there's only a small difference, so that we can
 		 * keep sequential buckets together:
 		 */
-		return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
-			(bucket_gc_gen(g) >> 4);
+		return bucket_gc_gen(g) >> 4;
 	}
 }
 
@@ -611,6 +612,13 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 		if (!bch2_can_invalidate_bucket(ca, b, m))
 			continue;
 
+		if (!m.data_type &&
+		    bch2_bucket_needs_journal_commit(c, last_seq_ondisk,
+						     ca->dev_idx, b)) {
+			ca->buckets_waiting_on_journal++;
+			continue;
+		}
+
 		if (e.nr && e.bucket + e.nr == b && e.key == key) {
 			e.nr++;
 		} else {
@@ -647,6 +655,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 
 	ca->inc_gen_needs_gc		= 0;
 	ca->inc_gen_really_needs_gc	= 0;
+	ca->buckets_waiting_on_journal	= 0;
 
 	find_reclaimable_buckets_lru(c, ca);
 
@@ -658,56 +667,34 @@
 	return nr;
 }
 
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
-	if (m.journal_seq_valid) {
-		u64 journal_seq = atomic64_read(&c->journal.seq);
-		u64 bucket_seq	= journal_seq;
-
-		bucket_seq &= ~((u64) U16_MAX);
-		bucket_seq |= m.journal_seq;
-
-		if (bucket_seq > journal_seq)
-			bucket_seq -= 1 << 16;
-
-		return bucket_seq;
-	} else {
-		return 0;
-	}
-}
-
 static int bucket_invalidate_btree(struct btree_trans *trans,
-				   struct bch_dev *ca, u64 b)
+				   struct bch_dev *ca, u64 b,
+				   struct bkey_alloc_unpacked *u)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_alloc_unpacked u;
 	struct btree_iter iter;
+	struct bkey_s_c k;
 	int ret;
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     POS(ca->dev_idx, b),
			     BTREE_ITER_CACHED|
-			     BTREE_ITER_CACHED_NOFILL|
			     BTREE_ITER_INTENT);
 
-	ret = bch2_btree_iter_traverse(&iter);
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	u = alloc_mem_to_key(c, &iter);
+	*u = bch2_alloc_unpack(k);
+	u->gen++;
+	u->data_type		= 0;
+	u->dirty_sectors	= 0;
+	u->cached_sectors	= 0;
+	u->read_time		= atomic64_read(&c->io_clock[READ].now);
+	u->write_time		= atomic64_read(&c->io_clock[WRITE].now);
-	u.gen++;
-	u.data_type	= 0;
-	u.dirty_sectors	= 0;
-	u.cached_sectors = 0;
-	u.read_time	= atomic64_read(&c->io_clock[READ].now);
-	u.write_time	= atomic64_read(&c->io_clock[WRITE].now);
-
-	ret = bch2_alloc_write(trans, &iter, &u,
+	ret = bch2_alloc_write(trans, &iter, u,
 			       BTREE_TRIGGER_BUCKET_INVALIDATE);
 err:
 	bch2_trans_iter_exit(trans, &iter);
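For reference, the bucket_journal_seq() helper deleted above reconstructed a full 64-bit journal sequence number from only the 16 bits stored in struct bucket_mark, splicing them into the current sequence and correcting for wraparound. A standalone sketch of that arithmetic, with illustrative names (plain userspace C, not the bcachefs API):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* Rebuild a full sequence number from its low 16 bits, given the current
 * (monotonically increasing) sequence; mirrors the arithmetic in the
 * removed bucket_journal_seq(). */
static uint64_t seq_from_low16(uint64_t current_seq, uint16_t low16)
{
	uint64_t seq = (current_seq & ~(uint64_t) UINT16_MAX) | low16;

	/* The stored bits can't refer to the future; wrap back one epoch: */
	if (seq > current_seq)
		seq -= 1 << 16;
	return seq;
}

int main(void)
{
	assert(seq_from_low16(0x12345, 0x2345) == 0x12345);
	/* low bits "ahead" of the current seq: must be the previous epoch */
	assert(seq_from_low16(0x12345, 0xffff) == 0x0ffff);
	printf("ok\n");
	return 0;
}

The limited width is exactly why this scheme is being replaced in this patch by the buckets_waiting_for_journal table, which stores full 64-bit sequence numbers.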
@@ -717,21 +704,24 @@ err:
 static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 				      u64 *journal_seq, unsigned flags)
 {
-	struct bucket *g;
-	struct bucket_mark m;
+	struct bkey_alloc_unpacked u;
 	size_t b;
+	u64 commit_seq = 0;
 	int ret = 0;
 
+	/*
+	 * If the read-only path is trying to shut down, we can't be generating
+	 * new btree updates:
+	 */
+	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
+		return 1;
+
 	BUG_ON(!ca->alloc_heap.used ||
 	       !ca->alloc_heap.data[0].nr);
 	b = ca->alloc_heap.data[0].bucket;
 
 	/* first, put on free_inc and mark as owned by allocator: */
 	percpu_down_read(&c->mark_lock);
-	g = bucket(ca, b);
-	m = READ_ONCE(g->mark);
-
-	BUG_ON(m.dirty_sectors);
 
 	bch2_mark_alloc_bucket(c, ca, b, true);
@@ -740,38 +730,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 	BUG_ON(!fifo_push(&ca->free_inc, b));
 	spin_unlock(&c->freelist_lock);
 
-	/*
-	 * If we're not invalidating cached data, we only increment the bucket
-	 * gen in memory here, the incremented gen will be updated in the btree
-	 * by bch2_trans_mark_pointer():
-	 */
-	if (!m.cached_sectors &&
-	    !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
-		BUG_ON(m.data_type);
-		bucket_cmpxchg(g, m, m.gen++);
-		*bucket_gen(ca, b) = m.gen;
-		percpu_up_read(&c->mark_lock);
-		goto out;
-	}
-
 	percpu_up_read(&c->mark_lock);
 
-	/*
-	 * If the read-only path is trying to shut down, we can't be generating
-	 * new btree updates:
-	 */
-	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
-		ret = 1;
-		goto out;
-	}
-
-	ret = bch2_trans_do(c, NULL, journal_seq,
+	ret = bch2_trans_do(c, NULL, &commit_seq,
 			    BTREE_INSERT_NOCHECK_RW|
 			    BTREE_INSERT_NOFAIL|
 			    BTREE_INSERT_JOURNAL_RESERVED|
 			    flags,
-			    bucket_invalidate_btree(&trans, ca, b));
-out:
+			    bucket_invalidate_btree(&trans, ca, b, &u));
+
 	if (!ret) {
 		/* remove from alloc_heap: */
 		struct alloc_heap_entry e, *top = ca->alloc_heap.data;
@@ -783,11 +750,17 @@ out:
 		heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
 
 		/*
-		 * Make sure we flush the last journal entry that updated this
-		 * bucket (i.e. deleting the last reference) before writing to
-		 * this bucket again:
+		 * If we're invalidating cached data then we need to wait on the
+		 * journal commit:
 		 */
-		*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+		if (u.data_type)
+			*journal_seq = max(*journal_seq, commit_seq);
+
+		/*
+		 * We already waited on u.journal_seq when we filtered out
+		 * buckets that need a journal commit:
+		 */
+		BUG_ON(*journal_seq > u.journal_seq);
 	} else {
 		size_t b2;
@@ -954,8 +927,14 @@ static int bch2_allocator_thread(void *arg)
 			gc_count = c->gc_count;
 			nr = find_reclaimable_buckets(c, ca);
 
-			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-					 ca->inc_gen_really_needs_gc);
+			if (!nr && ca->buckets_waiting_on_journal) {
+				ret = bch2_journal_flush(&c->journal);
+				if (ret)
+					goto stop;
+			} else if (nr < (ca->mi.nbuckets >> 6) &&
+				   ca->buckets_waiting_on_journal >= nr / 2) {
+				bch2_journal_flush_async(&c->journal, NULL);
+			}
 
 			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
 			     ca->inc_gen_really_needs_gc) &&
@@ -963,6 +942,9 @@ static int bch2_allocator_thread(void *arg)
 				atomic_inc(&c->kick_gc);
 				wake_up_process(c->gc_thread);
 			}
+
+			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+					 ca->inc_gen_really_needs_gc);
 		}
 
 		ret = bch2_invalidate_buckets(c, ca);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index c64db2bf..a28ddcd5 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -355,6 +355,7 @@ enum bch_time_stats {
 #include "alloc_types.h"
 #include "btree_types.h"
 #include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
 #include "clock_types.h"
 #include "ec_types.h"
 #include "journal_types.h"
@@ -482,6 +483,7 @@ struct bch_dev {
 
 	size_t			inc_gen_needs_gc;
 	size_t			inc_gen_really_needs_gc;
+	size_t			buckets_waiting_on_journal;
 
 	enum allocator_states	allocator_state;
@@ -777,6 +779,8 @@ struct bch_fs {
 	struct mutex		write_points_hash_lock;
 	unsigned		write_points_nr;
 
+	struct buckets_waiting_for_journal buckets_waiting_for_journal;
+
 	/* GARBAGE COLLECTION */
 	struct task_struct	*gc_thread;
 	atomic_t		kick_gc;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index fc6c4d4c..986d08d7 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -666,6 +666,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 	 * been freed:
 	 */
 	if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+		trace_trans_restart_relock_parent_for_fill(trans->fn,
+					_THIS_IP_, btree_id, &path->pos);
 		btree_trans_restart(trans);
 		return ERR_PTR(-EINTR);
 	}
@@ -713,6 +715,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 	}
 
 	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+		trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
+						      btree_id, &path->pos);
 		btree_trans_restart(trans);
 		return ERR_PTR(-EINTR);
 	}
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index a201052e..809c9a76 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -604,8 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
 			(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
 		if (data_type == BCH_DATA_btree) {
 			g2->_mark.data_type = g->_mark.data_type = data_type;
-			g2->gen_valid = g->gen_valid = true;
 			set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+			set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
 		} else {
 			do_update = true;
 		}
@@ -1327,12 +1327,6 @@ static int bch2_gc_start(struct bch_fs *c,
 
 	percpu_down_write(&c->mark_lock);
 
-	/*
-	 * indicate to stripe code that we need to allocate for the gc stripes
-	 * radix tree, too
-	 */
-	gc_pos_set(c, gc_phase(GC_PHASE_START));
-
 	for_each_member_device(ca, c, i) {
 		struct bucket_array *dst = __bucket_array(ca, 1);
 		struct bucket_array *src = __bucket_array(ca, 0);
@@ -1360,6 +1354,27 @@
 	return 0;
 }
 
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_member_device(ca, c, i) {
+		struct bucket_array *buckets = __bucket_array(ca, true);
+		struct bucket *g;
+
+		for_each_bucket(g, buckets) {
+			if (metadata_only &&
+			    (g->mark.data_type == BCH_DATA_user ||
+			     g->mark.data_type == BCH_DATA_cached ||
+			     g->mark.data_type == BCH_DATA_parity))
+				continue;
+			g->_mark.dirty_sectors = 0;
+			g->_mark.cached_sectors = 0;
+		}
+	};
+}
+
 static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 				bool metadata_only)
 {
@@ -1430,6 +1445,55 @@ fsck_err:
 	return ret;
 }
 
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+				 bool metadata_only)
+{
+	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct reflink_gc *r;
+	int ret = 0;
+
+	if (metadata_only)
+		return 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+	c->reflink_gc_nr = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		const __le64 *refcount = bkey_refcount_c(k);
+
+		if (!refcount)
+			continue;
+
+		r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+				       GFP_KERNEL);
+		if (!r) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		r->offset	= k.k->p.offset;
+		r->size		= k.k->size;
+		r->refcount	= 0;
+	}
+	bch2_trans_iter_exit(&trans, &iter);
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
+				  bool metadata_only)
+{
+	struct genradix_iter iter;
+	struct reflink_gc *r;
+
+	genradix_for_each(&c->reflink_gc_table, iter, r)
+		r->refcount = 0;
+}
+
 static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
 				bool metadata_only)
 {
@@ -1493,43 +1557,10 @@ fsck_err:
 	return ret;
 }
 
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
-				 bool metadata_only)
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
+				  bool metadata_only)
 {
-	struct btree_trans trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct reflink_gc *r;
-	int ret = 0;
-
-	if (metadata_only)
-		return 0;
-
-	bch2_trans_init(&trans, c, 0, 0);
-	c->reflink_gc_nr = 0;
-
-	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
-			   BTREE_ITER_PREFETCH, k, ret) {
-		const __le64 *refcount = bkey_refcount_c(k);
-
-		if (!refcount)
-			continue;
-
-		r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
-				       GFP_KERNEL);
-		if (!r) {
-			ret = -ENOMEM;
-			break;
-		}
-
-		r->offset	= k.k->p.offset;
-		r->size		= k.k->size;
-		r->refcount	= 0;
-	}
-	bch2_trans_iter_exit(&trans, &iter);
-
-	bch2_trans_exit(&trans);
-	return ret;
+	genradix_free(&c->gc_stripes);
 }
 
 /**
@@ -1565,11 +1596,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 	/* flush interior btree updates: */
 	closure_wait_event(&c->btree_interior_update_wait,
 			   !bch2_btree_interior_updates_nr_pending(c));
-again:
+
 	ret   = bch2_gc_start(c, metadata_only) ?:
 		bch2_gc_reflink_start(c, initial, metadata_only);
 	if (ret)
 		goto out;
+again:
+	gc_pos_set(c, gc_phase(GC_PHASE_START));
 
 	bch2_mark_superblocks(c);
 
@@ -1607,25 +1640,26 @@ again:
 
 	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
 	    (!iter && bch2_test_restart_gc)) {
+		if (iter++ > 2) {
+			bch_info(c, "Unable to fix bucket gens, looping");
+			ret = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * XXX: make sure gens we fixed got saved
 		 */
-		if (iter++ <= 2) {
-			bch_info(c, "Second GC pass needed, restarting:");
-			clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-			__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+		bch_info(c, "Second GC pass needed, restarting:");
+		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
-			percpu_down_write(&c->mark_lock);
-			bch2_gc_free(c);
-			percpu_up_write(&c->mark_lock);
-			/* flush fsck errors, reset counters */
-			bch2_flush_fsck_errs(c);
+		bch2_gc_stripes_reset(c, initial, metadata_only);
+		bch2_gc_alloc_reset(c, initial, metadata_only);
+		bch2_gc_reflink_reset(c, initial, metadata_only);
 
-			goto again;
-		}
-
-		bch_info(c, "Unable to fix bucket gens, looping");
-		ret = -EINVAL;
+		/* flush fsck errors, reset counters */
+		bch2_flush_fsck_errs(c);
+		goto again;
 	}
 out:
 	if (!ret) {
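With gc_start()/gc_reflink_start() hoisted out of the retry loop, a repeat GC pass now only resets the cheap per-pass state (stripes, alloc, reflink counters) and gives up after a bounded number of attempts instead of redoing the expensive setup. A minimal sketch of that loop shape, with hypothetical helpers standing in for the real GC passes:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins: one GC pass, and the cheap per-pass reset. */
static bool pass_is_clean(int pass) { return pass >= 1; /* converges on 2nd pass */ }
static void reset_pass_state(void) { printf("reset per-pass counters\n"); }

static int run_gc(void)
{
	int pass = 0;

	/* expensive setup (gc_start, reflink_start) would happen once, here */
	while (!pass_is_clean(pass)) {
		if (pass++ > 2)
			return -1;	/* "Unable to fix bucket gens, looping" */
		reset_pass_state();
	}
	return 0;
}

int main(void)
{
	printf("%d\n", run_gc());
	return 0;
}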
iter->pos.snapshot != iter->snapshot); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -711,6 +719,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); bch2_btree_path_verify(trans, iter->path); } @@ -1962,7 +1972,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > path->locks_want) { path->locks_want = locks_want; - btree_path_get_locks(trans, path, true, _THIS_IP_); + btree_path_get_locks(trans, path, true); } return path; @@ -2099,6 +2109,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) __bch2_btree_path_unlock(path); path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); btree_trans_restart(trans); ret = -EINTR; goto err; @@ -2182,6 +2194,23 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } +static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if ((cmp_int(btree_id, i->btree_id) ?: + bpos_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->btree_id) + return i->k; + break; + } + + return NULL; +} + static noinline struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, struct btree_path *path) @@ -2218,21 +2247,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, return k; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; int ret; EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); while (1) { iter->path = btree_path_set_pos(trans, iter->path, search_key, @@ -2277,24 +2300,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) } if (likely(k.k)) { - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { - search_key = bkey_successor(iter, k.k->p); - continue; - } - break; } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ @@ -2306,6 +2311,92 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) goto out; } } +out: + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + int ret; + + if (iter->update_path) { 
+		bch2_path_put(trans, iter->update_path,
+			      iter->flags & BTREE_ITER_INTENT);
+		iter->update_path = NULL;
+	}
+
+	bch2_btree_iter_verify_entry_exit(iter);
+
+	while (1) {
+		k = __bch2_btree_iter_peek(iter, search_key);
+		if (!k.k || bkey_err(k))
+			goto out;
+
+		if (iter->update_path &&
+		    bkey_cmp(iter->update_path->pos, k.k->p)) {
+			bch2_path_put(trans, iter->update_path,
+				      iter->flags & BTREE_ITER_INTENT);
+			iter->update_path = NULL;
+		}
+
+		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+		    (iter->flags & BTREE_ITER_INTENT) &&
+		    !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+		    !iter->update_path) {
+			struct bpos pos = k.k->p;
+
+			if (pos.snapshot < iter->snapshot) {
+				search_key = bpos_successor(k.k->p);
+				continue;
+			}
+
+			pos.snapshot = iter->snapshot;
+
+			/*
+			 * advance, same as on exit for iter->path, but only up
+			 * to snapshot
+			 */
+			__btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+			iter->update_path = iter->path;
+
+			iter->update_path = btree_path_set_pos(trans,
+						iter->update_path, pos,
+						iter->flags & BTREE_ITER_INTENT,
+						btree_iter_ip_allocated(iter));
+
+			BUG_ON(!(iter->update_path->nodes_locked & 1));
+			iter->update_path->should_be_locked = true;
+		}
+
+		/*
+		 * We can never have a key in a leaf node at POS_MAX, so
+		 * we don't have to check these successor() calls:
+		 */
+		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+		    !bch2_snapshot_is_ancestor(trans->c,
+					       iter->snapshot,
+					       k.k->p.snapshot)) {
+			search_key = bpos_successor(k.k->p);
+			continue;
+		}
+
+		if (bkey_whiteout(k.k) &&
+		    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+			search_key = bkey_successor(iter, k.k->p);
+			continue;
+		}
+
+		break;
+	}
 
 	/*
 	 * iter->pos should be mononotically increasing, and always be equal to
@@ -2316,21 +2407,27 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 	else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
 		iter->pos = bkey_start_pos(k.k);
 
-	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
-		iter->pos.snapshot = iter->snapshot;
-
 	iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
 				iter->flags & BTREE_ITER_INTENT,
 				btree_iter_ip_allocated(iter));
 	BUG_ON(!iter->path->nodes_locked);
 out:
+	if (iter->update_path) {
+		BUG_ON(!(iter->update_path->nodes_locked & 1));
+		iter->update_path->should_be_locked = true;
+	}
 	iter->path->should_be_locked = true;
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
+
+	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+		iter->pos.snapshot = iter->snapshot;
+
 	ret = bch2_btree_iter_verify_ret(iter, k);
-	if (unlikely(ret))
-		return bkey_s_c_err(ret);
+	if (unlikely(ret)) {
+		bch2_btree_iter_set_pos(iter, iter->pos);
+		k = bkey_s_c_err(ret);
+	}
+
+	bch2_btree_iter_verify_entry_exit(iter);
 
 	return k;
 }
@@ -2720,7 +2817,11 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
 	if (iter->path)
 		bch2_path_put(trans, iter->path,
 			      iter->flags & BTREE_ITER_INTENT);
+	if (iter->update_path)
+		bch2_path_put(trans, iter->update_path,
+			      iter->flags & BTREE_ITER_INTENT);
 	iter->path = NULL;
+	iter->update_path = NULL;
 }
 
 static void __bch2_trans_iter_init(struct btree_trans *trans,
@@ -2750,6 +2851,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
 
 	iter->trans	= trans;
 	iter->path	= NULL;
+	iter->update_path = NULL;
 	iter->btree_id	= btree_id;
 	iter->min_depth	= depth;
 	iter->flags	= flags;
@@ -2798,6 +2900,8 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
 	*dst = *src;
 	if (src->path)
 		__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+	if (src->update_path)
+		__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
 }
 
 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index eceec5d5..5205d53c 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -222,11 +222,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
 bool bch2_btree_iter_advance(struct btree_iter *);
 bool bch2_btree_iter_rewind(struct btree_iter *);
 
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
 {
-	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
-		new_pos.snapshot = iter->snapshot;
-
 	iter->k.type = KEY_TYPE_deleted;
 	iter->k.p.inode		= iter->pos.inode	= new_pos.inode;
 	iter->k.p.offset	= iter->pos.offset	= new_pos.offset;
@@ -234,6 +231,19 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
 	iter->k.size = 0;
 }
 
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+	if (unlikely(iter->update_path))
+		bch2_path_put(iter->trans, iter->update_path,
+			      iter->flags & BTREE_ITER_INTENT);
+	iter->update_path = NULL;
+
+	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+		new_pos.snapshot = iter->snapshot;
+
+	__bch2_btree_iter_set_pos(iter, new_pos);
+}
+
 static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
 {
 	BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
@@ -295,7 +305,7 @@ static inline int bkey_err(struct bkey_s_c k)
 	return PTR_ERR_OR_ZERO(k.k);
 }
 
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
 						     unsigned flags)
 {
 	return flags & BTREE_ITER_SLOTS
@@ -316,7 +326,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 	struct bkey_s_c k;
 
 	while (btree_trans_too_many_iters(trans) ||
-	       (k = __bch2_btree_iter_peek(iter, flags),
+	       (k = bch2_btree_iter_peek_type(iter, flags),
 		bkey_err(k) == -EINTR))
 		bch2_trans_begin(trans);
 
@@ -335,7 +345,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 			   _start, _flags, _k, _ret)			\
 	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
 				  (_start), (_flags));			\
-	     (_k) = __bch2_btree_iter_peek(&(_iter), _flags),		\
+	     (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),	\
 	     !((_ret) = bkey_err(_k)) && (_k).k;			\
 	     bch2_btree_iter_advance(&(_iter)))
 
@@ -347,7 +357,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 
 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret)	\
 	for (;								\
-	     (_k) = __bch2_btree_iter_peek(&(_iter), _flags),		\
+	     (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),	\
 	     !((_ret) = bkey_err(_k)) && (_k).k;			\
 	     bch2_btree_iter_advance(&(_iter)))
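The renamed bch2_btree_iter_peek_type() keeps the for_each_btree_key() macros working: their loop condition is a comma expression that peeks, extracts any error, and stops on either an error or a NULL key. The same control-flow pattern in a self-contained toy (toy types, not the bcachefs API):

#include <stdio.h>

struct toy_key { int valid; int v; };

/* Stand-in for peek: three keys, then "no more keys". */
static struct toy_key toy_peek(int pos)
{
	struct toy_key k = { .valid = pos < 3, .v = pos * 10 };
	return k;
}

/* Comma expression in the condition: assign the key, then test it. */
#define for_each_toy_key(_pos, _k)			\
	for ((_pos) = 0;				\
	     ((_k) = toy_peek(_pos), (_k).valid);	\
	     (_pos)++)

int main(void)
{
	int pos;
	struct toy_key k;

	for_each_toy_key(pos, k)
		printf("%d\n", k.v);
	return 0;
}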
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 1d7b1012..faed51e7 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -222,7 +222,8 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 		goto err;
 
 	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
-		trace_transaction_restart_ip(trans->fn, _THIS_IP_);
+		trace_trans_restart_relock_key_cache_fill(trans->fn,
+				_THIS_IP_, ck_path->btree_id, &ck_path->pos);
 		ret = btree_trans_restart(trans);
 		goto err;
 	}
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 914d536c..65f460e3 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -276,6 +276,7 @@ static inline struct btree_path_level *path_l(struct btree_path *path)
 struct btree_iter {
 	struct btree_trans	*trans;
 	struct btree_path	*path;
+	struct btree_path	*update_path;
 
 	enum btree_id		btree_id:4;
 	unsigned		min_depth:4;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 16ebf1a2..5e5a1b5e 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -73,8 +73,14 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
 int bch2_btree_node_update_key_get_iter(struct btree_trans *,
 					struct btree *, struct bkey_i *, bool);
 
+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
+			     struct bkey_i *, enum btree_update_flags);
+
+int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+					   struct bkey_i *, enum btree_update_flags);
 int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
 				   struct bkey_i *, enum btree_update_flags);
+
 void bch2_trans_commit_hook(struct btree_trans *,
 			    struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *);
@@ -135,21 +141,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
 	     (_i) < (_trans)->updates + (_trans)->nr_updates;		\
 	     (_i)++)
 
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
-						      enum btree_id btree_id,
-						      struct bpos pos)
-{
-	struct btree_insert_entry *i;
-
-	trans_for_each_update(trans, i)
-		if ((cmp_int(btree_id,	i->btree_id) ?:
-		     bpos_cmp(pos,	i->k->k.p)) <= 0) {
-			if (btree_id == i->btree_id)
-				return i->k;
-			break;
-		}
-
-	return NULL;
-}
-
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
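btree_trans_peek_updates(), moved out of this header and into btree_iter.c above, scans the transaction's update list, which is kept sorted by (btree_id, pos), and returns the first pending update at or after pos in the same btree. The same search over a plain sorted array, with toy types standing in for btree IDs and positions:

#include <stdio.h>
#include <stddef.h>

struct update { int btree_id; int pos; };

/* First update with (btree_id, pos) >= the search key, but only if it
 * belongs to the same btree; mirrors btree_trans_peek_updates(). */
static const struct update *peek_updates(const struct update *u, size_t nr,
					 int btree_id, int pos)
{
	size_t i;

	for (i = 0; i < nr; i++)
		if (u[i].btree_id > btree_id ||
		    (u[i].btree_id == btree_id && u[i].pos >= pos)) {
			if (u[i].btree_id == btree_id)
				return &u[i];
			break;	/* first candidate is in a later btree */
		}
	return NULL;
}

int main(void)
{
	const struct update u[] = { {0, 5}, {1, 2}, {1, 9}, {2, 1} };
	const struct update *r = peek_updates(u, 4, 1, 3);

	printf("%d:%d\n", r->btree_id, r->pos);	/* prints 1:9 */
	return 0;
}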
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 47568a0b..7b8ca115 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1938,6 +1938,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
 	ret = bch2_trans_commit(trans, NULL, NULL,
 				BTREE_INSERT_NOFAIL|
 				BTREE_INSERT_NOCHECK_RW|
+				BTREE_INSERT_USE_RESERVE|
 				BTREE_INSERT_JOURNAL_RECLAIM|
 				BTREE_INSERT_JOURNAL_RESERVED);
 	if (ret)
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index ca98e685..7186457d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -828,7 +828,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
 	struct bch_fs *c = trans->c;
 	int ret;
 
-	if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+	if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+	    test_bit(BCH_FS_STARTED, &c->flags))
 		return -EROFS;
 
 	bch2_trans_unlock(trans);
@@ -844,14 +845,89 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
 	return 0;
 }
 
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+			   bool overwrite)
 {
 	struct bkey		_deleted = KEY(0, 0, 0);
 	struct bkey_s_c		deleted = (struct bkey_s_c) { &_deleted, NULL };
 	struct bkey_s_c		old;
 	struct bkey		unpacked;
-	struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+	int ret = 0;
+
+	if ((i->flags & BTREE_TRIGGER_NORUN) ||
+	    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+		return 0;
+
+	if (!overwrite) {
+		if (i->insert_trigger_run)
+			return 0;
+
+		BUG_ON(i->overwrite_trigger_run);
+		i->insert_trigger_run = true;
+	} else {
+		if (i->overwrite_trigger_run)
+			return 0;
+
+		BUG_ON(!i->insert_trigger_run);
+		i->overwrite_trigger_run = true;
+	}
+
+	old = bch2_btree_path_peek_slot(i->path, &unpacked);
+	_deleted.p = i->path->pos;
+
+	if (overwrite) {
+		ret = bch2_trans_mark_key(trans, old, deleted,
+				BTREE_TRIGGER_OVERWRITE|i->flags);
+	} else if (old.k->type == i->k->k.type &&
+		   ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+		i->overwrite_trigger_run = true;
+		ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
+				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
+	} else {
+		ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
+				BTREE_TRIGGER_INSERT|i->flags);
+	}
+
+	if (ret == -EINTR)
+		trace_trans_restart_mark(trans->fn, _RET_IP_,
+					 i->btree_id, &i->path->pos);
+	return ret ?: 1;
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+			      struct btree_insert_entry *btree_id_start)
+{
+	struct btree_insert_entry *i;
 	bool trans_trigger_run;
+	int ret, overwrite;
+
+	for (overwrite = 0; overwrite < 2; overwrite++) {
+
+		/*
+		 * Running triggers will append more updates to the list of updates as
+		 * we're walking it:
+		 */
+		do {
+			trans_trigger_run = false;
+
+			for (i = btree_id_start;
+			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+			     i++) {
+				ret = run_one_trigger(trans, i, overwrite);
+				if (ret < 0)
+					return ret;
+				if (ret)
+					trans_trigger_run = true;
+			}
+		} while (trans_trigger_run);
+	}
+
+	return 0;
+}
+
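run_btree_triggers() re-walks the update list until a whole pass runs no trigger, because a trigger may append further updates while the list is being walked. That fixed-point shape in miniature, with a toy trigger that grows the list once (illustrative only):

#include <stdbool.h>
#include <stdio.h>

static int list[16] = { 1, 2 };
static int nr = 2;
static bool ran[16];

/* Toy trigger: runs each update once; running update "2" appends "3". */
static bool run_one(int i)
{
	if (ran[i])
		return false;
	ran[i] = true;
	if (list[i] == 2)
		list[nr++] = 3;
	return true;
}

int main(void)
{
	bool any;

	do {			/* keep passing until nothing new ran */
		int i;

		any = false;
		for (i = 0; i < nr; i++)	/* nr can grow mid-pass */
			if (run_one(i))
				any = true;
	} while (any);

	printf("%d updates\n", nr);	/* prints: 3 updates */
	return 0;
}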
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
 	unsigned btree_id = 0;
 	int ret = 0;
 
@@ -867,76 +943,9 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
 		       btree_id_start->btree_id < btree_id)
 			btree_id_start++;
 
-		/*
-		 * Running triggers will append more updates to the list of updates as
-		 * we're walking it:
-		 */
-		do {
-			trans_trigger_run = false;
-
-			for (i = btree_id_start;
-			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
-			     i++) {
-				if (i->insert_trigger_run ||
-				    (i->flags & BTREE_TRIGGER_NORUN) ||
-				    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-					continue;
-
-				BUG_ON(i->overwrite_trigger_run);
-
-				i->insert_trigger_run = true;
-				trans_trigger_run = true;
-
-				old = bch2_btree_path_peek_slot(i->path, &unpacked);
-				_deleted.p = i->path->pos;
-
-				if (old.k->type == i->k->k.type &&
-				    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-					i->overwrite_trigger_run = true;
-					ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
-							BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
-				} else {
-					ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
-							BTREE_TRIGGER_INSERT|i->flags);
-				}
-
-				if (ret == -EINTR)
-					trace_trans_restart_mark(trans->fn, _RET_IP_,
-							i->btree_id, &i->path->pos);
-				if (ret)
-					return ret;
-			}
-		} while (trans_trigger_run);
-
-		do {
-			trans_trigger_run = false;
-
-			for (i = btree_id_start;
-			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
-			     i++) {
-				if (i->overwrite_trigger_run ||
-				    (i->flags & BTREE_TRIGGER_NORUN) ||
-				    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-					continue;
-
-				BUG_ON(!i->insert_trigger_run);
-
-				i->overwrite_trigger_run = true;
-				trans_trigger_run = true;
-
-				old = bch2_btree_path_peek_slot(i->path, &unpacked);
-				_deleted.p = i->path->pos;
-
-				ret = bch2_trans_mark_key(trans, old, deleted,
-						BTREE_TRIGGER_OVERWRITE|i->flags);
-
-				if (ret == -EINTR)
-					trace_trans_restart_mark(trans->fn, _RET_IP_,
-							i->btree_id, &i->path->pos);
-				if (ret)
-					return ret;
-			}
-		} while (trans_trigger_run);
+		ret = run_btree_triggers(trans, btree_id, btree_id_start);
+		if (ret)
+			return ret;
 	}
 
 	trans_for_each_update(trans, i)
@@ -1072,6 +1081,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
 	struct bkey_s_c k;
 	int ret;
 
+	if (!btree_type_has_snapshots(id))
+		return 0;
+
 	if (!snapshot_t(c, pos.snapshot)->children[0])
 		return 0;
 
@@ -1100,10 +1112,10 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
 	return ret;
 }
 
-static int bch2_trans_update_extent(struct btree_trans *trans,
-				    struct btree_iter *orig_iter,
-				    struct bkey_i *insert,
-				    enum btree_update_flags flags)
+int bch2_trans_update_extent(struct btree_trans *trans,
+			     struct btree_iter *orig_iter,
+			     struct bkey_i *insert,
+			     enum btree_update_flags flags)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter, update_iter;
@@ -1261,13 +1273,9 @@ nomerge1:
 		bkey_reassemble(update, k);
 		bch2_cut_front(insert->k.p, update);
 
-		bch2_trans_copy_iter(&update_iter, &iter);
-		update_iter.pos = update->k.p;
-		ret   = bch2_trans_update(trans, &update_iter, update,
+		ret = bch2_trans_update_by_path(trans, iter.path, update,
 					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
 					  flags);
-		bch2_trans_iter_exit(trans, &update_iter);
-
 		if (ret)
 			goto err;
 		goto out;
@@ -1350,26 +1358,23 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
 	return ret;
 }
 
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
 				   struct bkey_i *k, enum btree_update_flags flags)
 {
 	struct btree_insert_entry *i, n;
 
-	BUG_ON(!iter->path->should_be_locked);
-
-	if (iter->flags & BTREE_ITER_IS_EXTENTS)
-		return bch2_trans_update_extent(trans, iter, k, flags);
+	BUG_ON(!path->should_be_locked);
 
 	BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
-	BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+	BUG_ON(bpos_cmp(k->k.p, path->pos));
 
 	n = (struct btree_insert_entry) {
 		.flags		= flags,
-		.bkey_type	= __btree_node_type(iter->path->level, iter->btree_id),
-		.btree_id	= iter->btree_id,
-		.level		= iter->path->level,
-		.cached		= iter->flags & BTREE_ITER_CACHED,
-		.path		= iter->path,
+		.bkey_type	= __btree_node_type(path->level, path->btree_id),
+		.btree_id	= path->btree_id,
+		.level		= path->level,
+		.cached		= path->cached,
+		.path		= path,
 		.k		= k,
 		.ip_allocated	= _RET_IP_,
 	};
@@ -1380,16 +1385,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 			btree_insert_entry_cmp(i - 1, i) >= 0);
 #endif
 
-	if (bkey_deleted(&n.k->k) &&
-	    (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-		int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
-		if (unlikely(ret < 0))
-			return ret;
-
-		if (ret)
-			n.k->k.type = KEY_TYPE_whiteout;
-	}
-
 	/*
	 * Pending updates are kept sorted: first, find position of new update,
	 * then delete/trim any updates the new update overwrites:
	 */
@@ -1420,10 +1415,29 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 			i - trans->updates, n);
 
 	__btree_path_get(n.path, true);
-
 	return 0;
 }
 
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+				   struct bkey_i *k, enum btree_update_flags flags)
+{
+	if (iter->flags & BTREE_ITER_IS_EXTENTS)
+		return bch2_trans_update_extent(trans, iter, k, flags);
+
+	if (bkey_deleted(&k->k) &&
+	    (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+		int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+		if (unlikely(ret < 0))
+			return ret;
+
+		if (ret)
+			k->k.type = KEY_TYPE_whiteout;
+	}
+
+	return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
+					 k, flags);
+}
+
 void bch2_trans_commit_hook(struct btree_trans *trans,
 			    struct btree_trans_commit_hook *h)
 {
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 895ff255..bf5ad436 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -11,6 +11,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "ec.h"
 #include "error.h"
 #include "inode.h"
@@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
 	}
 }
 
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
-	u64 journal_seq = atomic64_read(&c->journal.seq);
-	u16 last_seq_ondisk = c->journal.flushed_seq_ondisk;
-	struct bch_dev *ca;
-	struct bucket_array *buckets;
-	struct bucket *g;
-	struct bucket_mark m;
-	unsigned i;
-
-	if (journal_seq - c->last_bucket_seq_cleanup <
-	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
-		return;
-
-	c->last_bucket_seq_cleanup = journal_seq;
-
-	for_each_member_device(ca, c, i) {
-		down_read(&ca->bucket_lock);
-		buckets = bucket_array(ca);
-
-		for_each_bucket(g, buckets) {
-			bucket_cmpxchg(g, m, ({
-				if (!m.journal_seq_valid ||
-				    bucket_needs_journal_commit(m, last_seq_ondisk))
-					break;
-
-				m.journal_seq_valid = 0;
-			}));
-		}
-		up_read(&ca->bucket_lock);
-	}
-}
-
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
 	struct bch_fs_usage *usage;
@@ -323,8 +287,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
 					    struct bucket_mark m)
 {
-	return bucket_sectors_used(m)
-		? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+	return m.dirty_sectors
+		? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
 		: 0;
 }
 
@@ -570,16 +534,24 @@ static int bch2_mark_alloc(struct btree_trans *trans,
 		v->journal_seq = cpu_to_le64(new_u.journal_seq);
 	}
 
-	ca = bch_dev_bkey_exists(c, new.k->p.inode);
+	if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+		ret = bch2_set_bucket_needs_journal_commit(c,
+				new_u.dev, new_u.bucket,
+				new_u.journal_seq);
+		if (ret)
+			return ret;
+	}
 
-	if (new.k->p.offset >= ca->mi.nbuckets)
+	ca = bch_dev_bkey_exists(c, new_u.dev);
+
+	if (new_u.bucket >= ca->mi.nbuckets)
 		return 0;
 
 	percpu_down_read(&c->mark_lock);
 	if (!gc && new_u.gen != old_u.gen)
-		*bucket_gen(ca, new.k->p.offset) = new_u.gen;
+		*bucket_gen(ca, new_u.bucket) = new_u.gen;
 
-	g = __bucket(ca, new.k->p.offset, gc);
+	g = __bucket(ca, new_u.bucket, gc);
 
 	old_m = bucket_cmpxchg(g, m, ({
 		m.gen			= new_u.gen;
@@ -587,11 +559,6 @@ static int bch2_mark_alloc(struct btree_trans *trans,
 		m.dirty_sectors		= new_u.dirty_sectors;
 		m.cached_sectors	= new_u.cached_sectors;
 		m.stripe		= new_u.stripe != 0;
-
-		if (journal_seq) {
-			m.journal_seq_valid	= 1;
-			m.journal_seq		= journal_seq;
-		}
 	}));
 
 	bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
@@ -619,7 +586,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
 			return ret;
 		}
 
-		trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+		trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
 				 old_m.cached_sectors);
 	}
 
@@ -767,9 +734,10 @@ static int check_bucket_ref(struct bch_fs *c,
 
 static int mark_stripe_bucket(struct btree_trans *trans,
 			      struct bkey_s_c k,
 			      unsigned ptr_idx,
-			      u64 journal_seq, unsigned flags)
+			      unsigned flags)
 {
 	struct bch_fs *c = trans->c;
+	u64 journal_seq = trans->journal_res.seq;
 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
 	unsigned nr_data = s->nr_blocks - s->nr_redundant;
 	bool parity = ptr_idx >= nr_data;
@@ -810,11 +778,6 @@ static int mark_stripe_bucket(struct btree_trans *trans,
 		if (data_type)
 			new.data_type		= data_type;
 
-		if (journal_seq) {
-			new.journal_seq_valid	= 1;
-			new.journal_seq		= journal_seq;
-		}
-
 		new.stripe = true;
 	}));
 
@@ -886,11 +849,6 @@ static int bch2_mark_pointer(struct btree_trans *trans,
 
 		new.data_type = bucket_data_type;
 
-		if (journal_seq) {
-			new.journal_seq_valid = 1;
-			new.journal_seq = journal_seq;
-		}
-
 		if (flags & BTREE_TRIGGER_NOATOMIC) {
 			g->_mark = new;
 			break;
@@ -1111,7 +1069,7 @@ static int bch2_mark_stripe(struct btree_trans *trans,
 		memset(m->block_sectors, 0, sizeof(m->block_sectors));
 
 		for (i = 0; i < new_s->nr_blocks; i++) {
-			ret = mark_stripe_bucket(trans, new, i, journal_seq, flags);
+			ret = mark_stripe_bucket(trans, new, i, flags);
 			if (ret)
 				return ret;
 		}
@@ -1459,24 +1417,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-	struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-	struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
+	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+	bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
+			     POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
+			     BTREE_ITER_WITH_UPDATES|
 			     BTREE_ITER_CACHED|
-			     BTREE_ITER_CACHED_NOFILL|
 			     BTREE_ITER_INTENT);
-	ret = bch2_btree_iter_traverse(iter);
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
 	if (ret) {
 		bch2_trans_iter_exit(trans, iter);
 		return ret;
 	}
 
-	*u = update && !bpos_cmp(update->k.p, pos)
-		? bch2_alloc_unpack(bkey_i_to_s_c(update))
-		: alloc_mem_to_key(c, iter);
-
+	*u = bch2_alloc_unpack(k);
 	return 0;
 }
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 45c6d230..d35c96bc 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -149,23 +149,11 @@ static inline u8 ptr_stale(struct bch_dev *ca,
 
 /* bucket gc marks */
 
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
-{
-	return mark.dirty_sectors + mark.cached_sectors;
-}
-
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
 	return !mark.dirty_sectors && !mark.stripe;
 }
 
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
-					       u16 last_seq_ondisk)
-{
-	return m.journal_seq_valid &&
-		((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
 /* Device usage: */
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
@@ -240,7 +228,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 /* key/bucket marking: */
 
-void bch2_bucket_seq_cleanup(struct bch_fs *);
 void bch2_fs_usage_initialize(struct bch_fs *);
 
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 18bca269..24139831 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -15,18 +15,9 @@ struct bucket_mark {
 			u8		gen;
 			u8		data_type:3,
 					owned_by_allocator:1,
-					journal_seq_valid:1,
 					stripe:1;
 			u16		dirty_sectors;
 			u16		cached_sectors;
-
-			/*
-			 * low bits of journal sequence number when this bucket was most
-			 * recently modified: if journal_seq_valid is set, this bucket can't be
-			 * reused until the journal sequence number written to disk is >= the
-			 * bucket's journal sequence number:
-			 */
-			u16		journal_seq;
 		};
 	};
 };
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 00000000..33ae6370
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/jhash.h>
+
+static u32 hash_seeds[] = {
+	2168153708,
+	1262039142,
+	1183479835,
+};
+
+static inline unsigned bucket_hash(u64 dev_bucket, unsigned hash_seed_idx)
+{
+	return jhash_2words(dev_bucket << 32, dev_bucket, hash_seeds[hash_seed_idx]);
+}
+
+bool bch2_bucket_needs_journal_commit(struct bch_fs *c,
+				      u64 flushed_seq,
+				      unsigned dev, u64 bucket)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+	u64 dev_bucket = (u64) dev << 56 | bucket;
+	bool ret = false;
+	unsigned i;
+
+	mutex_lock(&b->lock);
+	BUG_ON(!is_power_of_2(b->nr));
+
+	for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
+		u32 h = bucket_hash(dev_bucket, i) & (b->nr - 1);
+
+		if (b->d[h].dev_bucket == dev_bucket) {
+			ret = b->d[h].journal_seq > flushed_seq;
+			break;
+		}
+	}
+
+	mutex_unlock(&b->lock);
+
+	return ret;
+}
+
+static int bch2_buckets_waiting_for_journal_rehash(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+	u64 flushed_seq = c->journal.flushed_seq_ondisk;
+	unsigned i, j, h, new_nr = b->nr * 2, elements = 0;
+	struct bucket_hashed *new_table;
+
+	new_table = kvmalloc_array(new_nr, sizeof(*new_table), GFP_KERNEL|__GFP_ZERO);
+	if (!new_table)
+		return -ENOMEM;
+
+	for (i = 0; i < b->nr; i++) {
+		if (b->d[i].journal_seq < flushed_seq)
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(hash_seeds); j++) {
+			h = bucket_hash(b->d[i].dev_bucket, j);
+			if ((h & (b->nr - 1)) == i)
+				break;
+		}
+
+		BUG_ON(j == ARRAY_SIZE(hash_seeds));
+		BUG_ON(new_table[h & (new_nr - 1)].dev_bucket);
+
+		new_table[h & (new_nr - 1)] = b->d[i];
+
+		elements++;
+	}
+
+	kvfree(b->d);
+	b->nr	= new_nr;
+	b->d	= new_table;
+	return 0;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct bch_fs *c, unsigned dev, u64 bucket,
+					 u64 journal_seq)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+	struct bucket_hashed new = {
+		.dev_bucket	= (u64) dev << 56 | bucket,
+		.journal_seq	= journal_seq,
+	}, *last_evicted = NULL;
+	u64 flushed_seq = c->journal.flushed_seq_ondisk;
+	unsigned tries, i;
+	int ret = 0;
+
+	mutex_lock(&b->lock);
+	BUG_ON(!is_power_of_2(b->nr));
+retry:
+	for (tries = 0; tries < 5; tries++) {
+		struct bucket_hashed *old, *victim = NULL;
+
+		for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
+			old = b->d + (bucket_hash(new.dev_bucket, i) & (b->nr - 1));
+
+			if (old->dev_bucket == new.dev_bucket ||
+			    old->journal_seq <= flushed_seq) {
+				*old = new;
+				goto out;
+			}
+
+			if (last_evicted != old)
+				victim = old;
+		}
+
+		/* Failed to find an empty slot: */
+		swap(new, *victim);
+		last_evicted = victim;
+	}
+
+	ret = bch2_buckets_waiting_for_journal_rehash(c);
+	if (!ret)
+		goto retry;
+out:
+	mutex_unlock(&b->lock);
+
+	return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+	kvfree(b->d);
+}
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+	mutex_init(&b->lock);
+
+	b->nr = 8;
+	b->d  = kvmalloc_array(b->nr, sizeof(*b->d), GFP_KERNEL|__GFP_ZERO);
+	if (!b->d)
+		return -ENOMEM;
+
+	return 0;
+}
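The table above is a small cuckoo-style hash: each dev:bucket key (packed as dev << 56 | bucket) has one candidate slot per seed, entries whose journal_seq is already flushed count as free, and an insert displaces a victim for a bounded number of retries before the table is rehashed at double the size. A compressed userspace analogue of the insert path, with illustrative seeds, sizes, and a simple multiplicative hash in place of jhash (not the kernel code):

#include <stdint.h>
#include <stdio.h>

#define NR_SEEDS 3
static const uint64_t seeds[NR_SEEDS] = { 0x9e3779b9, 0x85ebca6b, 0xc2b2ae35 };

struct entry { uint64_t key, seq; };
static struct entry table[8];
static unsigned nr = 8;
static uint64_t flushed_seq;

static unsigned slot(uint64_t key, unsigned seed_idx)
{
	return (unsigned) (((key ^ seeds[seed_idx]) * 0x9e3779b97f4a7c15ULL) >> 32) & (nr - 1);
}

/* Insert with bounded cuckoo eviction; returns -1 where the real code
 * would grow the table, reinsert live entries, and retry. */
static int set_needs_flush(uint64_t key, uint64_t seq)
{
	struct entry new = { key, seq }, *last_evicted = NULL;
	unsigned tries, i;

	for (tries = 0; tries < 5; tries++) {
		struct entry *victim = NULL;

		for (i = 0; i < NR_SEEDS; i++) {
			struct entry *e = &table[slot(new.key, i)];

			/* same key, or a slot whose entry is already flushed: */
			if (e->key == new.key || e->seq <= flushed_seq) {
				*e = new;
				return 0;
			}
			if (e != last_evicted)
				victim = e;
		}
		if (!victim)	/* guard for the degenerate all-same-slot case */
			victim = &table[slot(new.key, 0)];

		/* all candidate slots live: displace one and re-place it */
		struct entry tmp = *victim;
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}
	return -1;	/* caller would rehash and retry */
}

int main(void)
{
	flushed_seq = 10;
	set_needs_flush((3ULL << 56) | 42, 17);
	printf("%s\n", set_needs_flush((3ULL << 56) | 42, 19) ? "rehash" : "ok");
	return 0;
}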
diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 00000000..079a591c
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct bch_fs *, u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct bch_fs *, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 00000000..99d17ffb
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+struct bucket_hashed {
+	u64			dev_bucket;
+	u64			journal_seq;
+};
+
+struct buckets_waiting_for_journal {
+	struct mutex		lock;
+	size_t			nr;
+	struct bucket_hashed	*d;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index ef6da535..3a7c1468 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -585,62 +585,49 @@ found_slot:
 static int bch2_inode_delete_keys(struct btree_trans *trans,
 				  subvol_inum inum, enum btree_id id)
 {
-	u64 offset = 0;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_i delete;
+	u32 snapshot;
 	int ret = 0;
 
-	while (!ret || ret == -EINTR) {
-		struct disk_reservation disk_res =
-			bch2_disk_reservation_init(trans->c, 0);
-		struct btree_iter iter;
-		struct bkey_s_c k;
-		struct bkey_i delete;
-		u32 snapshot;
+	/*
+	 * We're never going to be deleting extents, no need to use an extent
+	 * iterator:
+	 */
+	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+			     BTREE_ITER_NOT_EXTENTS|
+			     BTREE_ITER_INTENT);
 
+	while (1) {
 		bch2_trans_begin(trans);
 
 		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 		if (ret)
-			continue;
+			goto err;
+
+		bch2_btree_iter_set_snapshot(&iter, snapshot);
 
-		bch2_trans_iter_init(trans, &iter, id,
-				     SPOS(inum.inum, offset, snapshot),
-				     BTREE_ITER_INTENT);
 		k = bch2_btree_iter_peek(&iter);
-
-		if (!k.k || iter.pos.inode != inum.inum) {
-			bch2_trans_iter_exit(trans, &iter);
-			break;
-		}
-
 		ret = bkey_err(k);
 		if (ret)
 			goto err;
 
+		if (!k.k || iter.pos.inode != inum.inum)
+			break;
+
 		bkey_init(&delete.k);
 		delete.k.p = iter.pos;
 
-		if (btree_node_type_is_extents(iter.btree_id)) {
-			unsigned max_sectors =
-				min_t(u64, U64_MAX - iter.pos.offset,
-				      KEY_SIZE_MAX & (~0 << trans->c->block_bits));
-
-			/* create the biggest key we can */
-			bch2_key_resize(&delete.k, max_sectors);
-
-			ret = bch2_extent_trim_atomic(trans, &iter, &delete);
-			if (ret)
-				goto err;
-		}
-
 		ret   = bch2_trans_update(trans, &iter, &delete, 0) ?:
-			bch2_trans_commit(trans, &disk_res, NULL,
+			bch2_trans_commit(trans, NULL, NULL,
 					  BTREE_INSERT_NOFAIL);
-		bch2_disk_reservation_put(trans->c, &disk_res);
err:
-		offset = iter.pos.offset;
-		bch2_trans_iter_exit(trans, &iter);
+		if (ret && ret != -EINTR)
+			break;
 	}
 
+	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
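bch2_inode_delete_keys() now keeps one iterator alive across the whole walk and simply restarts the transaction on -EINTR instead of reconstructing its position by hand each time. That retry skeleton, reduced to its control flow with stub helpers (userspace sketch, not the bcachefs API):

#include <stdio.h>

#define RESTART 1	/* stands in for -EINTR from a transaction restart */

static int attempts;

/* One delete step: restarts twice (e.g. lock contention), then succeeds. */
static int delete_step(int *done)
{
	if (attempts++ < 2)
		return RESTART;
	*done = 1;
	return 0;
}

int main(void)
{
	int ret, done = 0;

	while (!done) {
		/* bch2_trans_begin(): every iteration is a fresh attempt */
		ret = delete_step(&done);
		if (ret && ret != RESTART)
			break;	/* real error: give up */
	}
	printf("done=%d after %d attempts\n", done, attempts);
	return 0;
}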
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index df4d1a7a..e566f851 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1671,13 +1671,9 @@ retry_alloc:
 		}
 	}
 
-	bch2_bucket_seq_cleanup(c);
-
 	continue_at(cl, do_journal_write, c->io_complete_wq);
 	return;
 no_io:
-	bch2_bucket_seq_cleanup(c);
-
 	continue_at(cl, journal_write_done, c->io_complete_wq);
 	return;
 err:
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index f73be9cb..3e3dcec3 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -700,17 +700,20 @@ static int __bch2_move_data(struct bch_fs *c,
 		bch2_trans_begin(&trans);
 
 		k = bch2_btree_iter_peek(&iter);
-
-		stats->pos = iter.pos;
-
 		if (!k.k)
 			break;
+
 		ret = bkey_err(k);
+		if (ret == -EINTR)
+			continue;
 		if (ret)
 			break;
+
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
+		stats->pos = iter.pos;
+
 		if (!bkey_extent_is_direct_data(k.k))
 			goto next_nondata;
 
@@ -753,10 +756,8 @@ static int __bch2_move_data(struct bch_fs *c,
 			ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
 						data_cmd, data_opts);
 			if (ret2) {
-				if (ret2 == -EINTR) {
-					bch2_trans_begin(&trans);
+				if (ret2 == -EINTR)
 					continue;
-				}
 
 				if (ret2 == -ENOMEM) {
 					/* memory allocation failure, wait for some IO to finish */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 7cd1b0cf..92f78907 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -69,10 +69,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
 				.dev	= p.ptr.dev,
 				.offset	= p.ptr.offset,
 			};
+			ssize_t i;
 
-			ssize_t i = eytzinger0_find_le(h->data, h->used,
-						       sizeof(h->data[0]),
-						       bucket_offset_cmp, &search);
+			if (p.ptr.cached)
+				continue;
+
+			i = eytzinger0_find_le(h->data, h->used,
+					       sizeof(h->data[0]),
+					       bucket_offset_cmp, &search);
 #if 0
 			/* eytzinger search verify code: */
 			ssize_t j = -1, k;
@@ -185,8 +189,7 @@ static int bch2_copygc(struct bch_fs *c)
 
 		if (m.owned_by_allocator ||
 		    m.data_type != BCH_DATA_user ||
-		    !bucket_sectors_used(m) ||
-		    bucket_sectors_used(m) >= ca->mi.bucket_size)
+		    m.dirty_sectors >= ca->mi.bucket_size)
 			continue;
 
 		WARN_ON(m.stripe && !g->stripe_redundancy);
@@ -195,9 +198,9 @@ static int bch2_copygc(struct bch_fs *c)
 			.dev		= dev_idx,
 			.gen		= m.gen,
 			.replicas	= 1 + g->stripe_redundancy,
-			.fragmentation	= bucket_sectors_used(m) * (1U << 15)
+			.fragmentation	= m.dirty_sectors * (1U << 15)
 					/ ca->mi.bucket_size,
-			.sectors	= bucket_sectors_used(m),
+			.sectors	= m.dirty_sectors,
 			.offset		= bucket_to_sector(ca, b),
 		};
 		heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
@@ -231,8 +234,11 @@ static int bch2_copygc(struct bch_fs *c)
 
 	buckets_to_move = h->used;
 
-	if (!buckets_to_move)
+	if (!buckets_to_move) {
+		bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
+				    sectors_reserved);
 		return 0;
+	}
 
 	eytzinger0_sort(h->data, h->used,
 			sizeof(h->data[0]),
@@ -260,8 +266,8 @@ static int bch2_copygc(struct bch_fs *c)
 		m = READ_ONCE(buckets->b[b].mark);
 
 		if (i->gen == m.gen &&
-		    bucket_sectors_used(m)) {
-			sectors_not_moved += bucket_sectors_used(m);
+		    m.dirty_sectors) {
+			sectors_not_moved += m.dirty_sectors;
 			buckets_not_moved++;
 		}
 	}
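With cached sectors no longer tracked per bucket for copygc, the heuristic now keys on m.dirty_sectors alone, scoring fragmentation as a 15-bit fixed-point fraction of the bucket. The scoring arithmetic by itself:

#include <stdio.h>

/* 15-bit fixed point: 0 = empty bucket, 1 << 15 = completely full. */
static unsigned fragmentation(unsigned dirty_sectors, unsigned bucket_size)
{
	return dirty_sectors * (1U << 15) / bucket_size;
}

int main(void)
{
	/* a half-full 512-sector bucket scores 16384 */
	printf("%u\n", fragmentation(256, 512));
	return 0;
}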
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index e4c3fdcd..4abe53df 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -122,6 +122,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32,
 			     struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+			      u32 *, u32 *, unsigned);
+
 int bch2_subvolume_delete(struct btree_trans *, u32);
 int bch2_subvolume_unlink(struct btree_trans *, u32);
 int bch2_subvolume_create(struct btree_trans *, u64, u32,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 577b58e4..586ba60d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -16,6 +16,7 @@
 #include "btree_key_cache.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
 #include "chardev.h"
 #include "checksum.h"
 #include "clock.h"
@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	bch2_fs_ec_exit(c);
 	bch2_fs_encryption_exit(c);
 	bch2_fs_io_exit(c);
+	bch2_fs_buckets_waiting_for_journal_exit(c);
 	bch2_fs_btree_interior_update_exit(c);
 	bch2_fs_btree_iter_exit(c);
 	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
@@ -810,6 +812,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
 	    bch2_fs_btree_iter_init(c) ?:
 	    bch2_fs_btree_interior_update_init(c) ?:
+	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
 	    bch2_fs_subvolumes_init(c) ?:
 	    bch2_fs_io_init(c) ?:
 	    bch2_fs_encryption_init(c) ?:
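
The init chain in bch2_fs_alloc() above relies on GNU C's binary a ?: b operator, which evaluates to a unless it is zero, so the chain yields the first nonzero error code and short-circuits everything after it; note the new entry must end in ?: like its neighbors, since a stray ; mid-chain would silently discard the return values of every call after it. A minimal standalone sketch of the idiom; init_a/init_b/init_c are invented helpers, not bcachefs functions (builds with gcc or clang, as ?: is a GNU extension):

    #include <stdio.h>

    static int init_a(void) { return 0; }   /* succeeds */
    static int init_b(void) { return -5; }  /* fails: chain stops here */
    static int init_c(void) { printf("not reached\n"); return 0; }

    int main(void)
    {
            /* first nonzero result wins; later inits are never called */
            int ret = init_a() ?:
                      init_b() ?:
                      init_c();

            printf("ret = %d\n", ret);      /* prints -5 */
            return 0;
    }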
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 6d159632..ed9a0950 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -192,7 +192,7 @@ read_attribute(new_stripes);
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
-read_attribute(data_op_data_progress);
+read_attribute(data_jobs);
 
 #ifdef CONFIG_BCACHEFS_TESTS
 write_attribute(perf_test);
@@ -230,32 +230,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
 	return nr ? div64_u64(sectors, nr) : 0;
 }
 
-static long stats_to_text(struct printbuf *out, struct bch_fs *c,
-			  struct bch_move_stats *stats)
-{
-	pr_buf(out, "%s: data type %s btree_id %s position: ",
-	       stats->name,
-	       bch2_data_types[stats->data_type],
-	       bch2_btree_ids[stats->btree_id]);
-	bch2_bpos_to_text(out, stats->pos);
-	pr_buf(out, "%s", "\n");
-
-	return 0;
-}
-
 static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
 {
 	long ret = 0;
-	struct bch_move_stats *iter;
+	struct bch_move_stats *stats;
 
 	mutex_lock(&c->data_progress_lock);
-
-	if (list_empty(&c->data_progress_list))
-		pr_buf(out, "%s", "no progress to report\n");
-	else
-		list_for_each_entry(iter, &c->data_progress_list, list) {
-			stats_to_text(out, c, iter);
-		}
+	list_for_each_entry(stats, &c->data_progress_list, list) {
+		pr_buf(out, "%s: data type %s btree_id %s position: ",
+		       stats->name,
+		       bch2_data_types[stats->data_type],
+		       bch2_btree_ids[stats->btree_id]);
+		bch2_bpos_to_text(out, stats->pos);
+		pr_buf(out, "%s", "\n");
+	}
 	mutex_unlock(&c->data_progress_lock);
 
 	return ret;
@@ -463,7 +451,7 @@ SHOW(bch2_fs)
 		return out.pos - buf;
 	}
 
-	if (attr == &sysfs_data_op_data_progress) {
+	if (attr == &sysfs_data_jobs) {
 		data_progress_to_text(&out, c);
 		return out.pos - buf;
 	}
@@ -616,7 +604,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_rebalance_work,
 	sysfs_pd_controller_files(rebalance),
 
-	&sysfs_data_op_data_progress,
+	&sysfs_data_jobs,
 
 	&sysfs_internal_uuid,
 	NULL
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 16d67eb6..de84ce83 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -4,6 +4,7 @@
 #include "bcachefs.h"
 #include "btree_update.h"
 #include "journal_reclaim.h"
+#include "subvolume.h"
 #include "tests.h"
 
 #include "linux/kthread.h"
@@ -461,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
 	       __test_extent_overwrite(c, 32, 64, 32, 128);
 }
 
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_i_cookie cookie;
+	int ret;
+
+	bkey_cookie_init(&cookie.k_i);
+	cookie.k.p.snapshot = snapid_hi;
+	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+				NULL, NULL, 0);
+	if (ret)
+		return ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, snapid_lo), 0);
+	k = bch2_btree_iter_peek(&iter);
+
+	BUG_ON(k.k->p.snapshot != U32_MAX);
+
+	bch2_trans_iter_exit(&trans, &iter);
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+	struct bkey_i_cookie cookie;
+	u32 snapids[2];
+	u32 snapid_subvols[2] = { 1, 1 };
+	int ret;
+
+	bkey_cookie_init(&cookie.k_i);
+	cookie.k.p.snapshot = U32_MAX;
+	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+				NULL, NULL, 0);
+	if (ret)
+		return ret;
+
+	ret = bch2_trans_do(c, NULL, NULL, 0,
+		bch2_snapshot_node_create(&trans, U32_MAX,
+					  snapids,
+					  snapid_subvols,
+					  2));
+	if (ret)
+		return ret;
+
+	if (snapids[0] > snapids[1])
+		swap(snapids[0], snapids[1]);
+
+	ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+	if (ret) {
+		bch_err(c, "err %i from test_snapshot_filter", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
 /* perf tests */
 
 static u64 test_rand(void)
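
test_snapshot_filter() above leans on the snapshot visibility rule: snapshot IDs form a tree rooted at U32_MAX, and an iterator at snapshot S sees only key versions written in S or an ancestor of S. snapid_lo and snapid_hi are siblings here, so the key inserted at snapid_hi must be skipped and the peek lands on the ancestor key at U32_MAX. A toy model of that rule; the table layout and helper are invented for illustration, not bcachefs code:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    #define ROOT UINT32_MAX

    struct snap { uint32_t id, parent; };

    /* walk the parent chain of @id and report whether it passes @ancestor */
    static bool is_ancestor(const struct snap *tab, int n,
                            uint32_t id, uint32_t ancestor)
    {
            while (id != ancestor && id != ROOT) {
                    int i;

                    for (i = 0; i < n && tab[i].id != id; i++)
                            ;
                    id = i < n ? tab[i].parent : ROOT;
            }
            return id == ancestor;
    }

    int main(void)
    {
            /* two sibling snapshots of the root, as the unit test creates */
            struct snap tab[] = { { 1000, ROOT }, { 1001, ROOT } };

            /* a key written at 1001 must be invisible at its sibling 1000 */
            printf("1001 visible at 1000? %d\n", is_ancestor(tab, 2, 1000, 1001));
            /* while a key at the root is visible from any snapshot */
            printf("root visible at 1000? %d\n", is_ancestor(tab, 2, 1000, ROOT));
            return 0;
    }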
error %i", j->fn, ret); j->ret = ret; + } if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -843,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_extent_overwrite_middle); perf_test(test_extent_overwrite_all); + perf_test(test_snapshots); + if (!j.fn) { pr_err("unknown test %s", testname); return -EINVAL;