diff --git a/.bcachefs_revision b/.bcachefs_revision index 0476cc0e..f7786253 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -6628827a87075d3f807c974045ed293ac1e8965b +b9bd69421f7364ca4ff11c827fd0e171a8b826ea diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index c74b7376..84741316 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -191,8 +191,8 @@ void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, size_t, size_t); /** - * genradix_iter_peek - get first entry at or below iterator's current - * position + * genradix_iter_peek_prev - get first entry at or below iterator's current + * position * @_iter: a genradix_iter * @_radix: genradix being iterated over * diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index c342ec3b..bcfae916 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -2085,6 +2085,17 @@ void bch2_recalc_capacity(struct bch_fs *c) closure_wake_up(&c->freelist_wait); } +u64 bch2_min_rw_member_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + u64 ret = U64_MAX; + + for_each_rw_member(ca, c, i) + ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + return ret; +} + static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index e1ce38ef..73faf99a 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -249,6 +249,7 @@ int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); +u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 3bc4abd3..b85c7765 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter iter, citer; + struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); + u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; int ret; + + /* + * Scan with an uncached iterator to avoid polluting the key cache. An + * uncached iter will return a cached key if one exists, but if not + * there is no other underlying protection for the associated key cache + * slot. To avoid racing bucket allocations, look up the cached key slot + * of any likely allocation candidate before attempting to proceed with + * the allocation. This provides proper exclusion on the associated + * bucket. + */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { @@ -419,25 +430,38 @@ again: continue; a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) continue; + /* now check the cached key to serialize concurrent allocs of the bucket */ + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ret = bkey_err(ck); + if (ret) + break; + + a = bch2_alloc_to_v4(ck, &a_convert); + if (a->data_type != BCH_DATA_free) + goto next; + s->buckets_seen++; ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); +next: + citer.path->preserve = false; + bch2_trans_iter_exit(trans, &citer); if (ob) break; } bch2_trans_iter_exit(trans, &iter); + alloc_cursor = iter.pos.offset; ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); - if (!ob && alloc_cursor > alloc_start) { - alloc_cursor = alloc_start; + if (!ob && alloc_start > first_bucket) { + alloc_cursor = alloc_start = first_bucket; goto again; } diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 29b000c6..7a1c2440 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -824,34 +824,30 @@ enum inode_opt_id { Inode_opt_nr, }; -enum { - /* - * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL - * flags) - */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) - __BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */ - __BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */ - __BCH_INODE_UNLINKED = 7, - __BCH_INODE_BACKPTR_UNTRUSTED = 8, +/* bits 20+ reserved for packed fields below: */ - /* bits 20+ reserved for packed fields below: */ +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x }; -#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -1617,9 +1613,7 @@ struct journal_seq_blacklist_entry { struct bch_sb_field_journal_seq_blacklist { struct bch_sb_field field; - - struct journal_seq_blacklist_entry start[0]; - __u64 _data[]; + struct journal_seq_blacklist_entry start[]; }; struct bch_sb_field_errors { diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index feba9a31..ba392eb0 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1109,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, if (unlikely(ret)) goto out; + if (unlikely(!trans->srcu_held)) + bch2_trans_srcu_lock(trans); + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -2830,18 +2833,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) +static inline void check_srcu_held_too_long(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct btree_path *path; + WARN(time_after(jiffies, trans->srcu_lock_time + HZ * 10), + "btree trans held srcu lock (delaying memory reclaim) by more than 10 seconds"); +} - trans_for_each_path(trans, path) - if (path->cached && !btree_node_locked(path, 0)) - path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); +void bch2_trans_srcu_unlock(struct btree_trans *trans) +{ + if (trans->srcu_held) { + struct bch_fs *c = trans->c; + struct btree_path *path; - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; + trans_for_each_path(trans, path) + if (path->cached && !btree_node_locked(path, 0)) + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); + + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + trans->srcu_held = false; + } +} + +void bch2_trans_srcu_lock(struct btree_trans *trans) +{ + if (!trans->srcu_held) { + trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; + } } /** @@ -2895,8 +2915,9 @@ u32 bch2_trans_begin(struct btree_trans *trans) } trans->last_begin_time = now; - if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) - bch2_trans_reset_srcu_lock(trans); + if (unlikely(trans->srcu_held && + time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) + bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; if (trans->restarted) { @@ -2983,8 +3004,9 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->wb_updates_size = s->wb_updates_size; } - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; + trans->srcu_held = true; if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { struct btree_trans *pos; @@ -3061,7 +3083,10 @@ void bch2_trans_put(struct btree_trans *trans) check_btree_paths_leaked(trans); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + if (trans->srcu_held) { + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + } bch2_journal_preres_put(&c->journal, &trans->journal_preres); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 70759ee3..5e103f51 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -274,6 +274,7 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +void bch2_trans_unlock_long(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) @@ -579,6 +580,9 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ KEY_TYPE_##_type, sizeof(*_val), _val) +void bch2_trans_srcu_unlock(struct btree_trans *); +void bch2_trans_srcu_lock(struct btree_trans *); + u32 bch2_trans_begin(struct btree_trans *); /* diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index ba263302..c4266835 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -759,6 +759,12 @@ void bch2_trans_unlock(struct btree_trans *trans) bch2_assert_btree_nodes_not_locked(); } +void bch2_trans_unlock_long(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + bch2_trans_srcu_unlock(trans); +} + bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 8140b6e6..32693f7c 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -681,7 +681,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, BCH_JSET_ENTRY_overwrite, i->btree_id, i->level, i->old_k.u64s); - bkey_reassemble(&entry->start[0], + bkey_reassemble((struct bkey_i *) entry->start, (struct bkey_s_c) { &i->old_k, i->old_v }); } @@ -689,7 +689,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, BCH_JSET_ENTRY_btree_keys, i->btree_id, i->level, i->k->k.u64s); - bkey_copy(&entry->start[0], i->k); + bkey_copy((struct bkey_i *) entry->start, i->k); } trans_for_each_wb_update(trans, wb) { @@ -697,7 +697,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, BCH_JSET_ENTRY_btree_keys, wb->btree, 0, wb->k.k.u64s); - bkey_copy(&entry->start[0], &wb->k); + bkey_copy((struct bkey_i *) entry->start, &wb->k); } if (trans->journal_seq) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index cbcb04a4..4b9cc61a 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -432,6 +432,7 @@ struct btree_trans { u8 nr_updates; u8 nr_wb_updates; u8 wb_updates_size; + bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index d029e034..89ada89e 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2411,7 +2411,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry r->level = entry->level; r->alive = true; - bkey_copy(&r->key, &entry->start[0]); + bkey_copy(&r->key, (struct bkey_i *) entry->start); mutex_unlock(&c->btree_root_lock); } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 5e0a467f..d92b3cf5 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -271,7 +271,7 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); + __bch_btree_u64s_remaining(c, b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 114f86b4..87b4b2d1 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -69,9 +69,15 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, _ret; \ }) +#define darray_remove_item(_d, _pos) \ + array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + #define darray_for_each(_d, _i) \ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) +#define darray_for_each_reverse(_d, _i) \ + for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + #define darray_init(_d) \ do { \ (_d)->data = NULL; \ diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index bb530544..4496cf91 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -51,7 +51,7 @@ int bch2_create_trans(struct btree_trans *trans, bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_UNLINKED; + new_inode->bi_flags |= BCH_INODE_unlinked; ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 6040bd3f..5a39bcb5 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -45,13 +45,13 @@ static int bch2_inode_flags_set(struct btree_trans *trans, unsigned newflags = s->flags; unsigned oldflags = bi->bi_flags & s->mask; - if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && + if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; if (!S_ISREG(bi->bi_mode) && !S_ISDIR(bi->bi_mode) && - (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) return -EINVAL; if (s->set_projinherit) { diff --git a/libbcachefs/fs-ioctl.h b/libbcachefs/fs-ioctl.h index 54a9c21a..d30f9bb0 100644 --- a/libbcachefs/fs-ioctl.h +++ b/libbcachefs/fs-ioctl.h @@ -6,28 +6,28 @@ /* bcachefs inode flags -> vfs inode flags: */ static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + [__BCH_INODE_sync] = FS_XFLAG_SYNC, + [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_append] = FS_XFLAG_APPEND, + [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, + [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; }; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 89759e6a..693f3474 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -66,11 +66,11 @@ void bch2_inode_update_after_write(struct btree_trans *trans, inode->v.i_mode = bi->bi_mode; if (fields & ATTR_ATIME) - inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) - inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); if (fields & ATTR_CTIME) - inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); + inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); inode->ei_inode = *bi; @@ -753,9 +753,9 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->gid = inode->v.i_gid; stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); - stat->atime = inode->v.i_atime; - stat->mtime = inode->v.i_mtime; - stat->ctime = inode->v.i_ctime; + stat->atime = inode_get_atime(&inode->v); + stat->mtime = inode_get_mtime(&inode->v); + stat->ctime = inode_get_ctime(&inode->v); stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; @@ -764,15 +764,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); } - if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) + if (inode->ei_inode.bi_flags & BCH_INODE_immutable) stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= STATX_ATTR_IMMUTABLE; - if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) + if (inode->ei_inode.bi_flags & BCH_INODE_append) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= STATX_ATTR_APPEND; - if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) + if (inode->ei_inode.bi_flags & BCH_INODE_nodump) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_NODUMP; @@ -1418,9 +1418,9 @@ static int inode_update_times_fn(struct btree_trans *trans, { struct bch_fs *c = inode->v.i_sb->s_fs_info; - bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); - bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); - bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); + bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); + bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); + bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); return 0; } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 0e470ebd..9f3e9bd3 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -854,9 +854,9 @@ static int check_inode(struct btree_trans *trans, BUG_ON(bch2_inode_unpack(k, &u)); if (!full && - !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) + !(u.bi_flags & (BCH_INODE_i_size_dirty| + BCH_INODE_i_sectors_dirty| + BCH_INODE_unlinked))) return 0; if (prev->bi_inum != u.bi_inum) @@ -870,7 +870,7 @@ static int check_inode(struct btree_trans *trans, return -EINVAL; } - if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && + if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { struct bpos new_min_pos; @@ -878,7 +878,7 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; - u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; + u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; ret = __write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); @@ -890,7 +890,7 @@ static int check_inode(struct btree_trans *trans, return 0; } - if (u.bi_flags & BCH_INODE_UNLINKED && + if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || fsck_err(c, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", @@ -903,7 +903,7 @@ static int check_inode(struct btree_trans *trans, return ret; } - if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + if (u.bi_flags & BCH_INODE_i_size_dirty && (!c->sb.clean || fsck_err(c, inode_i_size_dirty_but_clean, "filesystem marked clean, but inode %llu has i_size dirty", @@ -930,13 +930,13 @@ static int check_inode(struct btree_trans *trans, * We truncated without our normal sector accounting hook, just * make sure we recalculate it: */ - u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; + u.bi_flags |= BCH_INODE_i_sectors_dirty; - u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + u.bi_flags &= ~BCH_INODE_i_size_dirty; do_update = true; } - if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + if (u.bi_flags & BCH_INODE_i_sectors_dirty && (!c->sb.clean || fsck_err(c, inode_i_sectors_dirty_but_clean, "filesystem marked clean, but inode %llu has i_sectors dirty", @@ -953,14 +953,14 @@ static int check_inode(struct btree_trans *trans, } u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + u.bi_flags &= ~BCH_INODE_i_sectors_dirty; do_update = true; } - if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { + if (u.bi_flags & BCH_INODE_backptr_untrusted) { u.bi_dir = 0; u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; + u.bi_flags &= ~BCH_INODE_backptr_untrusted; do_update = true; } @@ -1065,7 +1065,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) return -BCH_ERR_internal_fsck_err; } - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), c, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", w->last_pos.inode, i->snapshot, @@ -1405,7 +1405,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, continue; if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), c, extent_past_end_of_inode, @@ -1588,7 +1588,7 @@ static int check_dirent_target(struct btree_trans *trans, "inode %llu type %s has multiple links but i_nlink 0", target->bi_inum, bch2_d_types[d.v->d_type])) { target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_UNLINKED; + target->bi_flags &= ~BCH_INODE_unlinked; ret = __write_inode(trans, target, target_snapshot); if (ret) @@ -2160,7 +2160,7 @@ int bch2_check_directory_structure(struct bch_fs *c) break; } - if (u.bi_flags & BCH_INODE_UNLINKED) + if (u.bi_flags & BCH_INODE_unlinked) continue; ret = check_path(trans, &path, &u, iter.pos.snapshot); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 925d1b7f..8b3c675e 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -20,13 +20,18 @@ #include -const char * const bch2_inode_opts[] = { #define x(name, ...) #name, +const char * const bch2_inode_opts[] = { BCH_INODE_OPTS() -#undef x NULL, }; +static const char * const bch2_inode_flag_strs[] = { + BCH_INODE_FLAGS() + NULL +}; +#undef x + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -425,7 +430,7 @@ static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct prin inode_compression_type_invalid, "invalid compression opt %u", unpacked.bi_compression - 1); - bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_UNLINKED) && + bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && unpacked.bi_nlink != 0, c, err, inode_unlinked_but_nlink_nonzero, "flagged as unlinked but bi_nlink != 0"); @@ -499,15 +504,20 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", - inode->bi_mode, inode->bi_flags, + prt_printf(out, "mode=%o ", inode->bi_mode); + + prt_str(out, "flags="); + prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); + prt_printf(out, " (%x)", inode->bi_flags); + + prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", inode->bi_journal_seq, inode->bi_size, inode->bi_sectors, inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name " %llu", (u64) inode->_name); + prt_printf(out, " "#_name "=%llu", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x } @@ -546,7 +556,7 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) static inline bool bkey_is_deleted_inode(struct bkey_s_c k) { - return bkey_inode_flags(k) & BCH_INODE_UNLINKED; + return bkey_inode_flags(k) & BCH_INODE_unlinked; } int bch2_trans_mark_inode(struct btree_trans *trans, @@ -927,8 +937,8 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; + if (bi->bi_flags & BCH_INODE_unlinked) + bi->bi_flags &= ~BCH_INODE_unlinked; else { if (bi->bi_nlink == U32_MAX) return -EINVAL; @@ -941,13 +951,13 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) { - if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", bi->bi_inum); return; } - if (bi->bi_flags & BCH_INODE_UNLINKED) { + if (bi->bi_flags & BCH_INODE_unlinked) { bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); return; } @@ -955,7 +965,7 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked * if (bi->bi_nlink) bi->bi_nlink--; else - bi->bi_flags |= BCH_INODE_UNLINKED; + bi->bi_flags |= BCH_INODE_unlinked; } struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) @@ -1094,7 +1104,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) pos.offset, pos.snapshot)) goto delete; - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 74c62e6c..5068ba9c 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -177,7 +177,7 @@ static inline unsigned nlink_bias(umode_t mode) static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) { - return bi->bi_flags & BCH_INODE_UNLINKED + return bi->bi_flags & BCH_INODE_unlinked ? 0 : bi->bi_nlink + nlink_bias(bi->bi_mode); } @@ -187,10 +187,10 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, { if (nlink) { bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); - bi->bi_flags &= ~BCH_INODE_UNLINKED; + bi->bi_flags &= ~BCH_INODE_unlinked; } else { bi->bi_nlink = 0; - bi->bi_flags |= BCH_INODE_UNLINKED; + bi->bi_flags |= BCH_INODE_unlinked; } } diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 613f3843..fbfc42ff 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -223,7 +223,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, inode = bkey_i_to_inode_v3(k); - if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && new_i_size > le64_to_cpu(inode->v.bi_size)) { inode->v.bi_size = cpu_to_le64(new_i_size); inode_update_flags = 0; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 1b15b010..ab749bf2 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -147,9 +147,8 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) { struct moving_io *io; - bch2_trans_unlock(ctxt->trans); - while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { + bch2_trans_unlock_long(ctxt->trans); list_del(&io->read_list); move_write(io); } @@ -485,8 +484,8 @@ int bch2_move_ratelimit(struct moving_context *ctxt) struct bch_fs *c = ctxt->trans->c; u64 delay; - if (ctxt->wait_on_copygc) { - bch2_trans_unlock(ctxt->trans); + if (ctxt->wait_on_copygc && !c->copygc_running) { + bch2_trans_unlock_long(ctxt->trans); wait_event_killable(c->copygc_running_wq, !c->copygc_running || kthread_should_stop()); @@ -495,8 +494,12 @@ int bch2_move_ratelimit(struct moving_context *ctxt) do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; + if (delay) { - bch2_trans_unlock(ctxt->trans); + if (delay > HZ / 10) + bch2_trans_unlock_long(ctxt->trans); + else + bch2_trans_unlock(ctxt->trans); set_current_state(TASK_INTERRUPTIBLE); } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 1b1e8678..07cf9d42 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -45,6 +45,7 @@ do { \ \ if (_cond) \ break; \ + bch2_trans_unlock_long((_ctxt)->trans); \ __wait_event((_ctxt)->wait, \ bch2_moving_ctxt_next_pending_write(_ctxt) || \ (cond_finished = (_cond))); \ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index f73b9b7f..0158c7aa 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -128,7 +128,7 @@ static void move_buckets_wait(struct moving_context *ctxt, kfree(i); } - bch2_trans_unlock(ctxt->trans); + bch2_trans_unlock_long(ctxt->trans); } static bool bucket_in_flight(struct buckets_in_flight *list, @@ -188,7 +188,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, noinline static int bch2_copygc(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight) + struct buckets_in_flight *buckets_in_flight, + bool *did_work) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -224,6 +225,8 @@ static int bch2_copygc(struct moving_context *ctxt, f->bucket.k.gen, data_opts); if (ret) goto err; + + *did_work = true; } err: darray_exit(&buckets); @@ -302,14 +305,16 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight buckets; + struct buckets_in_flight *buckets; u64 last, wait; int ret = 0; - memset(&buckets, 0, sizeof(buckets)); - - ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); + buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); + if (!buckets) + return -ENOMEM; + ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); if (ret) { + kfree(buckets); bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } @@ -322,16 +327,18 @@ static int bch2_copygc_thread(void *arg) false); while (!ret && !kthread_should_stop()) { - bch2_trans_unlock(ctxt.trans); + bool did_work = false; + + bch2_trans_unlock_long(ctxt.trans); cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(&ctxt, &buckets, true); + move_buckets_wait(&ctxt, buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, &buckets, true); + move_buckets_wait(&ctxt, buckets, true); __refrigerator(false); continue; } @@ -342,7 +349,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, &buckets, true); + move_buckets_wait(&ctxt, buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -352,14 +359,26 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&ctxt, &buckets); + ret = bch2_copygc(&ctxt, buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); + + if (!wait && !did_work) { + u64 min_member_capacity = bch2_min_rw_member_capacity(c); + + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), + MAX_SCHEDULE_TIMEOUT); + } } - move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(&buckets.table); + move_buckets_wait(&ctxt, buckets, true); + + rhashtable_destroy(&buckets->table); + kfree(buckets); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 6ee4d2e0..3319190b 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" @@ -282,15 +283,12 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) static void rebalance_wait(struct bch_fs *c) { struct bch_fs_rebalance *r = &c->rebalance; - struct bch_dev *ca; struct io_clock *clock = &c->io_clock[WRITE]; u64 now = atomic64_read(&clock->now); - u64 min_member_capacity = 128 * 2048; - unsigned i; + u64 min_member_capacity = bch2_min_rw_member_capacity(c); - for_each_rw_member(ca, c, i) - min_member_capacity = min(min_member_capacity, - ca->mi.nbuckets * ca->mi.bucket_size); + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; r->wait_iotime_end = now + (min_member_capacity >> 6); @@ -350,7 +348,7 @@ static int do_rebalance(struct moving_context *ctxt) !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && !atomic64_read(&r->scan_stats.sectors_seen)) { - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); rebalance_wait(c); } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index f73338f3..9600b808 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -226,7 +226,7 @@ static int journal_replay_entry_early(struct bch_fs *c, if (entry->u64s) { r->level = entry->level; - bkey_copy(&r->key, &entry->start[0]); + bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { r->error = -EIO; diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index eb31df60..6e1bfe9f 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -255,7 +255,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_end = dst_start, src_end = src_start; struct bch_io_opts opts; struct bpos src_want; - u64 dst_done; + u64 dst_done = 0; u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c index 3d66f15a..9215d414 100644 --- a/libbcachefs/sb-errors.c +++ b/libbcachefs/sb-errors.c @@ -61,7 +61,6 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_errors *e = field_to_type(f, errors); unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - u64 now = ktime_get_real_seconds(); if (out->nr_tabstops <= 1) printbuf_tabstop_push(out, 16); @@ -71,9 +70,7 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, prt_tab(out); prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); prt_tab(out); - bch2_pr_time_units(out, (now - le64_to_cpu(e->entries[i].last_error_time)) * - NSEC_PER_SEC); - prt_str(out, " ago"); + bch2_prt_date_seconds(out, le64_to_cpu(e->entries[i].last_error_time)); prt_newline(out); } } diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index ab5de12e..6a7e20de 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -21,19 +21,14 @@ char * const bch2_member_error_strs[] = { /* Code for bch_sb_field_members_v1: */ -static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i) -{ - return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); -} - struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) { - return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); + return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); } static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) { - struct bch_member ret, *p = members_v2_get_mut(mi, i); + struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); memset(&ret, 0, sizeof(ret)); memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); return ret; @@ -75,7 +70,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); - memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); + memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); memset(dst + le16_to_cpu(mi->member_bytes), 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); } @@ -118,7 +113,7 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) - memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); + memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); return 0; } @@ -235,7 +230,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Last mount:"); prt_tab(out); if (m.last_mount) - pr_time(out, le64_to_cpu(m.last_mount)); + bch2_prt_date_seconds(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); @@ -332,7 +327,7 @@ static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - size_t mi_bytes = (void *) members_v2_get_mut(mi, sb->nr_devices) - + size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - (void *) mi; if (mi_bytes > vstruct_bytes(&mi->field)) { @@ -363,7 +358,7 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) rcu_read_lock(); for_each_member_device_rcu(ca, c, i, NULL) { - struct bch_member *m = members_v2_get_mut(mi, i); + struct bch_member *m = __bch2_members_v2_get_mut(mi, i); for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index 1583e80a..03613e3e 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -4,6 +4,12 @@ extern char * const bch2_member_error_strs[]; +static inline struct bch_member * +__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) +{ + return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); +} + int bch2_sb_members_v2_init(struct bch_fs *c); int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); @@ -186,11 +192,10 @@ static inline bool bch2_member_exists(struct bch_member *m) return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); } -static inline bool bch2_dev_exists(struct bch_sb *sb, - unsigned dev) +static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) { if (dev < sb->nr_devices) { - struct bch_member m = bch2_sb_member_get(sb, dev); + struct bch_member m = bch2_sb_member_get(sb, dev); return bch2_member_exists(&m); } return false; diff --git a/libbcachefs/six.c b/libbcachefs/six.c index 458a1de0..d22826ca 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -323,99 +323,55 @@ EXPORT_SYMBOL_GPL(six_relock_ip); #ifdef CONFIG_LOCK_SPIN_ON_OWNER -static inline bool six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_owner_running(struct six_lock *lock) { - struct task_struct *owner; - bool ret; - - if (need_resched()) - return false; - + /* + * When there's no owner, we might have preempted between the owner + * acquiring the lock and setting the owner field. If we're an RT task + * that will live-lock because we won't let the owner complete. + */ rcu_read_lock(); - owner = READ_ONCE(lock->owner); - ret = !owner || owner_on_cpu(owner); + struct task_struct *owner = READ_ONCE(lock->owner); + bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); rcu_read_unlock(); return ret; } -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner, - u64 end_time) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { - bool ret = true; unsigned loop = 0; - - rcu_read_lock(); - while (lock->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - if (!owner_on_cpu(owner) || need_resched()) { - ret = false; - break; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; u64 end_time; if (type == SIX_LOCK_write) return false; + if (lock->wait_list.next != &wait->list) + return false; + + if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) + return false; + preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; - - if (!osq_lock(&lock->osq)) - goto fail; - end_time = sched_clock() + 10 * NSEC_PER_USEC; - while (1) { - struct task_struct *owner; - + while (!need_resched() && six_owner_running(lock)) { /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner, end_time)) - break; - - if (do_six_trylock(lock, type, false)) { - osq_unlock(&lock->osq); + if (smp_load_acquire(&wait->lock_acquired)) { preempt_enable(); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); break; + } /* * The cpu_relax() call is a compiler barrier which forces @@ -426,24 +382,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type cpu_relax(); } - osq_unlock(&lock->osq); -fail: preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. - */ - if (need_resched()) - schedule(); - return false; } #else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { return false; } @@ -467,9 +414,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, trace_contention_begin(lock, 0); lock_contended(&lock->dep_map, ip); - if (six_optimistic_spin(lock, type)) - goto out; - wait->task = current; wait->lock_want = type; wait->lock_acquired = false; @@ -507,6 +451,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, ret = 0; } + if (six_optimistic_spin(lock, wait, type)) + goto out; + while (1) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/libbcachefs/six.h b/libbcachefs/six.h index 394da423..a7104ac1 100644 --- a/libbcachefs/six.h +++ b/libbcachefs/six.h @@ -124,7 +124,6 @@ */ #include -#include #include #include @@ -140,7 +139,6 @@ struct six_lock { unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; - struct optimistic_spin_queue osq; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 83bdb436..a93e53d0 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1183,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Created:"); prt_tab(out); if (sb->time_base_lo) - pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + bch2_prt_date_seconds(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else prt_printf(out, "(not set)"); prt_newline(out); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 1b5c2a1b..24672bb3 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1885,9 +1885,9 @@ found: struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { - struct bch_sb_handle *sb = NULL; + DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; - unsigned i, best_sb = 0; + struct bch_sb_handle *sb, *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1899,49 +1899,46 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } - sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) { - ret = -ENOMEM; + ret = darray_make_room(&sbs, nr_devices); + if (ret) goto err; - } - for (i = 0; i < nr_devices; i++) { - ret = bch2_read_super(devices[i], &opts, &sb[i]); + for (unsigned i = 0; i < nr_devices; i++) { + struct bch_sb_handle sb = { NULL }; + + ret = bch2_read_super(devices[i], &opts, &sb); if (ret) goto err; + BUG_ON(darray_push(&sbs, sb)); } - for (i = 1; i < nr_devices; i++) - if (le64_to_cpu(sb[i].sb->seq) > - le64_to_cpu(sb[best_sb].sb->seq)) - best_sb = i; + darray_for_each(sbs, sb) + if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + best = sb; - i = 0; - while (i < nr_devices) { - if (i != best_sb && - !bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb[i].bdev); - bch2_free_super(&sb[i]); - array_remove_item(sb, nr_devices, i); + darray_for_each_reverse(sbs, sb) { + if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { + pr_info("%pg has been removed, skipping", sb->bdev); + bch2_free_super(sb); + darray_remove_item(&sbs, sb); + best -= best > sb; continue; } - ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; - i++; } - c = bch2_fs_alloc(sb[best_sb].sb, opts); - if (IS_ERR(c)) { - ret = PTR_ERR(c); + c = bch2_fs_alloc(best->sb, opts); + ret = PTR_ERR_OR_ZERO(c); + if (ret) goto err; - } down_write(&c->state_lock); - for (i = 0; i < nr_devices; i++) { - ret = bch2_dev_attach_bdev(c, &sb[i]); + darray_for_each(sbs, sb) { + ret = bch2_dev_attach_bdev(c, sb); if (ret) { up_write(&c->state_lock); goto err; @@ -1960,7 +1957,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } out: - kfree(sb); + darray_for_each(sbs, sb) + bch2_free_super(sb); + darray_exit(&sbs); printbuf_exit(&errbuf); module_put(THIS_MODULE); return c; @@ -1970,9 +1969,6 @@ err_print: err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); - if (sb) - for (i = 0; i < nr_devices; i++) - bch2_free_super(&sb[i]); c = ERR_PTR(ret); goto out; } diff --git a/libbcachefs/util.c b/libbcachefs/util.c index adeec805..7ba5df4e 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -467,6 +467,24 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) prt_printf(out, "%s", u->name); } +#ifndef __KERNEL__ +#include +void bch2_prt_date_seconds(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + prt_str(out, buf); +} +#else +void bch2_prt_date_seconds(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_u64(out, sec); +} +#endif + #define TABSTOP_SIZE 12 static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 67f1a1d2..0595605e 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -244,26 +244,7 @@ do { \ #define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) void bch2_pr_time_units(struct printbuf *, u64); - -#ifdef __KERNEL__ -static inline void pr_time(struct printbuf *out, u64 time) -{ - prt_printf(out, "%llu", time); -} -#else -#include -static inline void pr_time(struct printbuf *out, u64 _time) -{ - char time_str[64]; - time_t time = _time; - struct tm *tm = localtime(&time); - size_t err = strftime(time_str, sizeof(time_str), "%c", tm); - if (!err) - prt_printf(out, "(formatting error)"); - else - prt_printf(out, "%s", time_str); -} -#endif +void bch2_prt_date_seconds(struct printbuf *, time64_t); #ifdef __KERNEL__ static inline void uuid_unparse_lower(u8 *uuid, char *out) diff --git a/linux/closure.c b/linux/closure.c index 1faa24d6..f86c9eea 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -17,9 +17,8 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; - if ((flags & CLOSURE_GUARD_MASK) || - (!r && (flags & ~CLOSURE_DESTRUCTOR))) - panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r); + BUG_ON(flags & CLOSURE_GUARD_MASK); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); if (!r) { smp_acquire__after_ctrl_dep();