diff --git a/.bcachefs_revision b/.bcachefs_revision
index e40e21f8..51df9f0e 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-f65603966f7474213e6bf22b046e374d01fd6639
+9abf628c701ad92670d697624f674cc01d42705e
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 00000000..c233e3ce
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,72 @@
+
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <pthread.h>
+#include <linux/preempt.h>
+
+struct percpu_rw_semaphore {
+	pthread_rwlock_t	lock;
+};
+
+#define DEFINE_STATIC_PERCPU_RWSEM(name)				\
+static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
+static struct percpu_rw_semaphore name = {				\
+	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
+	.read_count = &__percpu_rwsem_rc_##name,			\
+	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
+	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+}
+
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
+{
+	pthread_rwlock_rdlock(&sem->lock);
+	preempt_disable();
+}
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+	pthread_rwlock_rdlock(&sem->lock);
+}
+
+static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+	return !pthread_rwlock_tryrdlock(&sem->lock);
+}
+
+static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
+{
+	preempt_enable();
+	pthread_rwlock_unlock(&sem->lock);
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	pthread_rwlock_unlock(&sem->lock);
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *sem)
+{
+	pthread_rwlock_wrlock(&sem->lock);
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *sem)
+{
+	pthread_rwlock_unlock(&sem->lock);
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *sem) {}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *sem)
+{
+	pthread_rwlock_init(&sem->lock, NULL);
+	return 0;
+}
+
+#define percpu_rwsem_assert_held(sem)	do {} while (0)
+
+#endif
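
[Note: the header above is a userspace shim - it keeps the kernel's percpu_rw_semaphore API but backs it with a single pthread rwlock, so there is no per-cpu fast path here and the kernel-style DEFINE_STATIC_PERCPU_RWSEM initializer is effectively unused by this tree. A minimal sketch of how the rest of this diff consumes the API; usage_lock mirrors the bch_fs field, and the counter-updating bodies are hypothetical stand-ins:

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore usage_lock;	/* cf. bch_fs->usage_lock */

/* Hot path: many concurrent readers, preemption off inside the section. */
static void update_usage(void)
{
	percpu_down_read_preempt_disable(&usage_lock);
	/* ... bump per-cpu usage counters (hypothetical) ... */
	percpu_up_read_preempt_enable(&usage_lock);
}

/* Cold path: gc start, bucket array resize - excludes all readers. */
static void fold_usage(void)
{
	percpu_down_write(&usage_lock);
	/* ... fold/reset counters (hypothetical) ... */
	percpu_up_write(&usage_lock);
}

int usage_lock_init(void)
{
	return percpu_init_rwsem(&usage_lock);
}
]
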
diff --git a/include/linux/string.h b/include/linux/string.h
index abc191e7..ec35b8df 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -9,6 +9,7 @@ extern size_t strlcpy(char *dest, const char *src, size_t size);
 extern char *skip_spaces(const char *);
 extern char *strim(char *);
 extern void memzero_explicit(void *, size_t);
+int match_string(const char * const *, size_t, const char *);
 
 #define kstrndup(s, n, gfp)		strndup(s, n)
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index a34574ca..13264b82 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -296,6 +296,11 @@ DEFINE_EVENT(btree_node, btree_compact,
 	TP_ARGS(c, b)
 );
 
+DEFINE_EVENT(btree_node, btree_merge,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
 DEFINE_EVENT(btree_node, btree_set_root,
 	TP_PROTO(struct bch_fs *c, struct btree *b),
 	TP_ARGS(c, b)
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index d29bdafa..29774e5d 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -176,34 +176,19 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
 	return acl;
 }
 
-int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+int __bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
 {
 	struct bch_inode_info *inode = to_bch_ei(vinode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	umode_t mode = inode->v.i_mode;
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
 	int ret;
 
-	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(&inode->v, &mode, &acl);
-		if (ret)
-			return ret;
-	}
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
-		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &inode->v.i_mode);
-			if (ret < 0)
-				return ret;
-			if (ret == 0)
-				acl = NULL;
-		}
 		break;
-
 	case ACL_TYPE_DEFAULT:
 		name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
 		if (!S_ISDIR(inode->v.i_mode))
@@ -220,20 +205,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
 			return (int)PTR_ERR(value);
 	}
 
-	if (mode != inode->v.i_mode) {
-		mutex_lock(&inode->ei_update_lock);
-		inode->v.i_mode = mode;
-		inode->v.i_ctime = current_time(&inode->v);
-
-		ret = bch2_write_inode(c, inode);
-		mutex_unlock(&inode->ei_update_lock);
-
-		if (ret)
-			goto err;
-	}
-
 	ret = bch2_xattr_set(c, inode, "", value, size, 0, name_index);
-err:
 	kfree(value);
 
 	if (ret == -ERANGE)
@@ -245,4 +217,33 @@ err:
 	return ret;
 }
 
+int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	umode_t mode = inode->v.i_mode;
+	int ret;
+
+	if (type == ACL_TYPE_ACCESS && acl) {
+		ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+		if (ret)
+			return ret;
+	}
+
+	ret = __bch2_set_acl(vinode, acl, type);
+	if (ret)
+		return ret;
+
+	if (mode != inode->v.i_mode) {
+		mutex_lock(&inode->ei_update_lock);
+		inode->v.i_mode = mode;
+		inode->v.i_ctime = current_time(&inode->v);
+
+		ret = bch2_write_inode(c, inode);
+		mutex_unlock(&inode->ei_update_lock);
+	}
+
+	return ret;
+}
+
 #endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index b721330e..a66338d4 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -52,10 +52,16 @@ static inline int bch2_acl_count(size_t size)
 struct posix_acl;
 
 extern struct posix_acl *bch2_get_acl(struct inode *, int);
+extern int __bch2_set_acl(struct inode *, struct posix_acl *, int);
 extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
 
 #else
 
+static inline int __bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	return 0;
+}
+
 static inline int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	return 0;
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 44f9479e..ac2c7d1f 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -223,7 +223,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
 	if (a.k->p.offset >= ca->mi.nbuckets)
 		return;
 
-	lg_local_lock(&c->usage_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
 
 	g = bucket(ca, a.k->p.offset);
 	bucket_cmpxchg(g, new, ({
@@ -237,7 +237,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
 	if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
 		g->io_time[WRITE] = get_alloc_field(&d, 2);
 
-	lg_local_unlock(&c->usage_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 }
 
 int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
@@ -288,7 +288,7 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
 
 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 				  size_t b, struct btree_iter *iter,
-				  u64 *journal_seq)
+				  u64 *journal_seq, bool nowait)
 {
 	struct bucket_mark m;
 	__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
@@ -296,6 +296,13 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 	struct bkey_i_alloc *a;
 	u8 *d;
 	int ret;
+	unsigned flags = BTREE_INSERT_ATOMIC|
+		BTREE_INSERT_NOFAIL|
+		BTREE_INSERT_USE_RESERVE|
+		BTREE_INSERT_USE_ALLOC_RESERVE;
+
+	if (nowait)
+		flags |= BTREE_INSERT_NOWAIT;
 
 	bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
 
@@ -304,7 +311,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 		if (ret)
 			break;
 
-		lg_local_lock(&c->usage_lock);
+		percpu_down_read_preempt_disable(&c->usage_lock);
 		g = bucket(ca, b);
 
 		/* read mark under btree node lock: */
@@ -320,14 +327,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 			put_alloc_field(&d, 2, g->io_time[READ]);
 		if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
 			put_alloc_field(&d, 2, g->io_time[WRITE]);
-		lg_local_unlock(&c->usage_lock);
+		percpu_up_read_preempt_enable(&c->usage_lock);
 
-		ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
-					   BTREE_INSERT_ATOMIC|
-					   BTREE_INSERT_NOFAIL|
-					   BTREE_INSERT_USE_RESERVE|
-					   BTREE_INSERT_USE_ALLOC_RESERVE|
-					   BTREE_INSERT_NOWAIT,
+		ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
 					   BTREE_INSERT_ENTRY(iter, &a->k_i));
 		bch2_btree_iter_cond_resched(iter);
 	} while (ret == -EINTR);
@@ -352,7 +354,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
 	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
 			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-	ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL);
+	ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
+				     NULL, false);
 	bch2_btree_iter_unlock(&iter);
 	return ret;
 }
@@ -372,7 +375,8 @@ int bch2_alloc_write(struct bch_fs *c)
 		down_read(&ca->bucket_lock);
 		for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
-			ret = __bch2_alloc_write_key(c, ca, bucket, &iter, NULL);
+			ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
+						     NULL, false);
 			if (ret)
 				break;
@@ -583,15 +587,20 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct bucket_mark m;
 
+	percpu_down_read_preempt_disable(&c->usage_lock);
 	spin_lock(&c->freelist_lock);
+
 	if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
 		spin_unlock(&c->freelist_lock);
+		percpu_up_read_preempt_enable(&c->usage_lock);
 		return;
 	}
 
 	verify_not_on_freelist(c, ca, bucket);
 	BUG_ON(!fifo_push(&ca->free_inc, bucket));
+
 	spin_unlock(&c->freelist_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 
 	/* gc lock held: */
 	bucket_io_clock_reset(c, ca, bucket, READ);
@@ -812,7 +821,8 @@ static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
 }
 
 static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
-				    u64 *journal_seq, size_t nr)
+				    u64 *journal_seq, size_t nr,
+				    bool nowait)
 {
 	struct btree_iter iter;
 	int ret = 0;
@@ -820,14 +830,12 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
 	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
 			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-	/*
-	 * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the
-	 * btree update or journal_res_get
-	 */
+	/* Only use nowait if we've already invalidated at least one bucket: */
 	while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
 		size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
 
-		ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq);
+		ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
+					     nowait && ca->nr_invalidated);
 		if (ret)
 			break;
@@ -835,7 +843,9 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
 	}
 
 	bch2_btree_iter_unlock(&iter);
-	return ret;
+
+	/* If we used NOWAIT, don't return the error: */
+	return ca->nr_invalidated ? 0 : ret;
 }
 
 static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
@@ -943,7 +953,8 @@ static int bch2_allocator_thread(void *arg)
 				 fifo_used(&ca->free_inc));
 
 		journal_seq = 0;
-		ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
+		ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
+					       SIZE_MAX, true);
 		if (ret) {
 			bch_err(ca, "error invalidating buckets: %i", ret);
 			goto stop;
@@ -1077,11 +1088,15 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
+	percpu_down_read_preempt_disable(&c->usage_lock);
 	spin_lock(&ob->lock);
+
 	bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
 			       false, gc_pos_alloc(c, ob), 0);
 	ob->valid = false;
+
 	spin_unlock(&ob->lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 
 	spin_lock(&c->freelist_lock);
 	ob->freelist = c->open_buckets_freelist;
@@ -1151,6 +1166,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 	long bucket;
 
 	spin_lock(&c->freelist_lock);
+
 	if (may_alloc_partial && ca->open_buckets_partial_nr) {
 		int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
@@ -1202,7 +1218,6 @@ out:
 	ob = bch2_open_bucket_alloc(c);
 
 	spin_lock(&ob->lock);
-	lg_local_lock(&c->usage_lock);
 	buckets = bucket_array(ca);
 
 	ob->valid = true;
@@ -1215,8 +1230,6 @@ out:
 	bucket_io_clock_reset(c, ca, bucket, READ);
 	bucket_io_clock_reset(c, ca, bucket, WRITE);
-
-	lg_local_unlock(&c->usage_lock);
 	spin_unlock(&ob->lock);
 
 	spin_unlock(&c->freelist_lock);
@@ -1296,7 +1309,6 @@ static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
 	if (nr_ptrs_effective >= nr_replicas)
 		return ALLOC_SUCCESS;
 
-	rcu_read_lock();
 	devs_sorted = bch2_wp_alloc_list(c, wp, devs);
 
 	for (i = 0; i < devs_sorted.nr; i++) {
@@ -1337,7 +1349,6 @@ static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
 			break;
 		}
 	}
-	rcu_read_unlock();
 
 	EBUG_ON(reserve == RESERVE_MOVINGGC &&
 		ret != ALLOC_SUCCESS &&
@@ -1422,8 +1433,13 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 				   struct closure *cl)
 {
 	struct bch_devs_mask devs = c->rw_devs[wp->type];
+	const struct bch_devs_mask *t;
 	struct open_bucket *ob;
 	unsigned i;
+	int ret;
+
+	percpu_down_read_preempt_disable(&c->usage_lock);
+	rcu_read_lock();
 
 	/* Don't allocate from devices we already have pointers to: */
 	for (i = 0; i < devs_have->nr; i++)
@@ -1432,17 +1448,16 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 	writepoint_for_each_ptr_all(wp, ob, i)
 		__clear_bit(ob->ptr.dev, devs.d);
 
-	if (target) {
-		const struct bch_devs_mask *t;
+	t = bch2_target_to_mask(c, target);
+	if (t)
+		bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
 
-		rcu_read_lock();
-		t = bch2_target_to_mask(c, target);
-		if (t)
-			bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
-		rcu_read_unlock();
-	}
+	ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
 
-	return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
+	rcu_read_unlock();
+	percpu_up_read_preempt_enable(&c->usage_lock);
+
+	return ret;
 }
 
 static struct write_point *__writepoint_find(struct hlist_head *head,
@@ -1980,10 +1995,12 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 			if (!is_available_bucket(m) || m.cached_sectors)
 				continue;
 
+			percpu_down_read_preempt_disable(&c->usage_lock);
 			bch2_mark_alloc_bucket(c, ca, bu, true,
 					gc_pos_alloc(c, NULL),
 					BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
 					BCH_BUCKET_MARK_GC_LOCK_HELD);
+			percpu_up_read_preempt_enable(&c->usage_lock);
 
 			fifo_push(&ca->free_inc, bu);
 			ca->nr_invalidated++;
@@ -2051,7 +2068,8 @@ not_enough:
 	for_each_rw_member(ca, c, dev_iter) {
 		ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
-					       ca->free[RESERVE_BTREE].size);
+					       ca->free[RESERVE_BTREE].size,
+					       false);
 		if (ret) {
 			percpu_ref_put(&ca->io_ref);
 			return ret;
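
[Note: a pattern worth calling out in the alloc.c changes above - usage_lock is consistently taken in read mode *outside* the spinlocks it used to be taken inside of (freelist_lock, ob->lock), since acquiring a rwsem may block and therefore cannot happen under a spinlock, while the preempt-disabled read section nests spinlocks fine. A condensed sketch of the nesting this diff establishes, with the lock bodies elided:

/* Sketch of the ordering bch2_invalidate_one_bucket() now uses;
 * only the lock nesting matters here. */
static void invalidate_one_bucket_order(struct bch_fs *c)
{
	percpu_down_read_preempt_disable(&c->usage_lock);	/* 1: rwsem, outer */
	spin_lock(&c->freelist_lock);				/* 2: spinlock, inner */

	/* ... mark the bucket, push it onto free_inc ... */

	spin_unlock(&c->freelist_lock);
	percpu_up_read_preempt_enable(&c->usage_lock);
}
]
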
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4219c46c..4702b016 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -182,10 +182,10 @@
 #include
 #include
 #include
-#include <linux/lglock.h>
 #include
 #include
 #include
+#include <linux/percpu-rwsem.h>
 #include
 #include
 #include
@@ -302,21 +302,14 @@ enum bch_time_stats {
 #include "rebalance_types.h"
 #include "super_types.h"
 
-/*
- * Number of nodes we might have to allocate in a worst case btree split
- * operation - we split all the way up to the root, then allocate a new root.
- */
-#define btree_reserve_required_nodes(depth)	(((depth) + 1) * 2 + 1)
-
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES		4U
 
 /* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX					\
-	(btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
+#define BTREE_RESERVE_MAX	(BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
 
 /* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE	(BTREE_RESERVE_MAX * 4)
+#define BTREE_NODE_RESERVE	(BTREE_RESERVE_MAX * 4)
 
 struct btree;
@@ -591,7 +584,7 @@ struct bch_fs {
 	struct bch_fs_usage __percpu	*usage_percpu;
 	struct bch_fs_usage		usage_cached;
-	struct lglock			usage_lock;
+	struct percpu_rw_semaphore	usage_lock;
 
 	struct closure_waitlist		freelist_wait;
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index cd5ebfbe..02b14e38 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -118,20 +118,17 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 			   struct bkey_s_c k, unsigned flags)
 {
 	struct gc_pos pos = { 0 };
-	struct bch_fs_usage *stats;
 	u8 ret = 0;
 
-	preempt_disable();
-	stats = this_cpu_ptr(c->usage_percpu);
 	switch (type) {
 	case BKEY_TYPE_BTREE:
-		bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
+		bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL,
 			      0, flags|
 			      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
 			      BCH_BUCKET_MARK_GC_LOCK_HELD);
 		break;
 	case BKEY_TYPE_EXTENTS:
-		bch2_mark_key(c, k, k.k->size, false, pos, stats,
+		bch2_mark_key(c, k, k.k->size, false, pos, NULL,
 			      0, flags|
 			      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
 			      BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -140,7 +137,6 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 	default:
 		BUG();
 	}
-	preempt_enable();
 
 	return ret;
 }
@@ -320,8 +316,10 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 	unsigned i;
 	u64 b;
 
-	if (c)
+	if (c) {
 		lockdep_assert_held(&c->sb_lock);
+		percpu_down_read_preempt_disable(&c->usage_lock);
+	}
 
 	for (i = 0; i < layout->nr_superblocks; i++) {
 		u64 offset = le64_to_cpu(layout->sb_offset[i]);
@@ -345,8 +343,10 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 			  gc_phase(GC_PHASE_SB), flags);
 	}
 
-	if (c)
+	if (c) {
+		percpu_up_read_preempt_enable(&c->usage_lock);
 		spin_unlock(&c->journal.lock);
+	}
 }
 
 static void bch2_mark_superblocks(struct bch_fs *c)
@@ -397,6 +397,8 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 	size_t i, j, iter;
 	unsigned ci;
 
+	percpu_down_read_preempt_disable(&c->usage_lock);
+
 	spin_lock(&c->freelist_lock);
 	gc_pos_set(c, gc_pos_alloc(c, NULL));
@@ -433,6 +435,8 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 		}
 		spin_unlock(&ob->lock);
 	}
+
+	percpu_up_read_preempt_enable(&c->usage_lock);
 }
 
 static void bch2_gc_start(struct bch_fs *c)
@@ -444,7 +448,7 @@ static void bch2_gc_start(struct bch_fs *c)
 	size_t b;
 	int cpu;
 
-	lg_global_lock(&c->usage_lock);
+	percpu_down_write(&c->usage_lock);
 
 	/*
 	 * Indicates to buckets code that gc is now in progress - done under
@@ -470,7 +474,7 @@ static void bch2_gc_start(struct bch_fs *c)
 		memset(p->s, 0, sizeof(p->s));
 	}
 
-	lg_global_unlock(&c->usage_lock);
+	percpu_up_write(&c->usage_lock);
 
 	/* Clear bucket marks: */
 	for_each_member_device(ca, c, i) {
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 70c3132e..95ee9f61 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -152,7 +152,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 	 * the prev sibling in btree node merging:
 	 */
 	if (iter->nodes_locked &&
-	    __ffs(iter->nodes_locked) == level &&
+	    __ffs(iter->nodes_locked) <= level &&
 	    __btree_iter_cmp(iter->btree_id, pos, iter))
 		return false;
 
@@ -592,6 +592,8 @@ static inline void __btree_iter_init(struct btree_iter *iter,
 	/* Skip to first non whiteout: */
 	if (b->level)
 		bch2_btree_node_iter_peek(&l->iter, b);
+
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
 static inline void btree_iter_node_set(struct btree_iter *iter,
@@ -1084,6 +1086,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 	EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
 		(iter->btree_id == BTREE_ID_EXTENTS));
 	EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
+	EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+		!btree_node_locked(iter, 0));
 
 	if (iter->uptodate == BTREE_ITER_UPTODATE) {
 		struct bkey_packed *k =
@@ -1093,8 +1097,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 			.v = bkeyp_val(&l->b->format, k)
 		};
 
-		EBUG_ON(!btree_node_locked(iter, 0));
-
 		if (debug_check_bkeys(iter->c))
 			bch2_bkey_debugcheck(iter->c, l->b, ret);
 		return ret;
@@ -1257,16 +1259,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 	EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
 		(iter->btree_id == BTREE_ID_EXTENTS));
 	EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
+	EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+		!btree_node_locked(iter, 0));
 
 	if (iter->uptodate == BTREE_ITER_UPTODATE) {
-		struct bkey_s_c ret = { .k = &iter->k };;
+		struct bkey_s_c ret = { .k = &iter->k };
 
 		if (!bkey_deleted(&iter->k))
 			ret.v = bkeyp_val(&l->b->format,
 				__bch2_btree_node_iter_peek_all(&l->iter, l->b));
 
-		EBUG_ON(!btree_node_locked(iter, 0));
-
 		if (debug_check_bkeys(iter->c))
 			bch2_bkey_debugcheck(iter->c, l->b, ret);
 		return ret;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index c3ecc1e9..92e19c4e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1564,11 +1564,15 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
 	struct btree_update *as;
 	struct closure cl;
 	int ret = 0;
+	struct btree_iter *linked;
 
 	/*
 	 * We already have a disk reservation and open buckets pinned; this
 	 * allocation must not block:
 	 */
+	for_each_linked_btree_iter(iter, linked)
+		if (linked->btree_id == BTREE_ID_EXTENTS)
+			btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
 	if (iter->btree_id == BTREE_ID_EXTENTS)
 		btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
 
@@ -1704,15 +1708,17 @@ retry:
 	}
 
 	as = bch2_btree_update_start(c, iter->btree_id,
-			btree_update_reserve_required(c, b),
-			BTREE_INSERT_NOFAIL|
-			BTREE_INSERT_USE_RESERVE,
-			&cl);
+			btree_update_reserve_required(c, parent) + 1,
+			BTREE_INSERT_NOFAIL|
+			BTREE_INSERT_USE_RESERVE,
+			&cl);
 	if (IS_ERR(as)) {
 		ret = PTR_ERR(as);
 		goto out_unlock;
 	}
 
+	trace_btree_merge(c, b);
+
 	bch2_btree_interior_update_will_free_node(as, b);
 	bch2_btree_interior_update_will_free_node(as, m);
@@ -1778,8 +1784,10 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 	struct btree_update *as;
 
 	as = bch2_btree_update_start(c, iter->btree_id,
-			btree_update_reserve_required(c, b),
-			flags, cl);
+			(parent
+			 ? btree_update_reserve_required(c, parent)
+			 : 0) + 1,
+			flags, cl);
 	if (IS_ERR(as)) {
 		trace_btree_gc_rewrite_node_fail(c, b);
 		return PTR_ERR(as);
@@ -1966,6 +1974,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 			       struct btree *b, struct bkey_i_extent *new_key)
 {
+	struct btree *parent = btree_node_parent(iter, b);
 	struct btree_update *as = NULL;
 	struct btree *new_hash = NULL;
 	struct closure cl;
@@ -2003,11 +2012,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 	}
 
 	as = bch2_btree_update_start(c, iter->btree_id,
-			btree_update_reserve_required(c, b),
-			BTREE_INSERT_NOFAIL|
-			BTREE_INSERT_USE_RESERVE|
-			BTREE_INSERT_USE_ALLOC_RESERVE,
-			&cl);
+			parent ? btree_update_reserve_required(c, parent) : 0,
+			BTREE_INSERT_NOFAIL|
+			BTREE_INSERT_USE_RESERVE|
+			BTREE_INSERT_USE_ALLOC_RESERVE,
+			&cl);
+
 	if (IS_ERR(as)) {
 		ret = PTR_ERR(as);
 		if (ret == -EAGAIN)
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 25bfc7ab..abf14e4c 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -183,9 +183,14 @@ void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
 static inline unsigned btree_update_reserve_required(struct bch_fs *c,
 						     struct btree *b)
 {
-	unsigned depth = btree_node_root(c, b)->level - b->level;
+	unsigned depth = btree_node_root(c, b)->level - b->level + 1;
 
-	return btree_reserve_required_nodes(depth);
+	/*
+	 * Number of nodes we might have to allocate in a worst case btree
+	 * split operation - we split all the way up to the root, then allocate
+	 * a new root.
+	 */
+	return depth * 2 + 1;
 }
 
 static inline void btree_node_reset_sib_u64s(struct btree *b)
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 5dda22c7..b17189ee 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -331,7 +331,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 		stats->online_reserved	-= added;
 	}
 
-	lg_local_lock(&c->usage_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
 	/* online_reserved not subject to gc: */
 	this_cpu_ptr(c->usage_percpu)->online_reserved +=
 		stats->online_reserved;
@@ -341,7 +341,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 	bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
 
 	bch2_fs_stats_verify(c);
-	lg_local_unlock(&c->usage_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 
 	memset(stats, 0, sizeof(*stats));
 }
@@ -352,7 +352,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 	struct bch_dev_usage *dev_usage;
 
 	if (c)
-		lockdep_assert_held(&c->usage_lock);
+		percpu_rwsem_assert_held(&c->usage_lock);
 
 	if (old.data_type && new.data_type &&
 	    old.data_type != new.data_type) {
@@ -399,12 +399,13 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 	struct bucket *g;
 	struct bucket_mark new;
 
-	lg_local_lock(&c->usage_lock);
+	percpu_rwsem_assert_held(&c->usage_lock);
+
 	g = bucket(ca, b);
 
 	*old = bucket_data_cmpxchg(c, ca, g, new, ({
 		if (!is_available_bucket(new)) {
-			lg_local_unlock(&c->usage_lock);
+			percpu_up_read_preempt_enable(&c->usage_lock);
 			return false;
 		}
 
@@ -414,7 +415,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.dirty_sectors = 0;
 		new.gen++;
 	}));
-	lg_local_unlock(&c->usage_lock);
 
 	if (!old->owned_by_allocator && old->cached_sectors)
 		trace_invalidate(ca, bucket_to_sector(ca, b),
@@ -429,19 +429,16 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
 	struct bucket *g;
 	struct bucket_mark old, new;
 
-	lg_local_lock(&c->usage_lock);
+	percpu_rwsem_assert_held(&c->usage_lock);
 	g = bucket(ca, b);
 
 	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-	    gc_will_visit(c, pos)) {
-		lg_local_unlock(&c->usage_lock);
+	    gc_will_visit(c, pos))
 		return;
-	}
 
 	old = bucket_data_cmpxchg(c, ca, g, new, ({
 		new.owned_by_allocator = owned_by_allocator;
 	}));
-	lg_local_unlock(&c->usage_lock);
 
 	BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
 	       c->gc_pos.phase == GC_PHASE_DONE);
@@ -471,16 +468,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 	BUG_ON(!type);
 
 	if (likely(c)) {
-		lg_local_lock(&c->usage_lock);
+		percpu_rwsem_assert_held(&c->usage_lock);
 
 		if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-		    gc_will_visit(c, pos)) {
-			lg_local_unlock(&c->usage_lock);
+		    gc_will_visit(c, pos))
 			return;
-		}
 	}
 
-	preempt_disable();
+	rcu_read_lock();
 
 	g = bucket(ca, b);
 	old = bucket_data_cmpxchg(c, ca, g, new, ({
@@ -489,10 +484,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.data_type	= type;
 	}));
 
-	preempt_enable();
-
-	if (likely(c))
-		lg_local_unlock(&c->usage_lock);
+	rcu_read_unlock();
 
 	BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
 	       bucket_became_unavailable(c, old, new));
@@ -654,11 +646,14 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
 	 * (e.g. the btree node lock, or the relevant allocator lock).
 	 */
 
-	lg_local_lock(&c->usage_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
 
 	if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
 	    gc_will_visit(c, pos))
 		flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
 
+	if (!stats)
+		stats = this_cpu_ptr(c->usage_percpu);
+
 	switch (k.k->type) {
 	case BCH_EXTENT:
 	case BCH_EXTENT_CACHED: {
@@ -693,7 +688,7 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
 		break;
 	}
 	}
-	lg_local_unlock(&c->usage_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 }
 
 /* Disk reservations: */
@@ -711,19 +706,19 @@ static u64 __recalc_sectors_available(struct bch_fs *c)
 /* Used by gc when it's starting: */
 void bch2_recalc_sectors_available(struct bch_fs *c)
 {
-	lg_global_lock(&c->usage_lock);
+	percpu_down_write(&c->usage_lock);
 	atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
-	lg_global_unlock(&c->usage_lock);
+	percpu_up_write(&c->usage_lock);
 }
 
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
-	lg_local_lock(&c->usage_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
 	this_cpu_sub(c->usage_percpu->online_reserved, res->sectors);
 
 	bch2_fs_stats_verify(c);
-	lg_local_unlock(&c->usage_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 
 	res->sectors = 0;
 }
@@ -738,7 +733,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 	s64 sectors_available;
 	int ret;
 
-	lg_local_lock(&c->usage_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
 	stats = this_cpu_ptr(c->usage_percpu);
 
 	if (sectors <= stats->available_cache)
@@ -750,7 +745,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 		get = min((u64) sectors + SECTORS_CACHE, old);
 
 		if (get < sectors) {
-			lg_local_unlock(&c->usage_lock);
+			percpu_up_read_preempt_enable(&c->usage_lock);
 			goto recalculate;
 		}
 	} while ((v = atomic64_cmpxchg(&c->sectors_available,
@@ -765,7 +760,7 @@ out:
 	bch2_disk_reservations_verify(c, flags);
 	bch2_fs_stats_verify(c);
-	lg_local_unlock(&c->usage_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 	return 0;
 
 recalculate:
@@ -785,8 +780,8 @@ recalculate:
 		else if (!down_read_trylock(&c->gc_lock))
 			return -EINTR;
 	}
-	lg_global_lock(&c->usage_lock);
+	percpu_down_write(&c->usage_lock);
 	sectors_available = __recalc_sectors_available(c);
 
 	if (sectors <= sectors_available ||
@@ -804,7 +799,8 @@ recalculate:
 	}
 
 	bch2_fs_stats_verify(c);
-	lg_global_unlock(&c->usage_lock);
+	percpu_up_write(&c->usage_lock);
+
 	if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
 		up_read(&c->gc_lock);
@@ -874,7 +870,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	if (resize) {
 		down_write(&c->gc_lock);
 		down_write(&ca->bucket_lock);
-		lg_global_lock(&c->usage_lock);
+		percpu_down_write(&c->usage_lock);
 	}
 
 	old_buckets = bucket_array(ca);
@@ -900,7 +896,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	swap(ca->buckets_dirty, buckets_dirty);
 
 	if (resize)
-		lg_global_unlock(&c->usage_lock);
+		percpu_up_write(&c->usage_lock);
 
 	spin_lock(&c->freelist_lock);
 	for (i = 0; i < RESERVE_NR; i++) {
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index aefe6027..4deb6c37 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -32,7 +32,7 @@ static inline struct bucket_array *bucket_array(struct bch_dev *ca)
 {
 	return rcu_dereference_check(ca->buckets,
 				     !ca->fs ||
-				     lockdep_is_held(&ca->fs->usage_lock) ||
+				     percpu_rwsem_is_held(&ca->fs->usage_lock) ||
 				     lockdep_is_held(&ca->fs->gc_lock) ||
 				     lockdep_is_held(&ca->bucket_lock));
 }
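
[Note: the disk reservation fast path touched in buckets.c above works by carving per-cpu batches out of a global atomic - each CPU grabs up to SECTORS_CACHE sectors from c->sectors_available under the usage_lock read side, then satisfies small reservations from its local cache. A stripped-down sketch of that pattern; SECTORS_CACHE_DEMO and the bare pointer parameters are stand-ins for the real fields:

#include <linux/atomic.h>

#define SECTORS_CACHE_DEMO	512

static int reserve_sectors(atomic64_t *sectors_available, u64 *cpu_cache,
			   u64 sectors)
{
	u64 old, get, v = atomic64_read(sectors_available);

	if (sectors <= *cpu_cache)
		goto out;

	do {
		old = v;
		/* grab the request plus a batch for this cpu: */
		get = min(sectors + SECTORS_CACHE_DEMO, old);

		if (get < sectors)
			return -ENOSPC;	/* the real code instead recalculates
					 * under percpu_down_write() */
	} while ((v = atomic64_cmpxchg(sectors_available,
				       old, old - get)) != old);

	*cpu_cache += get;
out:
	*cpu_cache -= sectors;
	return 0;
}
]
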
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index cd200cbe..87f3940e 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -176,6 +176,8 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
 	struct target t = target_decode(target);
 
 	switch (t.type) {
+	case TARGET_NULL:
+		return NULL;
 	case TARGET_DEV: {
 		struct bch_dev *ca = t.dev < c->sb.nr_devices
 			? rcu_dereference(c->devs[t.dev])
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1d9464af..d7b17195 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -1702,6 +1702,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 	struct bio *bio;
 	loff_t offset = req->ki_pos;
 	bool sync = is_sync_kiocb(req);
+	size_t shorten;
 	ssize_t ret;
 
 	if ((offset|iter->count) & (block_bytes(c) - 1))
@@ -1709,11 +1710,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 	ret = min_t(loff_t, iter->count,
 		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-	iov_iter_truncate(iter, round_up(ret, block_bytes(c)));
 
 	if (!ret)
 		return ret;
 
+	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+	iter->count -= shorten;
+
 	bio = bio_alloc_bioset(GFP_KERNEL,
 			       iov_iter_npages(iter, BIO_MAX_PAGES),
 			       &c->dio_read_bioset);
@@ -1769,6 +1772,8 @@ start:
 		bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
 	}
 
+	iter->count += shorten;
+
 	if (sync) {
 		closure_sync(&dio->cl);
 		closure_debug_destroy(&dio->cl);
@@ -1822,6 +1827,13 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 		if (unlikely(ret < 0))
 			goto err;
 
+		/* gup might have faulted pages back in: */
+		ret = write_invalidate_inode_pages_range(mapping,
+				req->ki_pos + (dio->iop.op.written << 9),
+				req->ki_pos + iov_iter_count(&dio->iter) - 1);
+		if (unlikely(ret))
+			goto err;
+
 		dio->iop.op.pos = POS(inode->v.i_ino,
 				(req->ki_pos >> 9) + dio->iop.op.written);
 
@@ -2280,7 +2292,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 	loff_t new_size;
 	int ret;
 
-	if ((offset | len) & (PAGE_SIZE - 1))
+	if ((offset | len) & (block_bytes(c) - 1))
 		return -EINVAL;
 
 	bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
@@ -2354,8 +2366,11 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 btree_iter_err:
 		if (ret == -EINTR)
 			ret = 0;
-		if (ret)
+		if (ret) {
+			bch2_btree_iter_unlock(&src);
+			bch2_btree_iter_unlock(&dst);
 			goto err_put_sectors_dirty;
+		}
 		/*
 		 * XXX: if we error here we've left data with multiple
 		 * pointers... which isn't a _super_ serious problem...
@@ -2368,7 +2383,7 @@ btree_iter_err:
 	bch2_btree_iter_unlock(&dst);
 
 	ret = bch2_inode_truncate(c, inode->v.i_ino,
-				  round_up(new_size, PAGE_SIZE) >> 9,
+				  round_up(new_size, block_bytes(c)) >> 9,
 				  &i_sectors_hook.hook,
 				  &inode->ei_journal_seq);
 	if (ret)
@@ -2381,9 +2396,6 @@ err_put_sectors_dirty:
 err:
 	pagecache_block_put(&mapping->add_lock);
 	inode_unlock(&inode->v);
-
-	bch2_btree_iter_unlock(&src);
-	bch2_btree_iter_unlock(&dst);
 	return ret;
 }
 
@@ -2483,7 +2495,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 					&i_sectors_hook.quota_res,
 					sectors, true);
 			if (unlikely(ret))
-				goto err_put_sectors_dirty;
+				goto btree_iter_err;
 		}
 
 		if (reservation.v.nr_replicas < replicas ||
@@ -2491,7 +2503,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 			ret = bch2_disk_reservation_get(c, &disk_res, sectors,
 							replicas, 0);
 			if (unlikely(ret))
-				goto err_put_sectors_dirty;
+				goto btree_iter_err;
 
 			reservation.v.nr_replicas = disk_res.nr_replicas;
 		}
@@ -2503,8 +2515,12 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 				BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
 		bch2_disk_reservation_put(c, &disk_res);
 btree_iter_err:
-		if (ret < 0 && ret != -EINTR)
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret) {
+			bch2_btree_iter_unlock(&iter);
 			goto err_put_sectors_dirty;
+		}
 	}
 	bch2_btree_iter_unlock(&iter);
 
@@ -2544,7 +2560,6 @@ btree_iter_err:
 err_put_sectors_dirty:
 	ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
 err:
-	bch2_btree_iter_unlock(&iter);
 	pagecache_block_put(&mapping->add_lock);
 	inode_unlock(&inode->v);
 	return ret;
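
[Note: the bch2_direct_IO_read() change above swaps iov_iter_truncate() for a manual shorten/restore of iter->count - apparently so the iterator can be handed to bio setup with a block-aligned count and still be returned to the caller with its original state. The save/restore pattern in isolation, with do_aligned_io() as a hypothetical stand-in for the bio submission:

#include <linux/uio.h>

/* Hypothetical callee standing in for bio alloc + submission: */
extern ssize_t do_aligned_io(struct iov_iter *iter);

static ssize_t with_shortened_iter(struct iov_iter *iter, size_t aligned_len)
{
	size_t shorten = iov_iter_count(iter) - aligned_len;
	ssize_t ret;

	iter->count -= shorten;		/* hide the unaligned tail */
	ret = do_aligned_io(iter);
	iter->count += shorten;		/* restore caller-visible state */

	return ret;
}
]
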
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index fb30f0d9..dc6c651d 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -243,13 +243,13 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
 	atomic_long_inc(&c->nr_inodes);
 
 	if (default_acl) {
-		ret = bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
+		ret = __bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
 		if (unlikely(ret))
 			goto err;
 	}
 
 	if (acl) {
-		ret = bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
+		ret = __bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
 		if (unlikely(ret))
 			goto err;
 	}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index c554a987..048b5c10 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -747,8 +747,13 @@ up:
 	}
 
 	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
-		if (k.k->type != BCH_INODE_FS ||
-		    !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+		if (k.k->type != BCH_INODE_FS)
+			continue;
+
+		if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+			continue;
+
+		if (!bch2_empty_dir(c, k.k->p.inode))
 			continue;
 
 		if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 3762fb92..f26d4041 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1698,9 +1698,9 @@ noclone:
 	if (!rbio->have_ioref)
 		goto no_device_postclone;
 
-	lg_local_lock(&c->usage_lock);
+	percpu_down_read_preempt_disable(&c->usage_lock);
 	bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
-	lg_local_unlock(&c->usage_lock);
+	percpu_up_read_preempt_enable(&c->usage_lock);
 
 	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
 		     bio_sectors(&rbio->bio));
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index ea67af3d..addd51f0 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -725,7 +725,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 		long bucket;
 
 		if (new_fs) {
+			percpu_down_read_preempt_disable(&c->usage_lock);
 			bucket = bch2_bucket_alloc_new_fs(ca);
+			percpu_up_read_preempt_enable(&c->usage_lock);
+
 			if (bucket < 0) {
 				ret = -ENOSPC;
 				goto err;
@@ -741,8 +744,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 			bucket = sector_to_bucket(ca, ob->ptr.offset);
 		}
 
-		if (c)
+		if (c) {
+			percpu_down_read_preempt_disable(&c->usage_lock);
 			spin_lock(&c->journal.lock);
+		}
 
 		__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
 		__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
@@ -759,9 +764,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 		}
 		ja->nr++;
 
-		if (c)
-			spin_unlock(&c->journal.lock);
-
 		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
 					  ca->mi.bucket_size,
 					  gc_phase(GC_PHASE_SB),
@@ -769,6 +771,11 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 					  ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
 					  : 0);
 
+		if (c) {
+			spin_unlock(&c->journal.lock);
+			percpu_up_read_preempt_enable(&c->usage_lock);
+		}
+
 		if (!new_fs)
 			bch2_open_bucket_put(c, ob);
 	}
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 8db8096e..8e655bc1 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -218,7 +218,7 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
 			return -ERANGE;
 		break;
 	case BCH_OPT_STR:
-		ret = bch2_read_string_list(val, opt->choices);
+		ret = match_string(opt->choices, -1, val);
 		if (ret < 0)
 			return ret;
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index f518c64c..999c49db 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -1,6 +1,61 @@
 #ifndef _BCACHEFS_SIX_H
 #define _BCACHEFS_SIX_H
 
+/*
+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
+ * semaphores, except with a third intermediate state, intent. Basic operations
+ * are:
+ *
+ *	six_lock_read(&foo->lock);
+ *	six_unlock_read(&foo->lock);
+ *
+ *	six_lock_intent(&foo->lock);
+ *	six_unlock_intent(&foo->lock);
+ *
+ *	six_lock_write(&foo->lock);
+ *	six_unlock_write(&foo->lock);
+ *
+ * Intent locks block other intent locks, but do not block read locks, and you
+ * must have an intent lock held before taking a write lock, like so:
+ *
+ *	six_lock_intent(&foo->lock);
+ *	six_lock_write(&foo->lock);
+ *	six_unlock_write(&foo->lock);
+ *	six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *
+ *	six_trylock_read()
+ *	six_trylock_intent()
+ *	six_trylock_write()
+ *
+ *	six_lock_downgrade():	convert from intent to read
+ *	six_lock_tryupgrade():	attempt to convert from read to intent
+ *
+ * Locks also embed a sequence number, which is incremented when the lock is
+ * locked or unlocked for write. The current sequence number can be grabbed
+ * while a lock is held from lock->state.seq; then, if you drop the lock you can
+ * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock
+ * iff it hasn't been locked for write in the meantime.
+ *
+ * There are also operations that take the lock type as a parameter, where the
+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
+ *
+ *	six_lock_type(lock, type)
+ *	six_unlock_type(lock, type)
+ *	six_relock(lock, type, seq)
+ *	six_trylock_type(lock, type)
+ *	six_trylock_convert(lock, from, to)
+ *
+ * A lock may be held multiple times by the same thread (for read or intent,
+ * not write) - up to SIX_LOCK_MAX_RECURSE. However, the six locks code does
+ * _not_ implement the actual recursive checks itself though - rather, if your
+ * code (e.g. btree iterator code) knows that the current thread already has a
+ * lock held, and for the correct type, six_lock_increment() may be used to
+ * bump up the counter for that type - the only effect is that one more call to
+ * unlock will be required before the lock is unlocked.
+ */
+
 #include
 #include
 #include
@@ -10,21 +65,6 @@
 
 #define SIX_LOCK_SEPARATE_LOCKFNS
 
-/*
- * LOCK STATES:
- *
- * read, intent, write (i.e. shared/intent/exclusive, hence the name)
- *
- * read and write work as with normal read/write locks - a lock can have
- * multiple readers, but write excludes reads and other write locks.
- *
- * Intent does not block read, but it does block other intent locks. The idea is
- * by taking an intent lock, you can then later upgrade to a write lock without
- * dropping your read lock and without deadlocking - because no other thread has
- * the intent lock and thus no other thread could be trying to take the write
- * lock.
- */
-
 union six_lock_state {
 	struct {
 		atomic64_t	counter;
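
[Note: the new six.h header comment above is the API reference; as a concrete illustration, here is the canonical take-intent-then-write pattern together with the sequence-number relock idiom it describes. struct foo and its fields are placeholders, and the six_relock_read() return convention is inferred from the comment:

#include "six.h"

struct foo {
	struct six_lock	lock;
	int		val;
};

static void update_foo(struct foo *f)
{
	/* intent: excludes other would-be writers, readers stay unblocked */
	six_lock_intent(&f->lock);

	/* write: keep the exclusive section short */
	six_lock_write(&f->lock);
	f->val++;
	six_unlock_write(&f->lock);

	six_unlock_intent(&f->lock);
}

static bool read_foo_then_relock(struct foo *f, int *out)
{
	u32 seq;

	six_lock_read(&f->lock);
	*out = f->val;
	seq = f->lock.state.seq;	/* per the comment above */
	six_unlock_read(&f->lock);

	/* ... later: retake iff no write happened in between;
	 * on success the caller owns a read lock again and must unlock it. */
	return six_relock_read(&f->lock, seq);
}
]
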
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 55da242c..1eab7c77 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -412,7 +412,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_io_clock_exit(&c->io_clock[WRITE]);
 	bch2_io_clock_exit(&c->io_clock[READ]);
 	bch2_fs_compress_exit(c);
-	lg_lock_free(&c->usage_lock);
+	percpu_free_rwsem(&c->usage_lock);
 	free_percpu(c->usage_percpu);
 	mempool_exit(&c->btree_bounce_pool);
 	bioset_exit(&c->btree_bio);
@@ -643,7 +643,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			offsetof(struct btree_write_bio, wbio.bio)),
 			BIOSET_NEED_BVECS) ||
 	    !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
-	    lg_lock_init(&c->usage_lock) ||
+	    percpu_init_rwsem(&c->usage_lock) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
 	    bch2_io_clock_init(&c->io_clock[READ]) ||
@@ -1215,6 +1215,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
 	if (ret)
 		return ret;
 
+	bch2_dev_sysfs_online(c, ca);
+
 	if (c->sb.nr_devices == 1)
 		bdevname(ca->disk_sb.bdev, c->name);
 	bdevname(ca->disk_sb.bdev, ca->name);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 5e341a71..66b5b9f9 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -921,7 +921,7 @@ STORE(bch2_dev)
 	}
 
 	if (attr == &sysfs_cache_replacement_policy) {
-		ssize_t v = bch2_read_string_list(buf, bch2_cache_replacement_policies);
+		ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
 
 		if (v < 0)
 			return v;
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 60e1f1ff..e263dd20 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -126,24 +126,6 @@ ssize_t bch2_scnprint_string_list(char *buf, size_t size,
 	return out - buf;
 }
 
-ssize_t bch2_read_string_list(const char *buf, const char * const list[])
-{
-	size_t i, len;
-
-	buf = skip_spaces(buf);
-
-	len = strlen(buf);
-	while (len && isspace(buf[len - 1]))
-		--len;
-
-	for (i = 0; list[i]; i++)
-		if (strlen(list[i]) == len &&
-		    !memcmp(buf, list[i], len))
-			break;
-
-	return list[i] ? i : -EINVAL;
-}
-
 ssize_t bch2_scnprint_flag_list(char *buf, size_t size,
 				const char * const list[], u64 flags)
 {
@@ -178,7 +160,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
 	s = strim(d);
 
 	while ((p = strsep(&s, ","))) {
-		int flag = bch2_read_string_list(p, list);
+		int flag = match_string(list, -1, p);
 		if (flag < 0) {
 			ret = -1;
 			break;
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 18491559..487591c4 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -227,57 +227,6 @@ do {									\
 	heap_sift_down(heap, _i, cmp);					\
 } while (0)
 
-/*
- * Simple array based allocator - preallocates a number of elements and you can
- * never allocate more than that, also has no locking.
- *
- * Handy because if you know you only need a fixed number of elements you don't
- * have to worry about memory allocation failure, and sometimes a mempool isn't
- * what you want.
- *
- * We treat the free elements as entries in a singly linked list, and the
- * freelist as a stack - allocating and freeing push and pop off the freelist.
- */
-
-#define DECLARE_ARRAY_ALLOCATOR(type, name, size)			\
-	struct {							\
-		type	*freelist;					\
-		type	data[size];					\
-	} name
-
-#define array_alloc(array)						\
-({									\
-	typeof((array)->freelist) _ret = (array)->freelist;		\
-									\
-	if (_ret)							\
-		(array)->freelist = *((typeof((array)->freelist) *) _ret);\
-									\
-	_ret;								\
-})
-
-#define array_free(array, ptr)						\
-do {									\
-	typeof((array)->freelist) _ptr = ptr;				\
-									\
-	*((typeof((array)->freelist) *) _ptr) = (array)->freelist;	\
-	(array)->freelist = _ptr;					\
-} while (0)
-
-#define array_allocator_init(array)					\
-do {									\
-	typeof((array)->freelist) _i;					\
-									\
-	BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));	\
-	(array)->freelist = NULL;					\
-									\
-	for (_i = (array)->data;					\
-	     _i < (array)->data + ARRAY_SIZE((array)->data);		\
-	     _i++)							\
-		array_free(array, _i);					\
-} while (0)
-
-#define array_freelist_empty(array)	((array)->freelist == NULL)
-
 #define ANYSINT_MAX(t)							\
 	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
 
@@ -359,8 +308,6 @@ bool bch2_is_zero(const void *, size_t);
 ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[],
 				  size_t);
 
-ssize_t bch2_read_string_list(const char *, const char * const[]);
-
 ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[],
 				u64);
 u64 bch2_read_flag_list(char *, const char * const[]);
diff --git a/linux/string.c b/linux/string.c
index 0f23f074..4fa3f64b 100644
--- a/linux/string.c
+++ b/linux/string.c
@@ -95,3 +95,19 @@ void memzero_explicit(void *s, size_t count)
 	memset(s, 0, count);
 	barrier_data(s);
 }
+
+int match_string(const char * const *array, size_t n, const char *string)
+{
+	int index;
+	const char *item;
+
+	for (index = 0; index < n; index++) {
+		item = array[index];
+		if (!item)
+			break;
+		if (!strcmp(item, string))
+			return index;
+	}
+
+	return -EINVAL;
+}
diff --git a/tools-util.c b/tools-util.c
index 8474ab06..ca6d89a5 100644
--- a/tools-util.c
+++ b/tools-util.c
@@ -218,7 +218,7 @@ u64 read_file_u64(int dirfd, const char *path)
 ssize_t read_string_list_or_die(const char *opt, const char * const list[],
 				const char *msg)
 {
-	ssize_t v = match_string(list, -1, opt);
+	ssize_t v = match_string(list, -1, opt);
 	if (v < 0)
 		die("Bad %s %s", msg, opt);
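
[Note: match_string() replaces bch2_read_string_list() with the kernel's semantics - n is the array size, or (size_t)-1 to scan until a NULL sentinel, and the return value is the matched index or -EINVAL. One behavioral difference: bch2_read_string_list() trimmed surrounding whitespace, while match_string() compares exactly; bch2_read_flag_list() still strim()s its input first, and the sysfs store goes through __sysfs_match_string(), which tolerates a trailing newline. A small usage sketch mirroring the converted call sites, with a hypothetical choice list:

#include <linux/string.h>
#include <stdio.h>

static const char * const policies[] = {
	"lru", "fifo", "random", NULL	/* hypothetical choice list */
};

int main(void)
{
	/* n == -1: scan until the NULL sentinel, as at the call sites above */
	int idx = match_string(policies, -1, "fifo");

	printf("%d\n", idx);	/* prints 1; unknown strings yield -EINVAL */
	return 0;
}
]
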