Update bcachefs sources to ad68801b93 bcachefs: Use pcpu mode of six locks for interior nodes

Kent Overstreet 2021-03-24 22:13:00 -04:00
parent 9f69a652dc
commit c88113e255
8 changed files with 301 additions and 111 deletions
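The imported change teaches the six locks (shared/intent/exclusive) an optional per-CPU reader mode and turns it on for interior btree nodes, so read locks on read-mostly nodes stop bouncing the lock's atomic state word between CPUs. Below is a minimal userspace model of the handshake the new six.c code relies on (and that its comment block describes): a reader announces itself in its own per-CPU slot, a writer announces itself in the shared state, and each side issues a full memory barrier before checking for the other. All names here (reader_active, writer_active, pcpu_trylock_read/write, NR_FAKE_CPUS) are illustrative; this is a sketch of the idea, not the kernel API.

#include <stdatomic.h>
#include <stdbool.h>

#define NR_FAKE_CPUS 4

static _Atomic unsigned reader_active[NR_FAKE_CPUS];	/* stands in for lock->readers */
static _Atomic bool writer_active;			/* stands in for write_locking/__SIX_LOCK_HELD_write */

/* Reader: mark our own slot, full barrier, then look for a writer. */
static bool pcpu_trylock_read(int cpu)
{
	atomic_fetch_add_explicit(&reader_active[cpu], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairs with the writer's fence below */

	if (atomic_load_explicit(&writer_active, memory_order_relaxed)) {
		/* raced with a writer: back out, caller retries or sleeps */
		atomic_fetch_sub_explicit(&reader_active[cpu], 1, memory_order_relaxed);
		return false;
	}
	return true;
}

/* Writer: advertise intent, full barrier, then sum the per-CPU reader slots. */
static bool pcpu_trylock_write(void)
{
	atomic_store_explicit(&writer_active, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);

	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		if (atomic_load_explicit(&reader_active[cpu], memory_order_relaxed)) {
			atomic_store_explicit(&writer_active, false, memory_order_relaxed);
			return false;			/* a reader got in first */
		}
	return true;
}

Because both sides fence between publishing themselves and checking the other, at most one of them can succeed; both can fail at once, and the real code below resolves that with the write_locking bit, the waiter bits, and the wait lists.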

View File

@@ -1 +1 @@
c7defb5793039b55066e8e9d41e76bae826a7894
ad68801b939cdda0530f54cd07b3212e98fe1d75

View File

@@ -80,7 +80,8 @@ union six_lock_state {
};
struct {
unsigned read_lock:28;
unsigned read_lock:27;
unsigned write_locking:1;
unsigned intent_lock:1;
unsigned waiters:3;
/*
@@ -107,6 +108,7 @@ struct six_lock {
unsigned intent_lock_recurse;
struct task_struct *owner;
struct optimistic_spin_queue osq;
unsigned __percpu *readers;
raw_spinlock_t wait_lock;
struct list_head wait_list[2];
@@ -194,4 +196,7 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
#endif /* _LINUX_SIX_H */
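The six.h changes above shrink read_lock to 27 bits to make room for the new write_locking flag, add the unsigned __percpu *readers pointer, and declare six_lock_pcpu_alloc()/six_lock_pcpu_free() for switching a lock's read side between the shared atomic counter and per-CPU counting. If the percpu allocation fails, lock->readers stays NULL and the lock simply keeps using the atomic path. A rough call-pattern sketch follows; six_lock_init() is assumed from the existing header (it is not part of this diff), and both mode switches must happen with no read locks held, which the new BUG_ONs in six.c verify.

/* Call-pattern sketch only; the surrounding object and error handling are omitted. */
static void six_lock_pcpu_mode_example(struct six_lock *lock)
{
	six_lock_init(lock);		/* existing six.h initializer, assumed here */

	/* Read-mostly object: count readers in per-CPU slots instead of state.read_lock */
	six_lock_pcpu_alloc(lock);

	/* ... take and release read/intent/write locks as before ... */

	/* Drop back to the plain counter, e.g. before the object is reused or freed */
	six_lock_pcpu_free(lock);
}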

View File

@@ -146,6 +146,11 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
b->c.level = level;
b->c.btree_id = id;
if (level)
six_lock_pcpu_alloc(&b->c.lock);
else
six_lock_pcpu_free(&b->c.lock);
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
@@ -386,6 +391,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
while (!list_empty(&bc->freed)) {
b = list_first_entry(&bc->freed, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
}
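The two hunks above set the policy: only interior nodes (level > 0) get per-CPU readers, and a cached struct btree that gets rehashed as a leaf drops back to the plain counter, presumably because the btree cache reuses these structs (and their locks) via its freed list; bch2_fs_btree_cache_exit() likewise frees any percpu counters before kfree() so the allocation is not leaked. A condensed restatement of that policy, not new API (the helper name is made up):

/* Hypothetical helper summarizing the hunks above. */
static void btree_node_pick_lock_mode(struct btree *b, unsigned level)
{
	if (level)		/* interior node: read-mostly, worth per-CPU readers */
		six_lock_pcpu_alloc(&b->c.lock);
	else			/* leaf, or a reused node dropping back to level 0 */
		six_lock_pcpu_free(&b->c.lock);
}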

View File

@@ -167,7 +167,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct extent_ptr_decoded p = { 0 };
bool do_update = false;
int ret = 0;

View File

@@ -988,6 +988,11 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
if (b->c.level)
six_lock_pcpu_alloc(&b->c.lock);
else
six_lock_pcpu_free(&b->c.lock);
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||

View File

@@ -632,7 +632,7 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct extent_ptr_decoded p = { 0 };
unsigned replicas = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {

View File

@@ -883,7 +883,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
ret = bch2_move_btree(c,
0, POS_MIN,
BTREE_ID_NR, POS_MAX,
rewrite_old_nodes_pred, c, stats) ?: ret;
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE;

View File

@@ -2,6 +2,7 @@
#include <linux/export.h>
#include <linux/log2.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
@@ -41,7 +42,7 @@ struct six_lock_vals {
#define LOCK_VALS { \
[SIX_LOCK_read] = { \
.lock_val = __SIX_VAL(read_lock, 1), \
.lock_fail = __SIX_LOCK_HELD_write, \
.lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
.unlock_val = -__SIX_VAL(read_lock, 1), \
.held_mask = __SIX_LOCK_HELD_read, \
.unlock_wakeup = SIX_LOCK_write, \
@@ -76,36 +77,195 @@ static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
}
}
static inline unsigned pcpu_read_count(struct six_lock *lock)
{
unsigned read_count = 0;
int cpu;
for_each_possible_cpu(cpu)
read_count += *per_cpu_ptr(lock->readers, cpu);
return read_count;
}
struct six_lock_waiter {
struct list_head list;
struct task_struct *task;
};
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
static inline void six_lock_wakeup(struct six_lock *lock,
union six_lock_state state,
unsigned waitlist_id)
{
if (waitlist_id == SIX_LOCK_write) {
if (state.write_locking && !state.read_lock) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
}
} else {
struct list_head *wait_list = &lock->wait_list[waitlist_id];
struct six_lock_waiter *w, *next;
if (!(state.waiters & (1 << waitlist_id)))
return;
clear_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
raw_spin_lock(&lock->wait_lock);
list_for_each_entry_safe(w, next, wait_list, list) {
list_del_init(&w->list);
if (wake_up_process(w->task) &&
waitlist_id != SIX_LOCK_read) {
if (!list_empty(wait_list))
set_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
break;
}
}
raw_spin_unlock(&lock->wait_lock);
}
}
static __always_inline bool do_six_trylock_type(struct six_lock *lock,
enum six_lock_type type)
enum six_lock_type type,
bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old;
u64 v = READ_ONCE(lock->state.v);
union six_lock_state old, new;
bool ret;
u64 v;
EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
do {
old.v = v;
EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
EBUG_ON(type == SIX_LOCK_write &&
((old.v & __SIX_LOCK_HELD_write) ||
!(old.v & __SIX_LOCK_HELD_intent)));
/*
* Percpu reader mode:
*
* The basic idea behind this algorithm is that you can implement a lock
* between two threads without any atomics, just memory barriers:
*
* For two threads you'll need two variables, one variable for "thread a
* has the lock" and another for "thread b has the lock".
*
* To take the lock, a thread sets its variable indicating that it holds
* the lock, then issues a full memory barrier, then reads from the
* other thread's variable to check if the other thread thinks it has
* the lock. If we raced, we backoff and retry/sleep.
*/
if (old.v & l[type].lock_fail)
return false;
} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
old.v,
old.v + l[type].lock_val)) != old.v);
if (type == SIX_LOCK_read && lock->readers) {
retry:
preempt_disable();
this_cpu_inc(*lock->readers); /* signal that we own lock */
six_set_owner(lock, type, old);
return true;
smp_mb();
old.v = READ_ONCE(lock->state.v);
ret = !(old.v & l[type].lock_fail);
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
/*
* If we failed because a writer was trying to take the
* lock, issue a wakeup because we might have caused a
* spurious trylock failure:
*/
if (old.write_locking) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
}
/*
* If we failed from the lock path and the waiting bit wasn't
* set, set it:
*/
if (!try && !ret) {
v = old.v;
do {
new.v = old.v = v;
if (!(old.v & l[type].lock_fail))
goto retry;
if (new.waiters & (1 << type))
break;
new.waiters |= 1 << type;
} while ((v = atomic64_cmpxchg(&lock->state.counter,
old.v, new.v)) != old.v);
}
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic64_add(__SIX_VAL(write_locking, 1),
&lock->state.counter);
smp_mb__after_atomic();
}
ret = !pcpu_read_count(lock);
/*
* On success, we increment lock->seq; also we clear
* write_locking unless we failed from the lock path:
*/
v = 0;
if (ret)
v += __SIX_VAL(seq, 1);
if (ret || try)
v -= __SIX_VAL(write_locking, 1);
if (try && !ret) {
old.v = atomic64_add_return(v, &lock->state.counter);
six_lock_wakeup(lock, old, SIX_LOCK_read);
} else {
atomic64_add(v, &lock->state.counter);
}
} else {
v = READ_ONCE(lock->state.v);
do {
new.v = old.v = v;
if (!(old.v & l[type].lock_fail)) {
new.v += l[type].lock_val;
if (type == SIX_LOCK_write)
new.write_locking = 0;
} else if (!try && type != SIX_LOCK_write &&
!(new.waiters & (1 << type)))
new.waiters |= 1 << type;
else
break; /* waiting bit already set */
} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
old.v, new.v)) != old.v);
ret = !(old.v & l[type].lock_fail);
}
if (ret)
six_set_owner(lock, type, old);
EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
return ret;
}
__always_inline __flatten
static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
if (!do_six_trylock_type(lock, type))
if (!do_six_trylock_type(lock, type, true))
return false;
if (type != SIX_LOCK_write)
@@ -119,8 +279,40 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old;
u64 v = READ_ONCE(lock->state.v);
u64 v;
EBUG_ON(type == SIX_LOCK_write);
if (type == SIX_LOCK_read &&
lock->readers) {
bool ret;
preempt_disable();
this_cpu_inc(*lock->readers);
smp_mb();
old.v = READ_ONCE(lock->state.v);
ret = !(old.v & l[type].lock_fail) && old.seq == seq;
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
/*
* Similar to the lock path, we may have caused a spurious write
* lock fail and need to issue a wakeup:
*/
if (old.write_locking) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
}
return ret;
}
v = READ_ONCE(lock->state.v);
do {
old.v = v;
@@ -136,14 +328,6 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
return true;
}
struct six_lock_waiter {
struct list_head list;
struct task_struct *task;
};
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
static inline int six_can_spin_on_owner(struct six_lock *lock)
@@ -218,7 +402,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
if (owner && !six_spin_on_owner(lock, owner))
break;
if (do_six_trylock_type(lock, type)) {
if (do_six_trylock_type(lock, type, false)) {
osq_unlock(&lock->osq);
preempt_enable();
return true;
@@ -270,18 +454,22 @@ noinline
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
union six_lock_state old;
struct six_lock_waiter wait;
int ret = 0;
u64 v;
if (type == SIX_LOCK_write) {
EBUG_ON(lock->state.write_locking);
atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
smp_mb__after_atomic();
}
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
return ret;
goto out_before_sleep;
if (six_optimistic_spin(lock, type))
return 0;
goto out_before_sleep;
lock_contended(&lock->dep_map, _RET_IP_);
@@ -298,32 +486,16 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
raw_spin_unlock(&lock->wait_lock);
}
if (do_six_trylock_type(lock, type, false))
break;
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
break;
v = READ_ONCE(lock->state.v);
do {
new.v = old.v = v;
if (!(old.v & l[type].lock_fail))
new.v += l[type].lock_val;
else if (!(new.waiters & (1 << type)))
new.waiters |= 1 << type;
else
break; /* waiting bit already set */
} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
old.v, new.v)) != old.v);
if (!(old.v & l[type].lock_fail))
break;
schedule();
}
if (!ret)
six_set_owner(lock, type, old);
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.list)) {
@@ -331,6 +503,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
list_del_init(&wait.list);
raw_spin_unlock(&lock->wait_lock);
}
out_before_sleep:
if (ret && type == SIX_LOCK_write) {
old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
&lock->state.counter);
six_lock_wakeup(lock, old, SIX_LOCK_read);
}
return ret;
}
@@ -344,7 +522,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 0);
ret = do_six_trylock_type(lock, type) ? 0
ret = do_six_trylock_type(lock, type, true) ? 0
: __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
if (ret && type != SIX_LOCK_write)
@@ -355,54 +533,12 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
return ret;
}
static inline void six_lock_wakeup(struct six_lock *lock,
union six_lock_state state,
unsigned waitlist_id)
{
struct list_head *wait_list = &lock->wait_list[waitlist_id];
struct six_lock_waiter *w, *next;
if (waitlist_id == SIX_LOCK_write && state.read_lock)
return;
if (!(state.waiters & (1 << waitlist_id)))
return;
clear_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
if (waitlist_id == SIX_LOCK_write) {
struct task_struct *p = READ_ONCE(lock->owner);
if (p)
wake_up_process(p);
return;
}
raw_spin_lock(&lock->wait_lock);
list_for_each_entry_safe(w, next, wait_list, list) {
list_del_init(&w->list);
if (wake_up_process(w->task) &&
waitlist_id != SIX_LOCK_read) {
if (!list_empty(wait_list))
set_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
break;
}
}
raw_spin_unlock(&lock->wait_lock);
}
__always_inline __flatten
static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
EBUG_ON(!(lock->state.v & l[type].held_mask));
EBUG_ON(type == SIX_LOCK_write &&
!(lock->state.v & __SIX_LOCK_HELD_intent));
@@ -420,8 +556,17 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type)
lock->owner = NULL;
}
state.v = atomic64_add_return_release(l[type].unlock_val,
&lock->state.counter);
if (type == SIX_LOCK_read &&
lock->readers) {
smp_mb(); /* unlock barrier */
this_cpu_dec(*lock->readers);
state.v = READ_ONCE(lock->state.v);
} else {
EBUG_ON(!(lock->state.v & l[type].held_mask));
state.v = atomic64_add_return_release(l[type].unlock_val,
&lock->state.counter);
}
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
@@ -467,26 +612,28 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade);
bool six_lock_tryupgrade(struct six_lock *lock)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
u64 v = READ_ONCE(lock->state.v);
do {
new.v = old.v = v;
EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask));
new.v += l[SIX_LOCK_read].unlock_val;
if (new.v & l[SIX_LOCK_intent].lock_fail)
if (new.intent_lock)
return false;
new.v += l[SIX_LOCK_intent].lock_val;
if (!lock->readers) {
EBUG_ON(!new.read_lock);
new.read_lock--;
}
new.intent_lock = 1;
} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
old.v, new.v)) != old.v);
if (lock->readers)
this_cpu_dec(*lock->readers);
six_set_owner(lock, SIX_LOCK_intent, old);
six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup);
return true;
}
@@ -518,16 +665,22 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
EBUG_ON(type == SIX_LOCK_write);
six_acquire(&lock->dep_map, 0);
/* XXX: assert already locked, and that we don't overflow: */
switch (type) {
case SIX_LOCK_read:
atomic64_add(l[type].lock_val, &lock->state.counter);
if (lock->readers) {
this_cpu_inc(*lock->readers);
} else {
EBUG_ON(!lock->state.read_lock &&
!lock->state.intent_lock);
atomic64_add(l[type].lock_val, &lock->state.counter);
}
break;
case SIX_LOCK_intent:
EBUG_ON(!lock->state.intent_lock);
lock->intent_lock_recurse++;
break;
case SIX_LOCK_write:
@@ -551,3 +704,24 @@ void six_lock_wakeup_all(struct six_lock *lock)
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
BUG_ON(lock->state.read_lock);
free_percpu(lock->readers);
lock->readers = NULL;
}
EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
void six_lock_pcpu_alloc(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
BUG_ON(lock->state.read_lock);
#ifdef __KERNEL__
if (!lock->readers)
lock->readers = alloc_percpu(unsigned);
#endif
}
EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);