Update bcachefs sources to 5a0455ae19af bcachefs: bcachefs_metadata_version_snapshot_deletion_v2

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-05-02 13:39:43 -04:00
parent a4babd1d64
commit 8376810564
33 changed files with 855 additions and 529 deletions

View File

@ -1 +1 @@
9b4ab159abcd84cf0c25ee851dda8c40baffecc8
5a0455ae19afb354634b3c5c9bf55d2171005a2f

View File

@ -4,14 +4,12 @@
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/const.h>
/*
* In the fallback code below, we need to compute the minimum and
* maximum values representable in a given type. These macros may also
* be useful elsewhere, so we provide them outside the
* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
*
* It would seem more obvious to do something like
* We need to compute the minimum and maximum values representable in a given
* type. These macros may also be useful elsewhere. It would seem more obvious
* to do something like:
*
* #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
* #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
@ -33,8 +31,10 @@
* credit to Christian Biere.
*/
#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
#define type_min(T) ((T)((T)-type_max(T)-(T)1))
#define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
#define type_max(t) __type_max(typeof(t))
#define __type_min(T) ((T)((T)-type_max(T)-(T)1))
#define type_min(t) __type_min(typeof(t))
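For a quick feel of what the half-max trick yields, here is a standalone userspace sketch (not part of this patch; the sketch_* names and the main() harness are invented for illustration):

/* Standalone sketch of the type_max()/type_min() construction above. */
#include <stdint.h>
#include <stdio.h>

#define sketch_is_signed(type)	(((type)(-1)) < (type)1)
#define sketch_half_max(type)	((type)1 << (8*sizeof(type) - 1 - sketch_is_signed(type)))
#define sketch_type_max(T)	((T)((sketch_half_max(T) - 1) + sketch_half_max(T)))
#define sketch_type_min(T)	((T)((T)-sketch_type_max(T) - (T)1))

int main(void)
{
	printf("uint8_t: min %u max %u\n",
	       (unsigned)sketch_type_min(uint8_t),
	       (unsigned)sketch_type_max(uint8_t));	/* 0 255 */
	printf("int16_t: min %d max %d\n",
	       (int)sketch_type_min(int16_t),
	       (int)sketch_type_max(int16_t));		/* -32768 32767 */
	return 0;
}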
/*
* Avoids triggering -Wtype-limits compilation warning,
@ -53,194 +53,153 @@ static inline bool __must_check __must_check_overflow(bool overflow)
return unlikely(overflow);
}
#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
/*
* For simplicity and code hygiene, the fallback code below insists on
* a, b and *d having the same type (similar to the min() and max()
* macros), whereas gcc's type-generic overflow checkers accept
* different types. Hence we don't just make check_add_overflow an
* alias for __builtin_add_overflow, but add type checks similar to
* below.
*/
#define check_add_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_add_overflow(__a, __b, __d); \
}))
#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_sub_overflow(__a, __b, __d); \
}))
#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_mul_overflow(__a, __b, __d); \
}))
#else
/* Checking for unsigned overflow is relatively easy without causing UB. */
#define __unsigned_add_overflow(a, b, d) ({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
*__d = __a + __b; \
*__d < __a; \
})
#define __unsigned_sub_overflow(a, b, d) ({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
*__d = __a - __b; \
__a < __b; \
})
/*
* If one of a or b is a compile-time constant, this avoids a division.
*/
#define __unsigned_mul_overflow(a, b, d) ({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
*__d = __a * __b; \
__builtin_constant_p(__b) ? \
__b > 0 && __a > type_max(typeof(__a)) / __b : \
__a > 0 && __b > type_max(typeof(__b)) / __a; \
})
/*
* For signed types, detecting overflow is much harder, especially if
* we want to avoid UB. But the interface of these macros is such that
* we must provide a result in *d, and in fact we must produce the
* result promised by gcc's builtins, which is simply the possibly
* wrapped-around value. Fortunately, we can just formally do the
* operations in the widest relevant unsigned type (u64) and then
* truncate the result - gcc is smart enough to generate the same code
* with and without the (u64) casts.
*/
/*
* Adding two signed integers can overflow only if they have the same
* sign, and overflow has happened iff the result has the opposite
* sign.
*/
#define __signed_add_overflow(a, b, d) ({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
*__d = (u64)__a + (u64)__b; \
(((~(__a ^ __b)) & (*__d ^ __a)) \
& type_min(typeof(__a))) != 0; \
})
/*
* Subtraction is similar, except that overflow can now happen only
* when the signs are opposite. In this case, overflow has happened if
* the result has the opposite sign of a.
*/
#define __signed_sub_overflow(a, b, d) ({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
*__d = (u64)__a - (u64)__b; \
((((__a ^ __b)) & (*__d ^ __a)) \
& type_min(typeof(__a))) != 0; \
})
/*
* Signed multiplication is rather hard. gcc always follows C99, so
* division is truncated towards 0. This means that we can write the
* overflow check like this:
/**
* check_add_overflow() - Calculate addition with overflow checking
* @a: first addend
* @b: second addend
* @d: pointer to store sum
*
* (a > 0 && (b > MAX/a || b < MIN/a)) ||
* (a < -1 && (b > MIN/a || b < MAX/a) ||
* (a == -1 && b == MIN)
* Returns true on wrap-around, false otherwise.
*
* The redundant casts of -1 are to silence an annoying -Wtype-limits
* (included in -Wextra) warning: When the type is u8 or u16, the
* __b_c_e in check_mul_overflow obviously selects
* __unsigned_mul_overflow, but unfortunately gcc still parses this
* code and warns about the limited range of __b.
* *@d holds the results of the attempted addition, regardless of whether
* wrap-around occurred.
*/
#define check_add_overflow(a, b, d) \
__must_check_overflow(__builtin_add_overflow(a, b, d))
#define __signed_mul_overflow(a, b, d) ({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
typeof(a) __tmax = type_max(typeof(a)); \
typeof(a) __tmin = type_min(typeof(a)); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
*__d = (u64)__a * (u64)__b; \
(__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \
(__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \
(__b == (typeof(__b))-1 && __a == __tmin); \
})
#define check_add_overflow(a, b, d) __must_check_overflow( \
__builtin_choose_expr(is_signed_type(typeof(a)), \
__signed_add_overflow(a, b, d), \
__unsigned_add_overflow(a, b, d)))
#define check_sub_overflow(a, b, d) __must_check_overflow( \
__builtin_choose_expr(is_signed_type(typeof(a)), \
__signed_sub_overflow(a, b, d), \
__unsigned_sub_overflow(a, b, d)))
#define check_mul_overflow(a, b, d) __must_check_overflow( \
__builtin_choose_expr(is_signed_type(typeof(a)), \
__signed_mul_overflow(a, b, d), \
__unsigned_mul_overflow(a, b, d)))
#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
/** check_shl_overflow() - Calculate a left-shifted value and check overflow
/**
* wrapping_add() - Intentionally perform a wrapping addition
* @type: type for result of calculation
* @a: first addend
* @b: second addend
*
* Return the potentially wrapped-around addition without
* tripping any wrap-around sanitizers that may be enabled.
*/
#define wrapping_add(type, a, b) \
({ \
type __val; \
__builtin_add_overflow(a, b, &__val); \
__val; \
})
/**
* wrapping_assign_add() - Intentionally perform a wrapping increment assignment
* @var: variable to be incremented
* @offset: amount to add
*
* Increments @var by @offset with wrap-around. Returns the resulting
* value of @var. Will not trip any wrap-around sanitizers.
*
* Returns the new value of @var.
*/
#define wrapping_assign_add(var, offset) \
({ \
typeof(var) *__ptr = &(var); \
*__ptr = wrapping_add(typeof(var), *__ptr, offset); \
})
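A minimal userspace sketch of how the checked and wrapping flavours differ, using the same GCC/Clang builtins these macros wrap (the variable names are invented; this is an illustration, not the kernel code):

/* Sketch: checked addition vs. intentional wrap-around on a u8-sized value. */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	uint8_t sum;
	/* check_add_overflow() style: the caller must look at the return value. */
	bool ovf = __builtin_add_overflow((uint8_t)200, (uint8_t)100, &sum);
	printf("ovf=%d sum=%u\n", ovf, (unsigned)sum);		/* ovf=1 sum=44 */

	/* wrapping_add()/wrapping_assign_add() style: the wrapped value is the point,
	 * and no wrap-around sanitizer fires because the builtin defines the wrap. */
	uint8_t counter = 250;
	(void)__builtin_add_overflow(counter, 10, &counter);
	printf("counter=%u\n", (unsigned)counter);		/* 4 */
	return 0;
}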
/**
* check_sub_overflow() - Calculate subtraction with overflow checking
* @a: minuend; value to subtract from
* @b: subtrahend; value to subtract from @a
* @d: pointer to store difference
*
* Returns true on wrap-around, false otherwise.
*
* *@d holds the results of the attempted subtraction, regardless of whether
* wrap-around occurred.
*/
#define check_sub_overflow(a, b, d) \
__must_check_overflow(__builtin_sub_overflow(a, b, d))
/**
* wrapping_sub() - Intentionally perform a wrapping subtraction
* @type: type for result of calculation
* @a: minuend; value to subtract from
* @b: subtrahend; value to subtract from @a
*
* Return the potentially wrapped-around subtraction without
* tripping any wrap-around sanitizers that may be enabled.
*/
#define wrapping_sub(type, a, b) \
({ \
type __val; \
__builtin_sub_overflow(a, b, &__val); \
__val; \
})
/**
* wrapping_assign_sub() - Intentionally perform a wrapping decrement assign
* @var: variable to be decremented
* @offset: amount to subtract
*
* Decrements @var by @offset with wrap-around. Returns the resulting
* value of @var. Will not trip any wrap-around sanitizers.
*
* Returns the new value of @var.
*/
#define wrapping_assign_sub(var, offset) \
({ \
typeof(var) *__ptr = &(var); \
*__ptr = wrapping_sub(typeof(var), *__ptr, offset); \
})
/**
* check_mul_overflow() - Calculate multiplication with overflow checking
* @a: first factor
* @b: second factor
* @d: pointer to store product
*
* Returns true on wrap-around, false otherwise.
*
* *@d holds the results of the attempted multiplication, regardless of whether
* wrap-around occurred.
*/
#define check_mul_overflow(a, b, d) \
__must_check_overflow(__builtin_mul_overflow(a, b, d))
/**
* wrapping_mul() - Intentionally perform a wrapping multiplication
* @type: type for result of calculation
* @a: first factor
* @b: second factor
*
* Return the potentially wrapped-around multiplication without
* tripping any wrap-around sanitizers that may be enabled.
*/
#define wrapping_mul(type, a, b) \
({ \
type __val; \
__builtin_mul_overflow(a, b, &__val); \
__val; \
})
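The subtraction and multiplication helpers behave the same way; a tiny userspace sketch for the multiply case (illustrative only):

/* Sketch: check_mul_overflow()/wrapping_mul() semantics on a 16-bit product. */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	uint16_t prod;
	bool ovf = __builtin_mul_overflow((uint16_t)300, (uint16_t)300, &prod);
	printf("ovf=%d prod=%u\n", ovf, (unsigned)prod);	/* ovf=1 prod=24464 (90000 mod 65536) */
	return 0;
}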
/**
* check_shl_overflow() - Calculate a left-shifted value and check overflow
* @a: Value to be shifted
* @s: How many bits left to shift
* @d: Pointer to where to store the result
*
* Computes *@d = (@a << @s)
*
* Returns true if '*d' cannot hold the result or when 'a << s' doesn't
* Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't
* make sense. Example conditions:
* - 'a << s' causes bits to be lost when stored in *d.
* - 's' is garbage (e.g. negative) or so large that the result of
* 'a << s' is guaranteed to be 0.
* - 'a' is negative.
* - 'a << s' sets the sign bit, if any, in '*d'.
*
* '*d' will hold the results of the attempted shift, but is not
* considered "safe for use" if false is returned.
* - '@a << @s' causes bits to be lost when stored in *@d.
* - '@s' is garbage (e.g. negative) or so large that the result of
* '@a << @s' is guaranteed to be 0.
* - '@a' is negative.
* - '@a << @s' sets the sign bit, if any, in '*@d'.
*
* '*@d' will hold the results of the attempted shift, but is not
* considered "safe for use" if true is returned.
*/
#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
typeof(a) _a = a; \
typeof(s) _s = s; \
typeof(d) _d = d; \
u64 _a_full = _a; \
unsigned long long _a_full = _a; \
unsigned int _to_shift = \
is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \
*_d = (_a_full << _to_shift); \
@ -248,9 +207,115 @@ static inline bool __must_check __must_check_overflow(bool overflow)
(*_d >> _to_shift) != _a); \
}))
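A small userspace sketch of the same rules check_shl_overflow() applies (it mirrors the macro rather than using it; the function and variable names are invented):

/* Sketch: reject left shifts that lose bits or use a nonsense shift count. */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

static bool shl_overflows_u32(uint32_t a, unsigned int s, uint32_t *d)
{
	unsigned int to_shift = s < 32 ? s : 0;	/* out-of-range counts shift by 0 */
	*d = a << to_shift;
	/* overflow if the count was clamped, or if shifting back loses bits */
	return to_shift != s || (*d >> to_shift) != a;
}

int main(void)
{
	uint32_t out;
	printf("%d\n", shl_overflows_u32(1, 4, &out));			/* 0: fits */
	printf("%d\n", shl_overflows_u32(0x80000000u, 1, &out));	/* 1: top bit lost */
	printf("%d\n", shl_overflows_u32(1, 40, &out));			/* 1: bogus shift count */
	return 0;
}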
#define __overflows_type_constexpr(x, T) ( \
is_unsigned_type(typeof(x)) ? \
(x) > type_max(T) : \
is_unsigned_type(typeof(T)) ? \
(x) < 0 || (x) > type_max(T) : \
(x) < type_min(T) || (x) > type_max(T))
#define __overflows_type(x, T) ({ \
typeof(T) v = 0; \
check_add_overflow((x), v, &v); \
})
/**
* overflows_type - helper for checking whether a value overflows a variable
* or data type
*
* @n: source constant value or variable to be checked
* @T: destination variable or data type proposed to store @n
*
* Compares the @n expression for whether or not it can safely fit in
* the storage of the type in @T. @n and @T can have different types.
* If @n is a constant expression, this will also resolve to a constant
* expression.
*
* Returns: true if overflow can occur, false otherwise.
*/
#define overflows_type(n, T) \
__builtin_choose_expr(__is_constexpr(n), \
__overflows_type_constexpr(n, T), \
__overflows_type(n, T))
/**
* castable_to_type - like __same_type(), but also allows for casted literals
*
* @n: variable or constant value
* @T: variable or data type
*
* Unlike the __same_type() macro, this allows a constant value as the
* first argument. If this value would not overflow into an assignment
* of the second argument's type, it returns true. Otherwise, this falls
* back to __same_type().
*/
#define castable_to_type(n, T) \
__builtin_choose_expr(__is_constexpr(n), \
!__overflows_type_constexpr(n, T), \
__same_type(n, T))
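A userspace sketch of the non-constant-expression path: assigning @n into a zero of the destination type via __builtin_add_overflow() reveals whether it fits (the fits_in() name is invented, and note it returns the opposite sense of overflows_type()):

/* Sketch: "does this value fit in that type?" via the add-zero-and-check trick. */
#include <stdint.h>
#include <stdio.h>

#define fits_in(x, T) ({ T __v = 0; !__builtin_add_overflow((x), __v, &__v); })

int main(void)
{
	printf("%d\n", fits_in(200, int8_t));	/* 0: 200 > 127 */
	printf("%d\n", fits_in(200, uint8_t));	/* 1: fits */
	printf("%d\n", fits_in(-1, uint8_t));	/* 0: negative value into unsigned */
	return 0;
}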
/**
* size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX
* @factor1: first factor
* @factor2: second factor
*
* Returns: calculate @factor1 * @factor2, both promoted to size_t,
* with any overflow causing the return value to be SIZE_MAX. The
* lvalue must be size_t to avoid implicit type conversion.
*/
static inline size_t __must_check size_mul(size_t factor1, size_t factor2)
{
size_t bytes;
if (check_mul_overflow(factor1, factor2, &bytes))
return SIZE_MAX;
return bytes;
}
/**
* size_add() - Calculate size_t addition with saturation at SIZE_MAX
* @addend1: first addend
* @addend2: second addend
*
* Returns: calculate @addend1 + @addend2, both promoted to size_t,
* with any overflow causing the return value to be SIZE_MAX. The
* lvalue must be size_t to avoid implicit type conversion.
*/
static inline size_t __must_check size_add(size_t addend1, size_t addend2)
{
size_t bytes;
if (check_add_overflow(addend1, addend2, &bytes))
return SIZE_MAX;
return bytes;
}
/**
* size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX
* @minuend: value to subtract from
* @subtrahend: value to subtract from @minuend
*
* Returns: calculate @minuend - @subtrahend, both promoted to size_t,
* with any overflow causing the return value to be SIZE_MAX. For
* composition with the size_add() and size_mul() helpers, neither
* argument may be SIZE_MAX (or the result will be forced to SIZE_MAX).
* The lvalue must be size_t to avoid implicit type conversion.
*/
static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
{
size_t bytes;
if (minuend == SIZE_MAX || subtrahend == SIZE_MAX ||
check_sub_overflow(minuend, subtrahend, &bytes))
return SIZE_MAX;
return bytes;
}
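Sketch of how the saturating helpers compose in a size calculation (userspace stand-ins for size_mul()/size_add(); the point is that SIZE_MAX propagates through the whole expression):

/* Sketch: saturating size arithmetic -- any overflow pins the result at SIZE_MAX. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static size_t size_mul_sketch(size_t a, size_t b)
{
	size_t r;
	return __builtin_mul_overflow(a, b, &r) ? SIZE_MAX : r;
}

static size_t size_add_sketch(size_t a, size_t b)
{
	size_t r;
	return __builtin_add_overflow(a, b, &r) ? SIZE_MAX : r;
}

int main(void)
{
	size_t n = SIZE_MAX / 8;	/* deliberately huge element count */
	/* header + n * element_size: the multiply saturates, and the add keeps it there */
	size_t bytes = size_add_sketch(64, size_mul_sketch(n, 16));
	printf("%zu\n", bytes);		/* SIZE_MAX */
	return 0;
}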
/**
* array_size() - Calculate size of 2-dimensional array.
*
* @a: dimension one
* @b: dimension two
*
@ -259,19 +324,10 @@ static inline bool __must_check __must_check_overflow(bool overflow)
* Returns: number of bytes needed to represent the array or SIZE_MAX on
* overflow.
*/
static inline __must_check size_t array_size(size_t a, size_t b)
{
size_t bytes;
if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
return bytes;
}
#define array_size(a, b) size_mul(a, b)
/**
* array3_size() - Calculate size of 3-dimensional array.
*
* @a: dimension one
* @b: dimension two
* @c: dimension three
@ -281,54 +337,11 @@ static inline __must_check size_t array_size(size_t a, size_t b)
* Returns: number of bytes needed to represent the array or SIZE_MAX on
* overflow.
*/
static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
{
size_t bytes;
if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
if (check_mul_overflow(bytes, c, &bytes))
return SIZE_MAX;
return bytes;
}
/*
* Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
* struct_size() below.
*/
static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
{
size_t bytes;
if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
if (check_add_overflow(bytes, c, &bytes))
return SIZE_MAX;
return bytes;
}
/**
* struct_size() - Calculate size of structure with trailing array.
* @p: Pointer to the structure.
* @member: Name of the array member.
* @count: Number of elements in the array.
*
* Calculates size of memory needed for structure @p followed by an
* array of @count number of @member elements.
*
* Return: number of bytes needed or SIZE_MAX on overflow.
*/
#define struct_size(p, member, count) \
__ab_c_size(count, \
sizeof(*(p)->member) + __must_be_array((p)->member),\
sizeof(*(p)))
#define array3_size(a, b, c) size_mul(size_mul(a, b), c)
/**
* flex_array_size() - Calculate size of a flexible array member
* within an enclosing structure.
*
* @p: Pointer to the structure.
* @member: Name of the flexible array member.
* @count: Number of elements in the array.
@ -339,7 +352,92 @@ static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
* Return: number of bytes needed or SIZE_MAX on overflow.
*/
#define flex_array_size(p, member, count) \
array_size(count, \
sizeof(*(p)->member) + __must_be_array((p)->member))
__builtin_choose_expr(__is_constexpr(count), \
(count) * sizeof(*(p)->member) + __must_be_array((p)->member), \
size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member)))
/**
* struct_size() - Calculate size of structure with trailing flexible array.
* @p: Pointer to the structure.
* @member: Name of the array member.
* @count: Number of elements in the array.
*
* Calculates size of memory needed for structure of @p followed by an
* array of @count number of @member elements.
*
* Return: number of bytes needed or SIZE_MAX on overflow.
*/
#define struct_size(p, member, count) \
__builtin_choose_expr(__is_constexpr(count), \
sizeof(*(p)) + flex_array_size(p, member, count), \
size_add(sizeof(*(p)), flex_array_size(p, member, count)))
/**
* struct_size_t() - Calculate size of structure with trailing flexible array
* @type: structure type name.
* @member: Name of the array member.
* @count: Number of elements in the array.
*
* Calculates size of memory needed for structure @type followed by an
* array of @count number of @member elements. Prefer using struct_size()
* when possible instead, to keep calculations associated with a specific
* instance variable of type @type.
*
* Return: number of bytes needed or SIZE_MAX on overflow.
*/
#define struct_size_t(type, member, count) \
struct_size((type *)NULL, member, count)
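The typical consumer is an allocation for a struct with a trailing flexible array; a userspace sketch of the size struct_size() computes (struct sample and its field names are invented for illustration, and this plain expression wraps rather than saturating like the kernel macro):

/* Sketch: sizing "header + n trailing elements", which struct_size() encapsulates. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct sample {
	unsigned int nr;
	unsigned int data[];	/* flexible array member */
};

int main(void)
{
	unsigned int n = 8;
	size_t bytes = sizeof(struct sample) + (size_t)n * sizeof(unsigned int);
	struct sample *p = malloc(bytes);
	if (!p)
		return 1;
	p->nr = n;
	for (unsigned int i = 0; i < n; i++)
		p->data[i] = i;
	printf("%zu bytes for %u elements, last=%u\n", bytes, p->nr, p->data[n - 1]);
	free(p);
	return 0;
}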
/**
* _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
* Enables caller macro to pass (different) initializer.
*
* @type: structure type name, including "struct" keyword.
* @name: Name for a variable to define.
* @member: Name of the array member.
* @count: Number of elements in the array; must be compile-time const.
* @initializer: initializer expression (could be empty for no init).
*/
#define _DEFINE_FLEX(type, name, member, count, initializer...) \
_Static_assert(__builtin_constant_p(count), \
"onstack flex array members require compile-time const count"); \
union { \
u8 bytes[struct_size_t(type, member, count)]; \
type obj; \
} name##_u initializer; \
type *name = (type *)&name##_u
/**
* DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing
* flexible array member, when it does not have a __counted_by annotation.
*
* @type: structure type name, including "struct" keyword.
* @name: Name for a variable to define.
* @member: Name of the array member.
* @count: Number of elements in the array; must be compile-time const.
*
* Define a zeroed, on-stack, instance of @type structure with a trailing
* flexible array member.
* Use __struct_size(@name) to get compile-time size of it afterwards.
*/
#define DEFINE_RAW_FLEX(type, name, member, count) \
_DEFINE_FLEX(type, name, member, count, = {})
/**
* DEFINE_FLEX() - Define an on-stack instance of structure with a trailing
* flexible array member.
*
* @TYPE: structure type name, including "struct" keyword.
* @NAME: Name for a variable to define.
* @MEMBER: Name of the array member.
* @COUNTER: Name of the __counted_by member.
* @COUNT: Number of elements in the array; must be compile-time const.
*
* Define a zeroed, on-stack, instance of @TYPE structure with a trailing
* flexible array member.
* Use __struct_size(@NAME) to get compile-time size of it afterwards.
*/
#define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \
_DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, })
#endif /* __LINUX_OVERFLOW_H */
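A userspace sketch of the on-stack trick _DEFINE_FLEX() relies on: a union sized for the header plus a fixed element count, with a typed pointer aliasing it (struct msg and its fields are invented; the real macros additionally static-assert that the count is a compile-time constant):

/* Sketch: an on-stack instance of a flexible-array struct via a sized union. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg {
	uint8_t nr;
	uint8_t data[];		/* flexible array member */
};

int main(void)
{
	/* roughly what DEFINE_RAW_FLEX(struct msg, m, data, 4) expands to */
	union {
		uint8_t bytes[sizeof(struct msg) + 4 * sizeof(uint8_t)];
		struct msg obj;
	} m_u = {};
	struct msg *m = &m_u.obj;

	m->nr = 4;
	memcpy(m->data, "\x01\x02\x03\x04", 4);
	printf("nr=%u first=%u\n", (unsigned)m->nr, (unsigned)m->data[0]);
	return 0;
}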

View File

@ -216,6 +216,7 @@
#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "snapshot_types.h"
#include "time_stats.h"
#include "util.h"
@ -709,7 +710,7 @@ struct btree_transaction_stats {
unsigned nr_max_paths;
unsigned journal_entries_size;
unsigned max_mem;
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_trans_kmalloc_trace trans_kmalloc_trace;
#endif
char *max_paths_text;
@ -869,7 +870,7 @@ struct bch_fs {
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
struct work_struct snapshot_delete_work;
struct snapshot_delete snapshot_delete;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
snapshot_id_list snapshots_unlinked;
struct mutex snapshots_unlinked_lock;

View File

@ -695,7 +695,8 @@ struct bch_sb_field_ext {
x(stripe_backpointers, BCH_VERSION(1, 22)) \
x(stripe_lru, BCH_VERSION(1, 23)) \
x(casefolding, BCH_VERSION(1, 24)) \
x(extent_flags, BCH_VERSION(1, 25))
x(extent_flags, BCH_VERSION(1, 25)) \
x(snapshot_deletion_v2, BCH_VERSION(1, 26))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,

View File

@ -3089,7 +3089,7 @@ void bch2_trans_copy_iter(struct btree_trans *trans,
dst->key_cache_path = 0;
}
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
darray_trans_kmalloc_trace *trace)
{
@ -3112,7 +3112,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
void *p;
if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
struct printbuf buf = PRINTBUF;
bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
bch2_print_str(c, KERN_ERR, buf.buf);
@ -3127,7 +3127,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
struct btree_transaction_stats *s = btree_trans_stats(trans);
if (new_bytes > s->max_mem) {
mutex_lock(&s->lock);
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
trans->trans_kmalloc_trace.nr);
@ -3314,7 +3314,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
}
#endif
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
trans->trans_kmalloc_trace.nr = 0;
#endif
@ -3486,6 +3486,8 @@ void bch2_trans_put(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
#endif
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_exit(&trans->trans_kmalloc_trace);
#endif
@ -3642,7 +3644,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_exit(&s->trans_kmalloc_trace);
#endif
kfree(s->max_paths_text);

View File

@ -543,7 +543,7 @@ void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btre
void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
darray_trans_kmalloc_trace *);
#endif
@ -553,7 +553,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
unsigned long ip)
{
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_push(&trans->trans_kmalloc_trace,
((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
#endif

View File

@ -495,7 +495,7 @@ struct btree_trans {
void *mem;
unsigned mem_top;
unsigned mem_bytes;
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_trans_kmalloc_trace trans_kmalloc_trace;
#endif

View File

@ -760,7 +760,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
mutex_lock(&s->lock);
prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
#ifdef CONFIG_BCACHEFS_DEBUG
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
printbuf_indent_add(&i->buf, 2);
bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
printbuf_indent_sub(&i->buf, 2);

View File

@ -692,7 +692,7 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
vfs_d_type(d.v->d_type));
if (ret)
ctx->pos = d.k->p.offset + 1;
return ret;
return !ret;
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
@ -717,7 +717,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
if (ret2 > 0)
continue;
ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));
})));
bch2_bkey_buf_exit(&sk, c);

View File

@ -287,7 +287,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
struct bch_replicas_padded r;
union bch_replicas_padded r;
return accounting_to_replicas(&r.e, p)
? bch2_mark_replicas(c, &r.e)
: 0;
@ -361,7 +361,7 @@ err:
int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
{
struct bch_replicas_padded r;
union bch_replicas_padded r;
if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
@ -425,10 +425,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
percpu_down_read(&c->mark_lock);
darray_for_each(acc->k, i) {
struct {
union {
u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
BCH_BKEY_PTRS_MAX)];
struct bch_replicas_usage r;
u8 pad[BCH_BKEY_PTRS_MAX];
} u;
u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
if (!accounting_to_replicas(&u.r.r, i->pos))
continue;
@ -627,7 +629,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
switch (acc->type) {
case BCH_DISK_ACCOUNTING_replicas: {
struct bch_replicas_padded r;
union bch_replicas_padded r;
__accounting_to_replicas(&r.e, acc);
for (unsigned i = 0; i < r.e.nr_devs; i++)

View File

@ -86,35 +86,6 @@ err:
return ret;
}
void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
{
out->atomic++;
rcu_read_lock();
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
if (!g)
goto out;
for (unsigned i = 0; i < g->nr; i++) {
if (i)
prt_printf(out, " ");
if (g->entries[i].deleted) {
prt_printf(out, "[deleted]");
continue;
}
prt_printf(out, "[parent %d devs", g->entries[i].parent);
for_each_member_device_rcu(c, ca, &g->entries[i].devs)
prt_printf(out, " %s", ca->name);
prt_printf(out, "]");
}
out:
rcu_read_unlock();
out->atomic--;
}
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
struct bch_sb *sb,
struct bch_sb_field *f)
@ -241,17 +212,14 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g;
const struct bch_devs_mask *m;
bool ret;
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
m = g && t.group < g->nr && !g->entries[t.group].deleted
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
const struct bch_devs_mask *m =
g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
ret = m ? test_bit(dev, m->d) : false;
bool ret = m ? test_bit(dev, m->d) : false;
rcu_read_unlock();
return ret;
@ -377,54 +345,81 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
unsigned v)
{
struct bch_disk_groups_cpu *groups;
struct bch_disk_group_cpu *g;
unsigned nr = 0;
u16 path[32];
out->atomic++;
rcu_read_lock();
groups = rcu_dereference(c->disk_groups);
if (!groups)
goto invalid;
unsigned nr = 0;
while (1) {
if (nr == ARRAY_SIZE(path))
goto invalid;
if (v >= groups->nr)
if (v >= (g ? g->nr : 0))
goto invalid;
g = groups->entries + v;
struct bch_disk_group_cpu *e = g->entries + v;
if (g->deleted)
if (e->deleted)
goto invalid;
path[nr++] = v;
if (!g->parent)
if (!e->parent)
break;
v = g->parent - 1;
v = e->parent - 1;
}
while (nr) {
v = path[--nr];
g = groups->entries + v;
struct bch_disk_group_cpu *e = g->entries + path[--nr];
prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
if (nr)
prt_printf(out, ".");
}
out:
rcu_read_unlock();
out->atomic--;
return;
invalid:
prt_printf(out, "invalid label %u", v);
goto out;
}
void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
{
bch2_printbuf_make_room(out, 4096);
out->atomic++;
rcu_read_lock();
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
prt_printf(out, "%2u: ", i);
if (g->entries[i].deleted) {
prt_printf(out, "[deleted]");
goto next;
}
__bch2_disk_path_to_text(out, g, i);
prt_printf(out, " devs");
for_each_member_device_rcu(c, ca, &g->entries[i].devs)
prt_printf(out, " %s", ca->name);
next:
prt_newline(out);
}
rcu_read_unlock();
out->atomic--;
}
void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
out->atomic++;
rcu_read_lock();
__bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
rcu_read_unlock();
--out->atomic;
}
void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
@ -470,23 +465,22 @@ inval:
int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int ret, v = -1;
lockdep_assert_held(&c->sb_lock);
if (!strlen(name) || !strcmp(name, "none"))
return 0;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
if (v < 0)
return v;
if (!strlen(name) || !strcmp(name, "none")) {
struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_GROUP(mi, 0);
} else {
int v = bch2_disk_path_find_or_create(&c->disk_sb, name);
if (v < 0)
return v;
ret = bch2_sb_disk_groups_to_cpu(c);
if (ret)
return ret;
struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_GROUP(mi, v + 1);
}
mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_GROUP(mi, v + 1);
return 0;
return bch2_sb_disk_groups_to_cpu(c);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)

View File

@ -2223,10 +2223,10 @@ void bch2_fs_ec_stop(struct bch_fs *c)
static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
bool ret;
sched_annotate_sleep();
mutex_lock(&c->ec_stripe_new_lock);
ret = list_empty(&c->ec_stripe_new_list);
bool ret = list_empty(&c->ec_stripe_new_list);
mutex_unlock(&c->ec_stripe_new_lock);
return ret;

View File

@ -4,9 +4,10 @@
#include "bcachefs_format.h"
struct bch_replicas_padded {
union bch_replicas_padded {
u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
devs, BCH_BKEY_PTRS_MAX)];
struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};
struct stripe {
@ -28,7 +29,7 @@ struct gc_stripe {
u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
struct bch_replicas_padded r;
union bch_replicas_padded r;
};
#endif /* _BCACHEFS_EC_TYPES_H */

View File

@ -147,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c,
void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, s64 sectors)
{
bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
"inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
inode->ei_inode.bi_sectors);
if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
inode->ei_inode.bi_sectors);
bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
if (sectors < 0)
sectors = -inode->v.i_blocks;
else
sectors = 0;
}
inode->v.i_blocks += sectors;
#ifdef CONFIG_BCACHEFS_QUOTA
@ -244,7 +258,6 @@ out:
if (!ret)
ret = err;
bch_err_fn(c, ret);
return ret;
}
@ -506,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap,
goto err;
}
bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
!bch2_journal_error(&c->journal), c,
"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks,
inode->ei_inode.bi_sectors);
if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
!bch2_journal_error(&c->journal))) {
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
prt_printf(&buf,
"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks,
inode->ei_inode.bi_sectors);
bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:

View File

@ -785,12 +785,11 @@ static int ref_visible2(struct bch_fs *c,
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
(_i)->snapshot <= (_snapshot); _i++) \
if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
(_i)->inode.bi_snapshot <= (_snapshot); _i++) \
if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
u64 count;
u64 i_size;
};
@ -824,7 +823,6 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
return bch2_inode_unpack(inode, &u) ?:
darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
.snapshot = inode.k->p.snapshot,
}));
}
@ -864,47 +862,45 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
}
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
goto found;
return NULL;
found:
BUG_ON(k.k->p.snapshot > i->snapshot);
BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
struct printbuf buf = PRINTBUF;
int ret = 0;
new.snapshot = k.k->p.snapshot;
new.count = 0;
new.i_size = 0;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
trans, snapshot_key_missing_inode_snapshot,
"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
printbuf_exit(&buf);
w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
struct bch_inode_unpacked new = i->inode;
while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
--i;
new.bi_snapshot = k.k->p.snapshot;
size_t pos = i - w->inodes.data;
int ret = darray_insert_item(&w->inodes, pos, new);
if (ret)
return ERR_PTR(ret);
i = w->inodes.data + pos;
ret = __bch2_fsck_write_inode(trans, &new) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto fsck_err;
}
printbuf_exit(&buf);
return i;
fsck_err:
printbuf_exit(&buf);
return ERR_PTR(ret);
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
@ -919,7 +915,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
w->last_pos = k.k->p;
return lookup_inode_for_snapshot(trans->c, w, k);
return lookup_inode_for_snapshot(trans, w, k);
}
static int get_visible_inodes(struct btree_trans *trans,
@ -1496,21 +1492,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
if (i->inode.bi_sectors == i->count)
continue;
count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
if (w->recalculate_sums)
i->count = count2;
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
w->last_pos.inode, i->snapshot, i->count, count2);
w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
i->count = count2;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->last_pos.inode, i->snapshot,
w->last_pos.inode, i->inode.bi_snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
ret = bch2_fsck_write_inode(trans, &i->inode);
@ -1821,20 +1817,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
if (i->snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
if (i->inode.bi_snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n%s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct btree_iter iter2;
bch2_trans_copy_iter(trans, &iter2, iter);
bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot);
bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot);
ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_btree_delete_at(trans, &iter2,
BTREE_UPDATE_internal_snapshot_node);
@ -1856,8 +1852,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
if (i->snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
if (i->inode.bi_snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
i->count += k.k->size;
@ -1939,13 +1935,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
if (i->inode.bi_nlink == i->count)
continue;
count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
if (count2 < 0)
return count2;
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
w->last_pos.inode, i->snapshot, i->count, count2);
w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@ -1954,7 +1950,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
if (fsck_err_on(i->inode.bi_nlink != i->count,
trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)

View File

@ -1172,8 +1172,6 @@ retry_pick:
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
rbio->bounce = true;
async_object_list_add(c, rbio, rbio, &rbio->list_idx);
} else if (flags & BCH_READ_must_clone) {
/*
* Have to clone if there were any splits, due to error
@ -1187,8 +1185,6 @@ retry_pick:
&c->bio_read_split),
orig);
rbio->bio.bi_iter = iter;
async_object_list_add(c, rbio, rbio, &rbio->list_idx);
} else {
rbio = orig;
rbio->bio.bi_iter = iter;
@ -1219,6 +1215,8 @@ retry_pick:
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
async_object_list_add(c, rbio, rbio, &rbio->list_idx);
/* XXX: also nvme read recovery level */
if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
rbio->bio.bi_opf |= REQ_FUA;

View File

@ -256,10 +256,35 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
}
if (i_sectors_delta) {
s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
if (unlikely(bi_sectors + i_sectors_delta < 0)) {
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
extent_iter->pos.inode, bi_sectors, i_sectors_delta);
bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
if (i_sectors_delta < 0)
i_sectors_delta = -bi_sectors;
else
i_sectors_delta = 0;
}
le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
inode_update_flags = 0;
}
/*
* extents, dirents and xattrs updates require that an inode update also
* happens - to ensure that if a key exists in one of those btrees with
* a given snapshot ID an inode is also present - so we may have to skip
* the nojournal optimization:
*/
if (inode->k.p.snapshot != iter.snapshot) {
inode->k.p.snapshot = iter.snapshot;
inode_update_flags = 0;

View File

@ -1404,7 +1404,7 @@ int bch2_journal_read(struct bch_fs *c,
}
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
union bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_devs = 0,
.e.nr_required = 1,
@ -1632,7 +1632,7 @@ static CLOSURE_CALLBACK(journal_write_done)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
union bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
@ -1784,7 +1784,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
BCH_DEV_WRITE_REF_journal_write);
if (!ca) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
bch_err(c, "missing device %u for journal write", ptr->dev);
continue;
}
@ -2055,7 +2055,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
union bch_replicas_padded replicas;
unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]);
int ret;

View File

@ -955,7 +955,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
while (!ret) {
struct bch_replicas_padded replicas;
union bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
if (seq >= j->pin.back)

View File

@ -209,6 +209,7 @@ enum bch_fsck_flags {
x(subvol_to_missing_root, 188, 0) \
x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
x(bkey_in_missing_snapshot, 190, 0) \
x(bkey_in_deleted_snapshot, 315, 0) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
x(inode_alloc_cursor_inode_bad, 301, 0) \
@ -216,6 +217,7 @@ enum bch_fsck_flags {
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
x(inode_snapshot_mismatch, 196, 0) \
x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
x(inode_unlinked_and_not_open, 281, 0) \
@ -236,6 +238,9 @@ enum bch_fsck_flags {
x(inode_has_child_snapshots_wrong, 287, 0) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
@ -320,7 +325,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
x(MAX, 311, 0)
x(MAX, 316, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

View File

@ -139,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
bch2_sb_field_resize(disk_sb, members_v1, 0);
return 0;
}
mi1 = bch2_sb_field_resize(disk_sb, members_v1,
DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
disk_sb->sb->nr_devices, sizeof(u64)));

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
@ -212,7 +213,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
BCH_SNAPSHOT_SUBVOL(s.v),
BCH_SNAPSHOT_DELETED(s.v),
BCH_SNAPSHOT_WILL_DELETE(s.v),
le32_to_cpu(s.v->parent),
le32_to_cpu(s.v->children[0]),
le32_to_cpu(s.v->children[1]),
@ -313,7 +314,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
t->live = true;
t->state = !BCH_SNAPSHOT_DELETED(s.v)
? SNAPSHOT_ID_live
: SNAPSHOT_ID_deleted;
t->parent = le32_to_cpu(s.v->parent);
t->children[0] = le32_to_cpu(s.v->children[0]);
t->children[1] = le32_to_cpu(s.v->children[1]);
@ -338,7 +341,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
parent - id - 1 < IS_ANCESTOR_BITMAP)
__set_bit(parent - id - 1, t->is_ancestor);
if (BCH_SNAPSHOT_DELETED(s.v)) {
if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
@ -710,6 +713,9 @@ static int check_snapshot(struct btree_trans *trans,
memset(&s, 0, sizeof(s));
memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
if (BCH_SNAPSHOT_DELETED(&s))
return 0;
id = le32_to_cpu(s.parent);
if (id) {
ret = bch2_snapshot_lookup(trans, id, &v);
@ -747,7 +753,7 @@ static int check_snapshot(struct btree_trans *trans,
}
bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
!BCH_SNAPSHOT_WILL_DELETE(&s);
if (should_have_subvol) {
id = le32_to_cpu(s.subvol);
@ -997,7 +1003,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
snapshot_id_list_to_text(&buf, t);
darray_for_each(*t, id) {
if (fsck_err_on(!bch2_snapshot_exists(c, *id),
if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
@ -1022,22 +1028,38 @@ err:
return ret;
}
int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
int __bch2_check_key_has_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
/* Snapshot was definitively deleted, this error is marked autofix */
if (fsck_err_on(state == SNAPSHOT_ID_deleted,
trans, bkey_in_deleted_snapshot,
"key in deleted snapshot %s, delete?",
(bch2_btree_id_to_text(&buf, iter->btree_id),
prt_char(&buf, ' '),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
/*
* Snapshot missing: we should have caught this with btree_lost_data and
* kicked off reconstruct_snapshots, so if we end up here we have no
* idea what happened:
*/
if (fsck_err_on(state == SNAPSHOT_ID_empty,
trans, bkey_in_missing_snapshot,
"key in missing snapshot %s, delete?",
(bch2_btree_id_to_text(&buf, iter->btree_id),
prt_char(&buf, ' '),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
printbuf_exit(&buf);
return ret;
@ -1061,10 +1083,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
}
/* already deleted? */
if (BCH_SNAPSHOT_DELETED(&s->v))
if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
goto err;
SET_BCH_SNAPSHOT_DELETED(&s->v, true);
SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
s->v.subvol = 0;
err:
@ -1084,24 +1106,25 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
struct btree_iter iter, p_iter = {};
struct btree_iter c_iter = {};
struct btree_iter tree_iter = {};
struct bkey_s_c_snapshot s;
u32 parent_id, child_id;
unsigned i;
int ret = 0;
s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
BTREE_ITER_intent, snapshot);
ret = bkey_err(s);
struct bkey_i_snapshot *s =
bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
BTREE_ITER_intent, snapshot);
ret = PTR_ERR_OR_ZERO(s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"missing snapshot %u", id);
if (ret)
goto err;
BUG_ON(s.v->children[1]);
BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
BUG_ON(s->v.children[1]);
parent_id = le32_to_cpu(s.v->parent);
child_id = le32_to_cpu(s.v->children[0]);
parent_id = le32_to_cpu(s->v.parent);
child_id = le32_to_cpu(s->v.children[0]);
if (parent_id) {
struct bkey_i_snapshot *parent;
@ -1159,24 +1182,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
*/
struct bkey_i_snapshot_tree *s_t;
BUG_ON(s.v->children[1]);
BUG_ON(s->v.children[1]);
s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
0, snapshot_tree);
ret = PTR_ERR_OR_ZERO(s_t);
if (ret)
goto err;
if (s.v->children[0]) {
s_t->v.root_snapshot = s.v->children[0];
if (s->v.children[0]) {
s_t->v.root_snapshot = s->v.children[0];
} else {
s_t->k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&s_t->k, 0);
}
}
ret = bch2_btree_delete_at(trans, &iter, 0);
if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
SET_BCH_SNAPSHOT_DELETED(&s->v, true);
s->v.parent = 0;
s->v.children[0] = 0;
s->v.children[1] = 0;
s->v.subvol = 0;
s->v.tree = 0;
s->v.depth = 0;
s->v.skip[0] = 0;
s->v.skip[1] = 0;
s->v.skip[2] = 0;
} else {
s->k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&s->k, 0);
}
err:
bch2_trans_iter_exit(trans, &tree_iter);
bch2_trans_iter_exit(trans, &p_iter);
@ -1346,12 +1383,6 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
struct snapshot_interior_delete {
u32 id;
u32 live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
darray_for_each(*l, i)
@ -1385,28 +1416,28 @@ static unsigned __live_child(struct snapshot_table *t, u32 id,
return 0;
}
static unsigned live_child(struct bch_fs *c, u32 id,
snapshot_id_list *delete_leaves,
interior_delete_list *delete_interior)
static unsigned live_child(struct bch_fs *c, u32 id)
{
struct snapshot_delete *d = &c->snapshot_delete;
rcu_read_lock();
u32 ret = __live_child(rcu_dereference(c->snapshots), id,
delete_leaves, delete_interior);
&d->delete_leaves, &d->delete_interior);
rcu_read_unlock();
return ret;
}
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
snapshot_id_list *delete_leaves,
interior_delete_list *delete_interior)
struct bkey_s_c k)
{
if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
struct snapshot_delete *d = &trans->c->snapshot_delete;
if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
return bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
if (live_child) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
int ret = PTR_ERR_OR_ZERO(new);
@ -1437,46 +1468,70 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans,
return 0;
}
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
bch2_snapshot_tree(c, iter->pos.snapshot));
if (unlikely(ret)) {
struct bpos pos = iter->pos;
pos.snapshot = 0;
if (iter->btree_id != BTREE_ID_inodes)
pos.offset = U64_MAX;
bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
}
return ret;
}
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
* as deleted.
*/
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
snapshot_id_list *delete_leaves,
interior_delete_list *delete_interior)
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
if (k.k->type != KEY_TYPE_snapshot)
return 0;
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
unsigned live_children = 0;
if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
if (BCH_SNAPSHOT_DELETED(s.v))
return 0;
for (unsigned i = 0; i < 2; i++) {
u32 child = le32_to_cpu(s.v->children[i]);
live_children += child &&
!snapshot_list_has_id(delete_leaves, child);
!snapshot_list_has_id(&d->delete_leaves, child);
}
u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
if (live_children == 0) {
return snapshot_list_add(c, delete_leaves, s.k->p.offset);
return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
} else if (live_children == 1) {
struct snapshot_interior_delete d = {
struct snapshot_interior_delete n = {
.id = s.k->p.offset,
.live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
.live_child = live_child(c, s.k->p.offset),
};
if (!d.live_child) {
bch_err(c, "error finding live child of snapshot %u", d.id);
if (!n.live_child) {
bch_err(c, "error finding live child of snapshot %u", n.id);
return -EINVAL;
}
return darray_push(delete_interior, d);
return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
darray_push(&d->delete_interior, n);
} else {
return 0;
}
@ -1508,6 +1563,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct bkey_i_snapshot *s;
int ret;
if (!bch2_snapshot_exists(c, k.k->p.offset))
return 0;
if (k.k->type != KEY_TYPE_snapshot)
return 0;
@ -1555,39 +1613,52 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &s->k_i, 0);
}
static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
{
prt_printf(out, "deleting from trees");
darray_for_each(d->deleting_from_trees, i)
prt_printf(out, " %u", *i);
prt_printf(out, "deleting leaves");
darray_for_each(d->delete_leaves, i)
prt_printf(out, " %u", *i);
prt_printf(out, " interior");
darray_for_each(d->delete_interior, i)
prt_printf(out, " %u->%u", i->id, i->live_child);
}
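
This formatter now backs both the journal log message emitted below and the new sysfs status file. A minimal sketch of driving it directly, using the same printbuf pattern as the call sites (the snapshot IDs are made up):

struct snapshot_delete *d = &c->snapshot_delete;
struct printbuf buf = PRINTBUF;

mutex_lock(&d->lock);
bch2_snapshot_delete_nodes_to_text(&buf, d);
mutex_unlock(&d->lock);

/*
 * With delete_leaves = {7, 9} and delete_interior = {{ .id = 5, .live_child = 6 }},
 * buf.buf ends with roughly "deleting leaves 7 9 interior 5->6" (exact spacing
 * follows the prt_printf() calls above).
 */
printbuf_exit(&buf);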
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
struct btree_trans *trans = bch2_trans_get(c);
snapshot_id_list delete_leaves = {};
interior_delete_list delete_interior = {};
struct snapshot_delete *d = &c->snapshot_delete;
int ret = 0;
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
mutex_lock(&d->lock);
d->running = true;
d->pos = BBPOS_MIN;
ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
check_should_delete_snapshot(trans, k));
mutex_unlock(&d->lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err;
if (!delete_leaves.nr && !delete_interior.nr)
if (!d->delete_leaves.nr && !d->delete_interior.nr)
goto err;
{
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "deleting leaves");
darray_for_each(delete_leaves, i)
prt_printf(&buf, " %u", *i);
prt_printf(&buf, " interior");
darray_for_each(delete_interior, i)
prt_printf(&buf, " %u->%u", i->id, i->live_child);
bch2_snapshot_delete_nodes_to_text(&buf, d);
ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
printbuf_exit(&buf);
@ -1595,19 +1666,25 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
struct disk_reservation res = { 0 };
if (!btree_type_has_snapshots(btree))
d->pos.pos = POS_MIN;
if (!btree_type_has_snapshots(d->pos.btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
btree, POS_MIN,
d->pos.btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
delete_dead_snapshots_process_key(trans, &iter, k,
&delete_leaves,
&delete_interior));
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
@ -1617,7 +1694,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
darray_for_each(delete_leaves, i) {
darray_for_each(d->delete_leaves, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
if (!bch2_err_matches(ret, EROFS))
@ -1634,11 +1711,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
if (ret)
goto err;
darray_for_each(delete_interior, i) {
darray_for_each(d->delete_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, i->id));
if (!bch2_err_matches(ret, EROFS))
@ -1647,8 +1724,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
err:
darray_exit(&delete_interior);
darray_exit(&delete_leaves);
mutex_lock(&d->lock);
darray_exit(&d->deleting_from_trees);
darray_exit(&d->delete_interior);
darray_exit(&d->delete_leaves);
d->running = false;
mutex_unlock(&d->lock);
bch2_trans_put(trans);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
@ -1657,7 +1738,7 @@ err:
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
@ -1672,10 +1753,27 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c)
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work))
enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}
void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
{
struct snapshot_delete *d = &c->snapshot_delete;
if (!d->running) {
prt_str(out, "(not running)");
return;
}
mutex_lock(&d->lock);
bch2_snapshot_delete_nodes_to_text(out, d);
prt_newline(out);
mutex_unlock(&d->lock);
bch2_bbpos_to_text(out, d->pos);
}
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
enum btree_id id,
struct bpos pos)
@ -1714,7 +1812,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
return 0;
struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
if (BCH_SNAPSHOT_DELETED(snap.v) ||
if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
interior_snapshot_needs_delete(snap))
set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
@ -1750,3 +1848,10 @@ void bch2_fs_snapshots_exit(struct bch_fs *c)
{
kvfree(rcu_dereference_protected(c->snapshots, true));
}
void bch2_fs_snapshots_init_early(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
mutex_init(&c->snapshot_delete.lock);
mutex_init(&c->snapshots_unlinked_lock);
}

View File

@ -120,19 +120,24 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
return id;
}
static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
const struct snapshot_t *s = snapshot_t(c, id);
return s ? s->live : 0;
return s ? s->state : SNAPSHOT_ID_empty;
}
static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
rcu_read_lock();
enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id);
rcu_read_unlock();
return ret;
}
static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
rcu_read_lock();
bool ret = __bch2_snapshot_exists(c, id);
rcu_read_unlock();
return ret;
return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
}
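
A small sketch of distinguishing the three states a caller can now see (c and id as in the helpers above; the enum comes from snapshot_types.h in this patch, the per-branch comments are interpretation rather than anything the patch states):

switch (bch2_snapshot_id_state(c, id)) {
case SNAPSHOT_ID_empty:
	/* never allocated, or out of range - a key carrying this ID is orphaned */
	break;
case SNAPSHOT_ID_live:
	/* normal case; the only state where bch2_snapshot_exists() returns true */
	break;
case SNAPSHOT_ID_deleted:
	/* node is being (or has been) torn down by snapshot deletion */
	break;
}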
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
@ -241,10 +246,19 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
? 0
: __bch2_check_key_has_snapshot(trans, iter, k);
}
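
The split keeps the hot path inline: when the key's snapshot ID is live - by far the common case - the check costs a single RCU-protected table lookup and no function call; only keys whose snapshot ID is not live fall through to the out-of-line __bch2_check_key_has_snapshot() repair path.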
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
@ -259,7 +273,13 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
}
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_work(struct work_struct *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
int bch2_snapshots_read(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
void bch2_fs_snapshots_init_early(struct bch_fs *);
#endif /* _BCACHEFS_SNAPSHOT_H */

View File

@ -15,10 +15,10 @@ struct bch_snapshot {
bch_le128 btime;
};
LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
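
The single DELETED flag becomes two, which reads as a two-phase scheme; note that WILL_DELETE takes over the old flag's bit, so nodes written by older versions presumably read as "will delete". The phase boundaries below are interpretation, and the SET_* setters are the ones LE32_BITMASK conventionally generates next to these getters, not shown in this hunk. Given a struct bkey_s_c k of type KEY_TYPE_snapshot:

struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

if (BCH_SNAPSHOT_DELETED(s.v)) {
	/* phase 2: node fully processed, keys referencing it are gone */
} else if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
	/* phase 1: queued for deletion (SET_BCH_SNAPSHOT_WILL_DELETE()); keys may still exist */
}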
/*
* Snapshot trees:

View File

@ -0,0 +1,56 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
#define _BCACHEFS_SNAPSHOT_TYPES_H
#include "bbpos_types.h"
#include "darray.h"
#include "subvolume_types.h"
typedef DARRAY(u32) snapshot_id_list;
#define IS_ANCESTOR_BITMAP 128
struct snapshot_t {
enum snapshot_id_state {
SNAPSHOT_ID_empty,
SNAPSHOT_ID_live,
SNAPSHOT_ID_deleted,
} state;
u32 parent;
u32 skip[3];
u32 depth;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 tree;
unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
};
struct snapshot_table {
struct rcu_head rcu;
size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
struct snapshot_t s[0];
#endif
};
struct snapshot_interior_delete {
u32 id;
u32 live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
struct snapshot_delete {
struct work_struct work;
struct mutex lock;
snapshot_id_list deleting_from_trees;
snapshot_id_list delete_leaves;
interior_delete_list delete_interior;
bool running;
struct bbpos pos;
};
#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
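
Both list types are plain darrays, so the helpers already used in snapshot.c apply directly. A short sketch of building and querying the state (IDs made up, error handling trimmed; the real struct lives in bch_fs and is set up in bch2_fs_snapshots_init_early(), so d below is a local purely for illustration):

struct snapshot_delete d = {};
struct snapshot_interior_delete n = { .id = 5, .live_child = 6 };

int ret = snapshot_list_add_nodup(c, &d.deleting_from_trees, 1) ?:
	  snapshot_list_add(c, &d.delete_leaves, 7) ?:
	  darray_push(&d.delete_interior, n);		/* check ret in real code */

bool is_leaf  = snapshot_list_has_id(&d.delete_leaves, 7);	/* true: keys at 7 get deleted */
u32 new_child = interior_delete_has_id(&d.delete_interior, 5);	/* 6: keys at 5 re-point to 6 */

darray_exit(&d.deleting_from_trees);
darray_exit(&d.delete_leaves);
darray_exit(&d.delete_interior);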

View File

@ -730,8 +730,6 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
void bch2_fs_subvolumes_init_early(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
bch2_subvolume_wait_for_pagecache_and_delete);
mutex_init(&c->snapshots_unlinked_lock);
}

View File

@ -77,9 +77,6 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btr
_end, _subvolid, _flags, _k, _do); \
})
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);

View File

@ -2,33 +2,6 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
#include "darray.h"
typedef DARRAY(u32) snapshot_id_list;
#define IS_ANCESTOR_BITMAP 128
struct snapshot_t {
bool live;
u32 parent;
u32 skip[3];
u32 depth;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 tree;
unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
};
struct snapshot_table {
struct rcu_head rcu;
size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
struct snapshot_t s[0];
#endif
};
typedef struct {
/* we can't have padding in this struct: */
u64 subvol;

View File

@ -864,6 +864,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
bch2_fs_quota_init(c);
bch2_fs_rebalance_init(c);
bch2_fs_sb_errors_init_early(c);
bch2_fs_snapshots_init_early(c);
bch2_fs_subvolumes_init_early(c);
INIT_LIST_HEAD(&c->list);
@ -1488,7 +1489,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
{
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
if (!ca->name[0])
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
@ -1540,6 +1543,11 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (ret)
return ret;
struct printbuf name = PRINTBUF;
prt_bdevname(&name, sb->bdev);
strscpy(ca->name, name.buf, sizeof(ca->name));
printbuf_exit(&name);
/* Commit: */
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
@ -1581,11 +1589,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
bch2_dev_sysfs_online(c, ca);
struct printbuf name = PRINTBUF;
prt_bdevname(&name, ca->disk_sb.bdev);
strscpy(ca->name, name.buf, sizeof(ca->name));
printbuf_exit(&name);
bch2_rebalance_wakeup(c);
return 0;
}

View File

@ -147,8 +147,9 @@ write_attribute(trigger_journal_flush);
write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_recalc_capacity);
read_attribute(gc_gens_pos);
__sysfs_attribute(read_fua_test, 0400);
@ -199,6 +200,7 @@ read_attribute(copy_gc_wait);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
read_attribute(snapshot_delete_status);
read_attribute(new_stripes);
@ -431,6 +433,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
if (attr == &sysfs_snapshot_delete_status)
bch2_snapshot_delete_status_to_text(out, c);
/* Debugging: */
if (attr == &sysfs_journal_debug)
@ -540,6 +545,12 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_freelist_wakeup)
closure_wake_up(&c->freelist_wait);
if (attr == &sysfs_trigger_recalc_capacity) {
down_read(&c->state_lock);
bch2_recalc_capacity(c);
up_read(&c->state_lock);
}
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@ -571,6 +582,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_write_stats,
&sysfs_rebalance_status,
&sysfs_snapshot_delete_status,
&sysfs_compression_stats,
@ -665,8 +677,9 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_btree_updates,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_recalc_capacity,
&sysfs_gc_gens_pos,

View File

@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
return bch2_xattr_hash(info,
&X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
&X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
}
static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
return l.v->x_type != r->type ||
l.v->x_name_len != r->name.len ||
memcmp(l.v->x_name, r->name.name, r->name.len);
memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
}
static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
return l.v->x_type != r.v->x_type ||
l.v->x_name_len != r.v->x_name_len ||
memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
}
const struct bch_hash_desc bch2_xattr_hash_desc = {
@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
c, xattr_invalid_type,
"invalid type (%u)", xattr.v->x_type);
bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
c, xattr_name_invalid_chars,
"xattr name has invalid characters");
fsck_err:
@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
unsigned name_len = xattr.v->x_name_len;
unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
offsetof(struct bch_xattr, x_name);
offsetof(struct bch_xattr, x_name_and_value);
val_len = min_t(int, val_len, max_name_val_bytes - name_len);
name_len = min(name_len, max_name_val_bytes);
prt_printf(out, "%.*s:%.*s",
name_len, xattr.v->x_name,
name_len, xattr.v->x_name_and_value,
val_len, (char *) xattr_val(xattr.v));
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
if (ret)
return ret;
/*
* Besides the ctime update, extents, dirents and xattrs updates require
* that an inode update also happens - to ensure that if a key exists in
* one of those btrees with a given snapshot ID an inode is also present
*/
inode_u->bi_ctime = bch2_current_time(c);
ret = bch2_inode_write(trans, &inode_iter, inode_u);
@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
xattr->v.x_type = type;
xattr->v.x_name_len = namelen;
xattr->v.x_val_len = cpu_to_le16(size);
memcpy(xattr->v.x_name, name, namelen);
memcpy(xattr->v.x_name_and_value, name, namelen);
memcpy(xattr_val(&xattr->v), value, size);
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry,
if (!prefix)
return 0;
return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,

View File

@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
name_len + val_len, sizeof(u64));
}
#define xattr_val(_xattr) \
((void *) (_xattr)->x_name + (_xattr)->x_name_len)
((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
struct xattr_search_key {
u8 type;

View File

@ -13,7 +13,13 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
__u8 x_name[] __counted_by(x_name_len);
/*
* x_name contains the name and value counted by
* x_name_len + x_val_len. The introduction of
* __counted_by(x_name_len) previously caused a false positive
* detection of an out of bounds write.
*/
__u8 x_name_and_value[];
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */
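
The rename makes the layout explicit: the flexible array holds the name immediately followed by the value, which is exactly why __counted_by(x_name_len) tripped the bounds checker. A reading-side sketch using the helpers from xattr.h above, given a const struct bch_xattr *x:

/*
 * [ struct bch_xattr header | name: x_name_len bytes | value: x_val_len bytes ]
 */
const char *name = (const char *) x->x_name_and_value;	/* not NUL-terminated */
const void *val  = xattr_val(x);			/* == x_name_and_value + x_name_len */
unsigned    u64s = xattr_val_u64s(x->x_name_len,	/* size of the whole bkey value, in u64s */
				  le16_to_cpu(x->x_val_len));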