Update bcachefs sources to e48731a188 bcachefs: Fix BTREE_TRIGGER_WANTS_OLD_AND_NEW

Kent Overstreet 2022-03-13 19:14:01 -04:00
parent bf7924f552
commit d34e731082
36 changed files with 1273 additions and 303 deletions


@ -1 +1 @@
e318fabeb424d4b8fdd46329125c30aaa4f9006a
e48731a188639563444d475622782b7963df4b47


@ -1,10 +1,447 @@
/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
* Copyright (c) Yann Collet, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of https://github.com/facebook/zstd) and
* the GPLv2 (found in the COPYING file in the root directory of
* https://github.com/facebook/zstd). You may select, at your option, one of the
* above-listed licenses.
*/
#ifndef LINUX_ZSTD_H
#define LINUX_ZSTD_H
/**
* This is a kernel-style API that wraps the upstream zstd API, which cannot be
* used directly because the symbols aren't exported. It exposes the minimal
* functionality which is currently required by users of zstd in the kernel.
* Expose extra functions from lib/zstd/zstd.h as needed.
*/
/* ====== Dependency ====== */
#include <linux/types.h>
#include <zstd.h>
#include <linux/zstd_errors.h>
#define ZSTD_initDCtx(w, s) ZSTD_initStaticDCtx(w, s)
#define ZSTD_initCCtx(w, s) ZSTD_initStaticCCtx(w, s)
/* ====== Helper Functions ====== */
/**
* zstd_compress_bound() - maximum compressed size in worst case scenario
* @src_size: The size of the data to compress.
*
* Return: The maximum compressed size in the worst case scenario.
*/
size_t zstd_compress_bound(size_t src_size);
#define ZSTD_compressCCtx(w, dst, d_len, src, src_len, params) \
ZSTD_compressCCtx(w, dst, d_len, src, src_len, 0)
/**
* zstd_is_error() - tells if a size_t function result is an error code
* @code: The function result to check for error.
*
* Return: Non-zero iff the code is an error.
*/
unsigned int zstd_is_error(size_t code);
#define ZSTD_CCtxWorkspaceBound(p) ZSTD_estimateCCtxSize(0)
#define ZSTD_DCtxWorkspaceBound() ZSTD_estimateDCtxSize()
/**
* enum zstd_error_code - zstd error codes
*/
typedef ZSTD_ErrorCode zstd_error_code;
/**
* zstd_get_error_code() - translates an error function result to an error code
* @code: The function result for which zstd_is_error(code) is true.
*
* Return: A unique error code for this error.
*/
zstd_error_code zstd_get_error_code(size_t code);
/**
* zstd_get_error_name() - translates an error function result to a string
* @code: The function result for which zstd_is_error(code) is true.
*
* Return: An error string corresponding to the error code.
*/
const char *zstd_get_error_name(size_t code);
/**
* zstd_min_clevel() - minimum allowed compression level
*
* Return: The minimum allowed compression level.
*/
int zstd_min_clevel(void);
/**
* zstd_max_clevel() - maximum allowed compression level
*
* Return: The maximum allowed compression level.
*/
int zstd_max_clevel(void);
/* ====== Parameter Selection ====== */
/**
* enum zstd_strategy - zstd compression search strategy
*
* From faster to stronger. See zstd_lib.h.
*/
typedef ZSTD_strategy zstd_strategy;
/**
* struct zstd_compression_parameters - zstd compression parameters
* @windowLog: Log of the largest match distance. Larger means more
* compression, and more memory needed during decompression.
* @chainLog: Fully searched segment. Larger means more compression,
* slower, and more memory (useless for fast).
* @hashLog: Dispatch table. Larger means more compression,
* slower, and more memory.
* @searchLog: Number of searches. Larger means more compression and slower.
* @searchLength: Match length searched. Larger means faster decompression,
* sometimes less compression.
* @targetLength: Acceptable match size for optimal parser (only). Larger means
* more compression, and slower.
* @strategy: The zstd compression strategy.
*
* See zstd_lib.h.
*/
typedef ZSTD_compressionParameters zstd_compression_parameters;
/**
* struct zstd_frame_parameters - zstd frame parameters
* @contentSizeFlag: Controls whether content size will be present in the
* frame header (when known).
* @checksumFlag: Controls whether a 32-bit checksum is generated at the
* end of the frame for error detection.
* @noDictIDFlag: Controls whether dictID will be saved into the frame
* header when using dictionary compression.
*
* The default value is all fields set to 0. See zstd_lib.h.
*/
typedef ZSTD_frameParameters zstd_frame_parameters;
/**
* struct zstd_parameters - zstd parameters
* @cParams: The compression parameters.
* @fParams: The frame parameters.
*/
typedef ZSTD_parameters zstd_parameters;
/**
* zstd_get_params() - returns zstd_parameters for selected level
* @level: The compression level
* @estimated_src_size: The estimated source size to compress or 0
* if unknown.
*
* Return: The selected zstd_parameters.
*/
zstd_parameters zstd_get_params(int level,
unsigned long long estimated_src_size);
/* ====== Single-pass Compression ====== */
typedef ZSTD_CCtx zstd_cctx;
/**
* zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx
* @parameters: The compression parameters to be used.
*
* If multiple compression parameters might be used, the caller must call
* zstd_cctx_workspace_bound() for each set of parameters and use the maximum
* size.
*
* Return: A lower bound on the size of the workspace that is passed to
* zstd_init_cctx().
*/
size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters);
/**
* zstd_init_cctx() - initialize a zstd compression context
* @workspace: The workspace to emplace the context into. It must outlive
* the returned context.
* @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to
* determine how large the workspace must be.
*
* Return: A zstd compression context or NULL on error.
*/
zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
/**
* zstd_compress_cctx() - compress src into dst with the initialized parameters
* @cctx: The context. Must have been initialized with zstd_init_cctx().
* @dst: The buffer to compress src into.
* @dst_capacity: The size of the destination buffer. May be any size, but
* zstd_compress_bound(src_size) is guaranteed to be large enough.
* @src: The data to compress.
* @src_size: The size of the data to compress.
* @parameters: The compression parameters to be used.
*
* Return: The compressed size or an error, which can be checked using
* zstd_is_error().
*/
size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
const void *src, size_t src_size, const zstd_parameters *parameters);
/* ====== Single-pass Decompression ====== */
typedef ZSTD_DCtx zstd_dctx;
/**
* zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx
*
* Return: A lower bound on the size of the workspace that is passed to
* zstd_init_dctx().
*/
size_t zstd_dctx_workspace_bound(void);
/**
* zstd_init_dctx() - initialize a zstd decompression context
* @workspace: The workspace to emplace the context into. It must outlive
* the returned context.
* @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to
* determine how large the workspace must be.
*
* Return: A zstd decompression context or NULL on error.
*/
zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
/**
* zstd_decompress_dctx() - decompress zstd compressed src into dst
* @dctx: The decompression context.
* @dst: The buffer to decompress src into.
* @dst_capacity: The size of the destination buffer. Must be at least as large
* as the decompressed size. If the caller cannot upper bound the
* decompressed size, then it's better to use the streaming API.
* @src: The zstd compressed data to decompress. Multiple concatenated
* frames and skippable frames are allowed.
* @src_size: The exact size of the data to decompress.
*
* Return: The decompressed size or an error, which can be checked using
* zstd_is_error().
*/
size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
const void *src, size_t src_size);
/* ====== Streaming Buffers ====== */
/**
* struct zstd_in_buffer - input buffer for streaming
* @src: Start of the input buffer.
* @size: Size of the input buffer.
* @pos: Position where reading stopped. Will be updated.
* Necessarily 0 <= pos <= size.
*
* See zstd_lib.h.
*/
typedef ZSTD_inBuffer zstd_in_buffer;
/**
* struct zstd_out_buffer - output buffer for streaming
* @dst: Start of the output buffer.
* @size: Size of the output buffer.
* @pos: Position where writing stopped. Will be updated.
* Necessarily 0 <= pos <= size.
*
* See zstd_lib.h.
*/
typedef ZSTD_outBuffer zstd_out_buffer;
/* ====== Streaming Compression ====== */
typedef ZSTD_CStream zstd_cstream;
/**
* zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream
* @cparams: The compression parameters to be used for compression.
*
* Return: A lower bound on the size of the workspace that is passed to
* zstd_init_cstream().
*/
size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams);
/**
* zstd_init_cstream() - initialize a zstd streaming compression context
* @parameters: The zstd parameters to use for compression.
* @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller
* must pass the source size (zero means empty source).
* Otherwise, the caller may optionally pass the source
* size, or zero if unknown.
* @workspace: The workspace to emplace the context into. It must outlive
* the returned context.
* @workspace_size: The size of workspace.
* Use zstd_cstream_workspace_bound(params->cparams) to
* determine how large the workspace must be.
*
* Return: The zstd streaming compression context or NULL on error.
*/
zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
unsigned long long pledged_src_size, void *workspace, size_t workspace_size);
/**
* zstd_reset_cstream() - reset the context using parameters from creation
* @cstream: The zstd streaming compression context to reset.
* @pledged_src_size: Optionally the source size, or zero if unknown.
*
* Resets the context using the parameters from creation. Skips dictionary
* loading, since it can be reused. If `pledged_src_size` is non-zero the frame
* content size is always written into the frame header.
*
* Return: Zero or an error, which can be checked using
* zstd_is_error().
*/
size_t zstd_reset_cstream(zstd_cstream *cstream,
unsigned long long pledged_src_size);
/**
* zstd_compress_stream() - streaming compress some of input into output
* @cstream: The zstd streaming compression context.
* @output: Destination buffer. `output->pos` is updated to indicate how much
* compressed data was written.
* @input: Source buffer. `input->pos` is updated to indicate how much data
* was read. Note that it may not consume the entire input, in which
* case `input->pos < input->size`, and it's up to the caller to
* present remaining data again.
*
* The `input` and `output` buffers may be any size. Guaranteed to make some
* forward progress if `input` and `output` are not empty.
*
* Return: A hint for the number of bytes to use as the input for the next
* function call or an error, which can be checked using
* zstd_is_error().
*/
size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
zstd_in_buffer *input);
/**
* zstd_flush_stream() - flush internal buffers into output
* @cstream: The zstd streaming compression context.
* @output: Destination buffer. `output->pos` is updated to indicate how much
* compressed data was written.
*
* zstd_flush_stream() must be called until it returns 0, meaning all the data
* has been flushed. Since zstd_flush_stream() causes a block to be ended,
* calling it too often will degrade the compression ratio.
*
* Return: The number of bytes still present within internal buffers or an
* error, which can be checked using zstd_is_error().
*/
size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output);
/**
* zstd_end_stream() - flush internal buffers into output and end the frame
* @cstream: The zstd streaming compression context.
* @output: Destination buffer. `output->pos` is updated to indicate how much
* compressed data was written.
*
* zstd_end_stream() must be called until it returns 0, meaning all the data has
* been flushed and the frame epilogue has been written.
*
* Return: The number of bytes still present within internal buffers or an
* error, which can be checked using zstd_is_error().
*/
size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output);
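/*
 * Illustrative sketch (an editorial addition, not part of this commit):
 * the compress-then-end loop implied by the comments above. Feed input
 * with zstd_compress_stream(), then call zstd_end_stream() until it
 * returns 0. Refilling a full output buffer and the -EIO error returns
 * are assumptions left to the caller.
 *
 *	zstd_in_buffer in = { src, src_size, 0 };
 *	zstd_out_buffer out = { dst, dst_capacity, 0 };
 *	size_t ret;
 *
 *	while (in.pos < in.size) {
 *		ret = zstd_compress_stream(cstream, &out, &in);
 *		if (zstd_is_error(ret))
 *			return -EIO;
 *	}
 *	do {
 *		ret = zstd_end_stream(cstream, &out);
 *		if (zstd_is_error(ret))
 *			return -EIO;
 *	} while (ret != 0);
 */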
/* ====== Streaming Decompression ====== */
typedef ZSTD_DStream zstd_dstream;
/**
* zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream
* @max_window_size: The maximum window size allowed for compressed frames.
*
* Return: A lower bound on the size of the workspace that is passed
* to zstd_init_dstream().
*/
size_t zstd_dstream_workspace_bound(size_t max_window_size);
/**
* zstd_init_dstream() - initialize a zstd streaming decompression context
* @max_window_size: The maximum window size allowed for compressed frames.
* @workspace: The workspace to emplace the context into. It must outlive
* the returned context.
* @workspace_size: The size of workspace.
* Use zstd_dstream_workspace_bound(max_window_size) to
* determine how large the workspace must be.
*
* Return: The zstd streaming decompression context.
*/
zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
size_t workspace_size);
/**
* zstd_reset_dstream() - reset the context using parameters from creation
* @dstream: The zstd streaming decompression context to reset.
*
* Resets the context using the parameters from creation. Skips dictionary
* loading, since it can be reused.
*
* Return: Zero or an error, which can be checked using zstd_is_error().
*/
size_t zstd_reset_dstream(zstd_dstream *dstream);
/**
* zstd_decompress_stream() - streaming decompress some of input into output
* @dstream: The zstd streaming decompression context.
* @output: Destination buffer. `output.pos` is updated to indicate how much
* decompressed data was written.
* @input: Source buffer. `input.pos` is updated to indicate how much data was
* read. Note that it may not consume the entire input, in which case
* `input.pos < input.size`, and it's up to the caller to present
* remaining data again.
*
* The `input` and `output` buffers may be any size. Guaranteed to make some
* forward progress if `input` and `output` are not empty.
* zstd_decompress_stream() will not consume the last byte of the frame until
* the entire frame is flushed.
*
* Return: Returns 0 iff a frame is completely decoded and fully flushed.
* Otherwise returns a hint for the number of bytes to use as the
* input for the next function call or an error, which can be checked
* using zstd_is_error(). The size hint will never request more than
* the remainder of the frame.
*/
size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
zstd_in_buffer *input);
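/*
 * Illustrative sketch (an editorial addition, not part of this commit):
 * decompressing one frame with zstd_decompress_stream(). Per the
 * comment above, a return of 0 means the frame is decoded and flushed.
 * The buffers and the -EIO error return are assumptions.
 *
 *	zstd_in_buffer in = { src, src_size, 0 };
 *	zstd_out_buffer out = { dst, dst_capacity, 0 };
 *
 *	while (in.pos < in.size) {
 *		size_t ret = zstd_decompress_stream(dstream, &out, &in);
 *
 *		if (zstd_is_error(ret))
 *			return -EIO;
 *		if (ret == 0)
 *			break;
 *	}
 */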
/* ====== Frame Inspection Functions ====== */
/**
* zstd_find_frame_compressed_size() - returns the size of a compressed frame
* @src: Source buffer. It should point to the start of a zstd encoded
* frame or a skippable frame.
* @src_size: The size of the source buffer. It must be at least as large as the
* size of the frame.
*
* Return: The compressed size of the frame pointed to by `src` or an error,
* which can be checked using zstd_is_error().
* Suitable to pass to ZSTD_decompress() or similar functions.
*/
size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);
/**
* struct zstd_frame_params - zstd frame parameters stored in the frame header
* @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not
* present.
* @windowSize: The window size, or 0 if the frame is a skippable frame.
* @blockSizeMax: The maximum block size.
* @frameType: The frame type (zstd or skippable)
* @headerSize: The size of the frame header.
* @dictID: The dictionary id, or 0 if not present.
* @checksumFlag: Whether a checksum was used.
*
* See zstd_lib.h.
*/
typedef ZSTD_frameHeader zstd_frame_header;
/**
* zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
* @params: On success the frame parameters are written here.
* @src: The source buffer. It must point to a zstd or skippable frame.
* @src_size: The size of the source buffer.
*
* Return: 0 on success. If more data is required it returns how many bytes
* must be provided to make forward progress. Otherwise it returns
* an error, which can be checked using zstd_is_error().
*/
size_t zstd_get_frame_header(zstd_frame_header *params, const void *src,
size_t src_size);
#endif /* LINUX_ZSTD_H */
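The wrapper above is exercised by the compress.c hunks later in this commit. As a standalone reference, here is a minimal single-pass round trip through the declared API; the kvmalloc() allocation, the compression level of 3, and the error returns are illustrative assumptions, not part of the commit:

	/* illustrative only: compress src into dst and decompress it back */
	static int zstd_roundtrip_example(const void *src, size_t src_size,
					  void *dst, size_t dst_capacity)
	{
		zstd_parameters params = zstd_get_params(3, src_size);
		size_t bound = zstd_compress_bound(src_size);
		size_t c_wksp_size = zstd_cctx_workspace_bound(&params.cParams);
		size_t d_wksp_size = zstd_dctx_workspace_bound();
		void *c_wksp = kvmalloc(c_wksp_size, GFP_KERNEL);
		void *d_wksp = kvmalloc(d_wksp_size, GFP_KERNEL);
		void *compressed = kvmalloc(bound, GFP_KERNEL);
		zstd_cctx *cctx;
		zstd_dctx *dctx;
		size_t c_len, d_len;
		int ret = -ENOMEM;

		if (!c_wksp || !d_wksp || !compressed)
			goto out;

		ret = -EIO;
		cctx = zstd_init_cctx(c_wksp, c_wksp_size);
		dctx = zstd_init_dctx(d_wksp, d_wksp_size);
		if (!cctx || !dctx)
			goto out;

		/* returns the compressed size, or an error code */
		c_len = zstd_compress_cctx(cctx, compressed, bound,
					   src, src_size, &params);
		if (zstd_is_error(c_len))
			goto out;

		d_len = zstd_decompress_dctx(dctx, dst, dst_capacity,
					     compressed, c_len);
		ret = !zstd_is_error(d_len) && d_len == src_size ? 0 : -EIO;
	out:
		kvfree(compressed);
		kvfree(d_wksp);
		kvfree(c_wksp);
		return ret;
	}

In-tree users would normally keep the workspaces in a mempool rather than allocating per call, as the compress.c hunks below do.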


@ -0,0 +1,77 @@
/*
* Copyright (c) Yann Collet, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#ifndef ZSTD_ERRORS_H_398273423
#define ZSTD_ERRORS_H_398273423
/*===== dependency =====*/
#include <linux/types.h> /* size_t */
/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
#define ZSTDERRORLIB_VISIBILITY
#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
/*-*********************************************
* Error codes list
*-*********************************************
* Error codes _values_ are pinned down since v1.3.1 only.
* Therefore, don't rely on values if you may link to any version < v1.3.1.
*
* Only values < 100 are considered stable.
*
* note 1 : this API shall be used with static linking only.
* dynamic linking is not yet officially supported.
* note 2 : Prefer relying on the enum than on its value whenever possible
* This is the only supported way to use the error list < v1.3.1
* note 3 : ZSTD_isError() is always correct, whatever the library version.
**********************************************/
typedef enum {
ZSTD_error_no_error = 0,
ZSTD_error_GENERIC = 1,
ZSTD_error_prefix_unknown = 10,
ZSTD_error_version_unsupported = 12,
ZSTD_error_frameParameter_unsupported = 14,
ZSTD_error_frameParameter_windowTooLarge = 16,
ZSTD_error_corruption_detected = 20,
ZSTD_error_checksum_wrong = 22,
ZSTD_error_dictionary_corrupted = 30,
ZSTD_error_dictionary_wrong = 32,
ZSTD_error_dictionaryCreation_failed = 34,
ZSTD_error_parameter_unsupported = 40,
ZSTD_error_parameter_outOfBound = 42,
ZSTD_error_tableLog_tooLarge = 44,
ZSTD_error_maxSymbolValue_tooLarge = 46,
ZSTD_error_maxSymbolValue_tooSmall = 48,
ZSTD_error_stage_wrong = 60,
ZSTD_error_init_missing = 62,
ZSTD_error_memory_allocation = 64,
ZSTD_error_workSpace_tooSmall = 66,
ZSTD_error_dstSize_tooSmall = 70,
ZSTD_error_srcSize_wrong = 72,
ZSTD_error_dstBuffer_null = 74,
/* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
ZSTD_error_frameIndex_tooLarge = 100,
ZSTD_error_seekableIO = 102,
ZSTD_error_dstBuffer_wrong = 104,
ZSTD_error_srcBuffer_wrong = 105,
ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
} ZSTD_ErrorCode;
/*! ZSTD_getErrorCode() :
convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
which can be used to compare with enum list published above */
ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
#endif /* ZSTD_ERRORS_H_398273423 */
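A short illustrative sketch (not part of this commit) of mapping a size_t result onto the enum above; cctx, dst, src and params are assumed from surrounding context, and the pr_warn()/pr_err() reporting is an assumption:

	size_t ret = zstd_compress_cctx(cctx, dst, dst_capacity,
					src, src_size, &params);

	if (zstd_is_error(ret)) {
		ZSTD_ErrorCode err = ZSTD_getErrorCode(ret);

		/* compare against the enum, never against raw values */
		if (err == ZSTD_error_dstSize_tooSmall)
			pr_warn("zstd: destination buffer too small\n");

		pr_err("zstd: %s\n", ZSTD_getErrorString(err));
	}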


@ -65,16 +65,19 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
.atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
.atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
.atomic_trigger = bch2_mark_alloc, \
}
static inline bool bkey_is_alloc(const struct bkey *k)


@ -6,6 +6,7 @@
struct bch_fs;
struct btree;
struct btree_trans;
struct bkey;
enum btree_node_type;
@ -20,6 +21,10 @@ struct bkey_ops {
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int (*trans_trigger)(struct btree_trans *, struct bkey_s_c,
struct bkey_i *, unsigned);
int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c,
struct bkey_s_c, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@ -57,6 +62,28 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
static inline int bch2_mark_key(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_s_c new,
unsigned flags)
{
const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type];
return ops->atomic_trigger
? ops->atomic_trigger(trans, old, new, flags)
: 0;
}
static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
struct bkey_i *new, unsigned flags)
{
const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type];
return ops->trans_trigger
? ops->trans_trigger(trans, old, new, flags)
: 0;
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,


@ -1626,6 +1626,8 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
else
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@ -2094,7 +2096,6 @@ restart:
rcu_read_unlock();
wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
goto restart;
}
rcu_read_unlock();
}


@ -189,7 +189,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) {
mark_btree_node_locked(path, level, want);
mark_btree_node_locked(trans, path, level, want);
return true;
}
fail:
@ -240,7 +240,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans,
return false;
success:
mark_btree_node_intent_locked(path, level);
mark_btree_node_intent_locked(trans, path, level);
return true;
}
@ -486,14 +486,15 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
trans_for_each_path(trans, linked)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true);
}
if (!path->cached && !trans->in_traverse_all)
trans_for_each_path(trans, linked)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true);
}
return false;
}
@ -1167,7 +1168,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(path, b->c.level);
six_lock_increment(&b->c.lock, t);
mark_btree_node_locked(path, b->c.level, t);
mark_btree_node_locked(trans, path, b->c.level, t);
}
btree_path_level_init(trans, path, b);
@ -1244,7 +1245,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
mark_btree_node_locked(path, path->level, lock_type);
mark_btree_node_locked(trans, path, path->level, lock_type);
btree_path_level_init(trans, path, b);
return 0;
}
@ -1409,7 +1410,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (unlikely(ret))
goto err;
mark_btree_node_locked(path, level, lock_type);
mark_btree_node_locked(trans, path, level, lock_type);
btree_path_level_init(trans, path, b);
if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
@ -1442,6 +1443,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
trans->in_traverse_all = true;
retry_all:
trans->restarted = false;
trans->traverse_all_idx = U8_MAX;
trans_for_each_path(trans, path)
path->should_be_locked = false;
@ -1474,9 +1476,9 @@ retry_all:
}
/* Now, redo traversals in correct order: */
i = 0;
while (i < trans->nr_sorted) {
path = trans->paths + trans->sorted[i];
trans->traverse_all_idx = 0;
while (trans->traverse_all_idx < trans->nr_sorted) {
path = trans->paths + trans->sorted[trans->traverse_all_idx];
/*
* Traversing a path can cause another path to be added at about
@ -1484,10 +1486,13 @@ retry_all:
*/
if (path->uptodate) {
ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
if (ret)
if (ret == -EINTR || ret == -ENOMEM)
goto retry_all;
if (ret)
goto err;
BUG_ON(path->uptodate);
} else {
i++;
trans->traverse_all_idx++;
}
}
@ -1498,7 +1503,7 @@ retry_all:
*/
trans_for_each_path(trans, path)
BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
err:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
@ -1807,18 +1812,48 @@ free:
__bch2_path_free(trans, path);
}
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
struct btree_insert_entry *i;
pr_buf(buf, "transaction updates for %s journal seq %llu\n",
trans->fn, trans->journal_res.seq);
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
pr_buf(buf, "update: btree %s %pS\n old ",
bch2_btree_ids[i->btree_id],
(void *) i->ip_allocated);
bch2_bkey_val_to_text(buf, trans->c, old);
pr_buf(buf, "\n new ");
bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
pr_buf(buf, "\n");
}
}
noinline __cold
void bch2_dump_trans_updates(struct btree_trans *trans)
{
struct printbuf buf = PRINTBUF;
bch2_trans_updates_to_text(&buf, trans);
bch_err(trans->c, "%s", buf.buf);
printbuf_exit(&buf);
}
noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
struct btree_path *path;
struct btree_insert_entry *i;
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
struct printbuf buf = PRINTBUF;
unsigned idx;
trans_for_each_path_inorder(trans, path, idx) {
printbuf_reset(&buf1);
printbuf_reset(&buf);
bch2_bpos_to_text(&buf1, path->pos);
bch2_bpos_to_text(&buf, path->pos);
printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n",
path->idx, path->ref, path->intent_ref,
@ -1826,7 +1861,7 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans)
path->preserve ? " P" : "",
bch2_btree_ids[path->btree_id],
path->level,
buf1.buf,
buf.buf,
path->nodes_locked,
#ifdef CONFIG_BCACHEFS_DEBUG
(void *) path->ip_allocated
@ -1836,23 +1871,9 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans)
);
}
trans_for_each_update(trans, i) {
struct bkey u;
struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
printbuf_exit(&buf);
printbuf_reset(&buf1);
printbuf_reset(&buf2);
bch2_bkey_val_to_text(&buf1, trans->c, old);
bch2_bkey_val_to_text(&buf2, trans->c, bkey_i_to_s_c(i->k));
printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s",
bch2_btree_ids[i->btree_id],
(void *) i->ip_allocated,
buf1.buf, buf2.buf);
}
printbuf_exit(&buf2);
printbuf_exit(&buf1);
bch2_dump_trans_updates(trans);
}
static struct btree_path *btree_path_alloc(struct btree_trans *trans,
@ -2337,11 +2358,12 @@ out:
* bch2_btree_iter_peek: returns first key greater than or equal to iterator's
* current position
*/
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
struct bpos iter_pos;
int ret;
if (iter->update_path) {
@ -2357,6 +2379,24 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
if (!k.k || bkey_err(k))
goto out;
/*
* iter->pos should be monotonically increasing, and always be
* equal to the key we just returned - except extents can
* straddle iter->pos:
*/
if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
iter_pos = k.k->p;
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter_pos = bkey_start_pos(k.k);
else
iter_pos = iter->pos;
if (bkey_cmp(iter_pos, end) > 0) {
bch2_btree_iter_set_pos(iter, end);
k = bkey_s_c_null;
goto out;
}
if (iter->update_path &&
bkey_cmp(iter->update_path->pos, k.k->p)) {
bch2_path_put(trans, iter->update_path,
@ -2387,10 +2427,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
iter->update_path = bch2_btree_path_set_pos(trans,
iter->update_path, pos,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
BUG_ON(!(iter->update_path->nodes_locked & 1));
iter->update_path->should_be_locked = true;
_THIS_IP_);
}
/*
@ -2414,14 +2451,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
break;
}
/*
* iter->pos should be monotonically increasing, and always be equal to
* the key we just returned - except extents can straddle iter->pos:
*/
if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
iter->pos = k.k->p;
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
iter->pos = iter_pos;
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
iter->flags & BTREE_ITER_INTENT,
@ -2429,8 +2459,13 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
BUG_ON(!iter->path->nodes_locked);
out:
if (iter->update_path) {
BUG_ON(!(iter->update_path->nodes_locked & 1));
iter->update_path->should_be_locked = true;
if (iter->update_path->uptodate &&
!bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) {
k = bkey_s_c_err(-EINTR);
} else {
BUG_ON(!(iter->update_path->nodes_locked & 1));
iter->update_path->should_be_locked = true;
}
}
iter->path->should_be_locked = true;
@ -2661,9 +2696,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
struct bpos end = iter->pos;
if (iter->flags & BTREE_ITER_IS_EXTENTS)
end.offset = U64_MAX;
bch2_trans_copy_iter(&iter2, iter);
k = bch2_btree_iter_peek(&iter2);
k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
iter->k = iter2.k;
@ -2831,6 +2870,11 @@ static inline void btree_path_list_add(struct btree_trans *trans,
path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
if (trans->in_traverse_all &&
trans->traverse_all_idx != U8_MAX &&
trans->traverse_all_idx >= path->sorted_idx)
trans->traverse_all_idx++;
array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
for (i = path->sorted_idx; i < trans->nr_sorted; i++)


@ -209,9 +209,14 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
}
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
@ -306,13 +311,26 @@ static inline int bkey_err(struct bkey_s_c k)
}
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
unsigned flags)
{
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
}
static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
struct bpos end,
unsigned flags)
{
if (!(flags & BTREE_ITER_SLOTS))
return bch2_btree_iter_peek_upto(iter, end);
if (bkey_cmp(iter->pos, end) > 0)
return bkey_s_c_null;
return bch2_btree_iter_peek_slot(iter);
}
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
@ -349,6 +367,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
for (; \
(_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
@ -363,6 +389,8 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
/* new multiple iterator interface: */
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
unsigned, size_t, const char *);


@ -309,7 +309,7 @@ retry:
if (!ck)
goto retry;
mark_btree_node_locked(path, 0, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
@ -330,7 +330,7 @@ retry:
goto retry;
}
mark_btree_node_locked(path, 0, lock_want);
mark_btree_node_locked(trans, path, 0, lock_want);
}
path->l[0].lock_seq = ck->c.lock.state.seq;


@ -58,7 +58,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
path->nodes_intent_locked &= ~(1 << level);
}
static inline void mark_btree_node_locked(struct btree_path *path,
static inline void mark_btree_node_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level,
enum six_lock_type type)
{
@ -66,14 +67,17 @@ static inline void mark_btree_node_locked(struct btree_path *path,
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx);
path->nodes_locked |= 1 << level;
path->nodes_intent_locked |= type << level;
}
static inline void mark_btree_node_intent_locked(struct btree_path *path,
static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level)
{
mark_btree_node_locked(path, level, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, level, SIX_LOCK_intent);
}
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)


@ -361,7 +361,11 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 64
#else
#define BTREE_ITER_MAX 32
#endif
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@ -388,6 +392,7 @@ struct btree_trans {
u8 nr_sorted;
u8 nr_updates;
u8 traverse_all_idx;
bool used_mempool:1;
bool in_traverse_all:1;
bool restarted:1;


@ -213,7 +213,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
/**
* btree_insert_key - insert one key into a leaf node
*/
static bool btree_insert_key_leaf(struct btree_trans *trans,
static void btree_insert_key_leaf(struct btree_trans *trans,
struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
@ -226,7 +226,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
&insert_l(insert)->iter, insert->k)))
return false;
return;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
le64_to_cpu(i->journal_seq)));
@ -247,8 +247,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
return true;
}
/* Cached btree updates: */
@ -400,18 +398,16 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
bool did_work;
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
i->k->k.needs_whiteout = false;
did_work = !i->cached
? btree_insert_key_leaf(trans, i)
: bch2_btree_insert_key_cached(trans, i->path, i->k);
if (!did_work)
return;
if (!i->cached)
btree_insert_key_leaf(trans, i);
else
bch2_btree_insert_key_cached(trans, i->path, i->k);
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
bch2_journal_add_keys(j, &trans->journal_res,
@ -440,7 +436,8 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (!btree_node_type_needs_gc(i->btree_id))
return 0;
if (old.k->type == new->k.type &&
if (bch2_bkey_ops[old.k->type].atomic_trigger ==
bch2_bkey_ops[i->k->k.type].atomic_trigger &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
@ -485,7 +482,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (overwrite) {
ret = bch2_trans_mark_old(trans, old, i->flags);
} else if (old.k->type == i->k->k.type &&
} else if (bch2_bkey_ops[old.k->type].trans_trigger ==
bch2_bkey_ops[i->k->k.type].trans_trigger &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
i->overwrite_trigger_run = true;
ret = bch2_trans_mark_key(trans, old, i->k,
@ -652,6 +650,32 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (btree_node_type_needs_gc(i->bkey_type))
marking = true;
/*
* Revalidate before calling mem triggers - XXX, ugly:
*
* - successful btree node splits don't cause transaction
* restarts and will have invalidated the pointer to the bkey
* value
* - btree_node_lock_for_insert() -> btree_node_prep_for_write()
* when it has to re-sort
* - btree_key_can_insert_cached() when it has to reallocate
*
* Ugly because we currently have no way to tell if the
* pointer's been invalidated, which means it's debatable
* whether we should be stashing the old key at all.
*/
i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
struct bkey_i *j_k =
bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p);
if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) {
i->old_k = j_k->k;
i->old_v = &j_k->v;
}
}
}
/*
@ -1217,7 +1241,7 @@ int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_INTENT|
BTREE_ITER_WITH_UPDATES|
BTREE_ITER_NOT_EXTENTS);
k = bch2_btree_iter_peek(&iter);
k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@ -1369,7 +1393,8 @@ nomerge1:
goto out;
}
next:
k = bch2_btree_iter_next(&iter);
bch2_btree_iter_advance(&iter);
k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@ -1629,14 +1654,14 @@ int bch2_btree_delete_at(struct btree_trans *trans,
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
unsigned iter_flags,
unsigned update_flags,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
retry:
while ((bch2_trans_begin(trans),
(k = bch2_btree_iter_peek(&iter)).k) &&
@ -1679,7 +1704,8 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
BTREE_INSERT_NOFAIL);
BTREE_INSERT_NOFAIL|
update_flags);
bch2_disk_reservation_put(trans->c, &disk_res);
if (ret)
break;
@ -1701,10 +1727,10 @@ retry:
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
unsigned iter_flags,
unsigned update_flags,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
bch2_btree_delete_range_trans(&trans, id, start, end,
iter_flags, journal_seq));
update_flags, journal_seq));
}


@ -497,20 +497,25 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
static int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
struct bch_dev *ca;
struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev);
struct bucket *g;
struct bucket_mark old_m, m;
int ret = 0;
if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket ||
new_u.bucket >= ca->mi.nbuckets, trans,
"alloc key outside range of device's buckets"))
return -EIO;
/*
* alloc btree is read in by bch2_alloc_read, not gc:
*/
@ -550,11 +555,6 @@ static int bch2_mark_alloc(struct btree_trans *trans,
}
}
ca = bch_dev_bkey_exists(c, new_u.dev);
if (new_u.bucket >= ca->mi.nbuckets)
return 0;
percpu_down_read(&c->mark_lock);
if (!gc && new_u.gen != old_u.gen)
*bucket_gen(ca, new_u.bucket) = new_u.gen;
@ -929,9 +929,9 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
static int bch2_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
int bch2_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
@ -1011,9 +1011,9 @@ static int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
static int bch2_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
int bch2_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
@ -1118,9 +1118,9 @@ static int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
static int bch2_mark_inode(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
int bch2_mark_inode(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bch_fs_usage __percpu *fs_usage;
@ -1149,9 +1149,9 @@ static int bch2_mark_inode(struct btree_trans *trans,
return 0;
}
static int bch2_mark_reservation(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
int bch2_mark_reservation(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
@ -1228,9 +1228,9 @@ fsck_err:
return ret;
}
static int bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
int bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
@ -1267,39 +1267,6 @@ static int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
int bch2_mark_key(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_s_c new,
unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
switch (k.k->type) {
case KEY_TYPE_alloc:
case KEY_TYPE_alloc_v2:
case KEY_TYPE_alloc_v3:
return bch2_mark_alloc(trans, old, new, flags);
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
return bch2_mark_extent(trans, old, new, flags);
case KEY_TYPE_stripe:
return bch2_mark_stripe(trans, old, new, flags);
case KEY_TYPE_inode:
case KEY_TYPE_inode_v2:
return bch2_mark_inode(trans, old, new, flags);
case KEY_TYPE_reservation:
return bch2_mark_reservation(trans, old, new, flags);
case KEY_TYPE_reflink_p:
return bch2_mark_reflink_p(trans, old, new, flags);
case KEY_TYPE_snapshot:
return bch2_mark_snapshot(trans, old, new, flags);
default:
return 0;
}
}
static noinline __cold
void fs_usage_apply_warn(struct btree_trans *trans,
unsigned disk_res_sectors,
@ -1462,7 +1429,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_stripe *s;
@ -1478,16 +1444,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
goto err;
if (k.k->type != KEY_TYPE_stripe) {
bch2_fs_inconsistent(c,
bch2_trans_inconsistent(trans,
"pointer to nonexistent stripe %llu",
(u64) p.ec.idx);
bch2_inconsistent_error(c);
ret = -EIO;
goto err;
}
if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
bch2_fs_inconsistent(c,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
ret = -EIO;
@ -1516,10 +1481,14 @@ err:
return ret;
}
static int bch2_trans_mark_extent(struct btree_trans *trans,
struct bkey_s_c k, unsigned flags)
int bch2_trans_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@ -1601,8 +1570,8 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
goto err;
if (!deleting) {
if (bch2_fs_inconsistent_on(u.stripe ||
u.stripe_redundancy, c,
if (bch2_trans_inconsistent_on(u.stripe ||
u.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
iter.pos.inode, iter.pos.offset, u.gen,
bch2_data_types[u.data_type],
@ -1612,7 +1581,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
goto err;
}
if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
if (bch2_trans_inconsistent_on(data_type && u.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
iter.pos.inode, iter.pos.offset, u.gen,
bch2_data_types[u.data_type],
@ -1625,8 +1594,8 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
u.stripe = s.k->p.offset;
u.stripe_redundancy = s.v->nr_redundant;
} else {
if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
u.stripe_redundancy != s.v->nr_redundant, c,
if (bch2_trans_inconsistent_on(u.stripe != s.k->p.offset ||
u.stripe_redundancy != s.v->nr_redundant, trans,
"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
iter.pos.inode, iter.pos.offset, u.gen,
s.k->p.offset, u.stripe)) {
@ -1650,9 +1619,9 @@ err:
return ret;
}
static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
const struct bch_stripe *old_s = NULL;
struct bch_stripe *new_s = NULL;
@ -1720,10 +1689,10 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
static int bch2_trans_mark_inode(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
int bch2_trans_mark_inode(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
@ -1736,9 +1705,14 @@ static int bch2_trans_mark_inode(struct btree_trans *trans,
return 0;
}
static int bch2_trans_mark_reservation(struct btree_trans *trans,
struct bkey_s_c k, unsigned flags)
int bch2_trans_mark_reservation(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@ -1787,7 +1761,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
refcount = bkey_refcount(n);
if (!refcount) {
bch2_bkey_val_to_text(&buf, c, p.s_c);
bch2_fs_inconsistent(c,
bch2_trans_inconsistent(trans,
"nonexistent indirect extent at %llu while marking\n %s",
*idx, buf.buf);
ret = -EIO;
@ -1796,7 +1770,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
bch2_bkey_val_to_text(&buf, c, p.s_c);
bch2_fs_inconsistent(c,
bch2_trans_inconsistent(trans,
"indirect extent refcount underflow at %llu while marking\n %s",
*idx, buf.buf);
ret = -EIO;
@ -1837,9 +1811,14 @@ err:
return ret;
}
static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c k, unsigned flags)
int bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
@ -1860,33 +1839,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
return ret;
}
int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
struct bkey_i *new, unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
return bch2_trans_mark_extent(trans, k, flags);
case KEY_TYPE_stripe:
return bch2_trans_mark_stripe(trans, old, new, flags);
case KEY_TYPE_inode:
case KEY_TYPE_inode_v2:
return bch2_trans_mark_inode(trans, old, new, flags);
case KEY_TYPE_reservation:
return bch2_trans_mark_reservation(trans, k, flags);
case KEY_TYPE_reflink_p:
return bch2_trans_mark_reflink_p(trans, k, flags);
default:
return 0;
}
}
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,


@ -229,6 +229,19 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,


@ -197,9 +197,11 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
goto err;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
ret = ZSTD_decompressDCtx(ctx,
src_len = le32_to_cpup(src_data.b);
ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
@ -333,8 +335,8 @@ static int attempt_compress(struct bch_fs *c,
return strm.total_out;
}
case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
zstd_cctx_workspace_bound(&c->zstd_params.cParams));
/*
* ZSTD requires that when we decompress we pass in the exact
@ -347,11 +349,11 @@ static int attempt_compress(struct bch_fs *c,
* factor (7 bytes) from the dst buffer size to account for
* that.
*/
size_t len = ZSTD_compressCCtx(ctx,
size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4 - 7,
src, src_len,
c->zstd_params);
if (ZSTD_isError(len))
&c->zstd_params);
if (zstd_is_error(len))
return 0;
*((__le32 *) dst) = cpu_to_le32(len);
@ -546,7 +548,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max);
struct {
unsigned feature;
unsigned type;
@ -558,8 +560,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
ZSTD_CCtxWorkspaceBound(params.cParams),
ZSTD_DCtxWorkspaceBound() },
zstd_cctx_workspace_bound(&params.cParams),
zstd_dctx_workspace_bound() },
}, *i;
int ret = 0;


@ -470,16 +470,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
if (ret)
return ret;
for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
SPOS(dir.inum, 0, snapshot), 0, k, ret) {
if (k.k->p.inode > dir.inum)
break;
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
SPOS(dir.inum, 0, snapshot),
POS(dir.inum, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
ret = -ENOTEMPTY;
break;
}
}
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -503,11 +500,9 @@ retry:
if (ret)
goto err;
for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
if (k.k->p.inode > inum.inum)
break;
for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
SPOS(inum.inum, ctx->pos, snapshot),
POS(inum.inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;


@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
.trans_trigger = bch2_trans_mark_stripe, \
.atomic_trigger = bch2_mark_stripe, \
}
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)


@ -66,6 +66,26 @@ do { \
_ret; \
})
/*
* When a transaction update discovers or causes a filesystem inconsistency,
* it's helpful to also dump the pending updates:
*/
#define bch2_trans_inconsistent(trans, ...) \
({ \
bch_err(trans->c, __VA_ARGS__); \
bch2_inconsistent_error(trans->c); \
bch2_dump_trans_updates(trans); \
})
#define bch2_trans_inconsistent_on(cond, trans, ...) \
({ \
bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_trans_inconsistent(trans, __VA_ARGS__); \
_ret; \
})
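/*
 * Illustrative usage (an editorial sketch mirroring bch2_mark_alloc()
 * elsewhere in this commit, not part of the header itself):
 *
 *	if (bch2_trans_inconsistent_on(new_u.bucket >= ca->mi.nbuckets,
 *				       trans,
 *				       "alloc key outside range of device's buckets"))
 *		return -EIO;
 */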
/*
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
* be able to repair:


@ -381,6 +381,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
.trans_trigger = bch2_trans_mark_extent, \
.atomic_trigger = bch2_mark_extent, \
}
#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
@ -388,6 +390,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
.trans_trigger = bch2_trans_mark_extent, \
.atomic_trigger = bch2_mark_extent, \
}
/* KEY_TYPE_extent: */
@ -402,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
.trans_trigger = bch2_trans_mark_extent, \
.atomic_trigger = bch2_mark_extent, \
}
/* KEY_TYPE_reservation: */
@ -414,6 +420,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
.trans_trigger = bch2_trans_mark_reservation, \
.atomic_trigger = bch2_mark_reservation, \
}
/* Extent checksum entries: */


@@ -35,6 +35,15 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
static inline bool bio_full(struct bio *bio, unsigned len)
{
if (bio->bi_vcnt >= bio->bi_max_vecs)
return true;
if (bio->bi_iter.bi_size > UINT_MAX - len)
return true;
return false;
}
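bio_full() is carried locally here, presumably because it is no longer available from the headers this tree builds against; the intended use is a cheap can-this-bio-take-more-bytes check before adding a page. Illustrative only, not from this commit:

	if (!bio_full(bio, PAGE_SIZE)) {
		__bio_add_page(bio, page, PAGE_SIZE, 0);
	} else {
		submit_bio(bio);	/* flush the full bio, then start a new one */
	}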
static inline struct address_space *faults_disabled_mapping(void)
{
return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
@@ -1808,11 +1817,11 @@ again:
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE - offset);
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
ret = -EFAULT;
break;
}
@@ -1870,7 +1879,7 @@ static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret, 0);
dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
@@ -1919,7 +1928,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
iter->count -= shorten;
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_VECS),
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@@ -1954,7 +1963,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
goto start;
while (iter->count) {
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_VECS),
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
&c->bio_read);
bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
@@ -2101,7 +2110,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
while (1) {
iter_count = dio->iter.count;
if (kthread)
if (kthread && dio->mm)
kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -2111,7 +2120,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dropped_locks = fdm_dropped_locks();
current->faults_disabled_mapping = NULL;
if (kthread)
if (kthread && dio->mm)
kthread_unuse_mm(dio->mm);
/*
@@ -2244,7 +2253,7 @@ err:
inode_dio_end(&inode->v);
if (!sync) {
req->ki_complete(req, ret, 0);
req->ki_complete(req, ret);
ret = -EIOCBQUEUED;
}
return ret;
@@ -2304,9 +2313,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_is_bvec(iter)
? 0
: iov_iter_npages(iter, BIO_MAX_VECS),
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
init_completion(&dio->done);

View File

@@ -30,6 +30,7 @@
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>
@@ -934,9 +935,8 @@ retry:
SPOS(ei->v.i_ino, start, snapshot), 0);
while (!(ret = btree_trans_too_many_iters(&trans)) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(iter.pos, end) < 0) {
(k = bch2_btree_iter_peek_upto(&iter, end)).k &&
!(ret = bkey_err(k))) {
enum btree_id data_btree = BTREE_ID_extents;
if (!bkey_extent_is_data(k.k) &&

View File

@@ -606,12 +606,12 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bch2_btree_iter_set_snapshot(&iter, snapshot);
k = bch2_btree_iter_peek(&iter);
k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k || iter.pos.inode != inum.inum)
if (!k.k)
break;
bkey_init(&delete.k);

View File

@@ -13,11 +13,15 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode (struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
.trans_trigger = bch2_trans_mark_inode, \
.atomic_trigger = bch2_mark_inode, \
}
#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
.trans_trigger = bch2_trans_mark_inode, \
.atomic_trigger = bch2_mark_inode, \
}
static inline bool bkey_is_inode(const struct bkey *k)

View File

@@ -241,6 +241,9 @@ static int journal_entry_open(struct journal *j)
if (u64s <= 0)
return cur_entry_journal_full;
if (fifo_empty(&j->pin) && j->reclaim_thread)
wake_up_process(j->reclaim_thread);
/*
* The fifo_push() needs to happen at the same time as j->seq is
* incremented for journal_last_seq() to be calculated correctly
@@ -628,31 +631,6 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
return ret ?: ret2 < 0 ? ret2 : 0;
}
int bch2_journal_meta(struct journal *j)
{
struct journal_buf *buf;
struct journal_res res;
int ret;
memset(&res, 0, sizeof(res));
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
if (!buf->flush_time) {
buf->flush_time = local_clock() ?: 1;
buf->expires = jiffies;
}
bch2_journal_res_put(j, &res);
return bch2_journal_flush_seq(j, res.seq);
}
/*
* bch2_journal_flush_async - if there is an open journal entry, or a journal
* still being written, write it and wait for the write to complete
@@ -705,6 +683,64 @@ out:
return ret;
}
int bch2_journal_meta(struct journal *j)
{
struct journal_buf *buf;
struct journal_res res;
int ret;
memset(&res, 0, sizeof(res));
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
if (!buf->flush_time) {
buf->flush_time = local_clock() ?: 1;
buf->expires = jiffies;
}
bch2_journal_res_put(j, &res);
return bch2_journal_flush_seq(j, res.seq);
}
int bch2_journal_log_msg(struct journal *j, const char *fmt, ...)
{
struct jset_entry_log *entry;
struct journal_res res = { 0 };
unsigned msglen, u64s;
va_list args;
int ret;
va_start(args, fmt);
msglen = vsnprintf(NULL, 0, fmt, args) + 1;
va_end(args);
u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64)));
ret = bch2_journal_res_get(j, &res, u64s, 0);
if (ret)
return ret;
entry = container_of(journal_res_entry(j, &res),
struct jset_entry_log, entry);
memset(entry, 0, u64s * sizeof(u64));
entry->entry.type = BCH_JSET_ENTRY_log;
entry->entry.u64s = u64s - 1;
va_start(args, fmt);
vsnprintf(entry->d, INT_MAX, fmt, args);
va_end(args);
bch2_journal_res_put(j, &res);
return bch2_journal_flush_seq(j, res.seq);
}
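bch2_journal_log_msg() formats its arguments into a BCH_JSET_ENTRY_log entry, padded out to a whole number of u64s, and flushes the sequence it landed in; the recovery change further down uses it to record that journal replay finished. An illustrative call (message text is made up):

	ret = bch2_journal_log_msg(&c->journal, "%s: shutting down", c->name);
	if (ret)
		bch_err(c, "error writing log message to journal");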
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
@@ -802,7 +838,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
* superblock before inserting into the journal array
*/
pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
pos = ja->discard_idx ?: ja->nr;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
__array_insert_item(journal_buckets->buckets, ja->nr, pos);

View File

@@ -480,6 +480,7 @@ int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
int bch2_journal_log_msg(struct journal *, const char *, ...);
void bch2_journal_halt(struct journal *);

View File

@@ -670,6 +670,7 @@ static int bch2_journal_reclaim_thread(void *arg)
struct journal *j = arg;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned long delay, now;
bool journal_empty;
int ret = 0;
set_freezable();
@@ -696,10 +697,17 @@ static int bch2_journal_reclaim_thread(void *arg)
break;
if (j->reclaim_kicked)
break;
if (time_after_eq(jiffies, j->next_reclaim))
break;
freezable_schedule_timeout(j->next_reclaim - jiffies);
spin_lock(&j->lock);
journal_empty = fifo_empty(&j->pin);
spin_unlock(&j->lock);
if (journal_empty)
freezable_schedule();
else if (time_after(j->next_reclaim, jiffies))
freezable_schedule_timeout(j->next_reclaim - jiffies);
else
break;
}
__set_current_state(TASK_RUNNING);
}
@@ -809,10 +817,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
while (!ret) {
struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
if (seq >= j->pin.back)
break;
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
journal_seq_pin(j, seq)->devs);
seq++;

View File

@@ -240,6 +240,10 @@ struct journal {
spinlock_t err_lock;
struct mutex reclaim_lock;
/*
* Used for waiting until journal reclaim has freed up space in the
* journal:
*/
wait_queue_head_t reclaim_wait;
struct task_struct *reclaim_thread;
bool reclaim_kicked;

View File

@@ -578,6 +578,9 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
if (keys->nr && !ret)
bch2_journal_log_msg(&c->journal, "journal replay finished");
err:
kvfree(keys_sorted);
return ret;

View File

@@ -10,7 +10,9 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
.key_merge = bch2_reflink_p_merge, \
.trans_trigger = bch2_trans_mark_reflink_p, \
.atomic_trigger = bch2_mark_reflink_p, \
}
const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -21,6 +23,8 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
.trans_trigger = bch2_trans_mark_extent, \
.atomic_trigger = bch2_mark_extent, \
}
const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,

View File

@@ -163,12 +163,10 @@ bch2_hash_lookup(struct btree_trans *trans,
if (ret)
return ret;
for_each_btree_key_norestart(trans, *iter, desc.btree_id,
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|flags, k, ret) {
if (iter->pos.inode != inum.inum)
break;
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
return 0;
@@ -199,15 +197,12 @@ bch2_hash_hole(struct btree_trans *trans,
if (ret)
return ret;
for_each_btree_key_norestart(trans, *iter, desc.btree_id,
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (iter->pos.inode != inum.inum)
break;
POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
}
bch2_trans_iter_exit(trans, iter);
return ret ?: -ENOSPC;
@@ -260,14 +255,12 @@ int bch2_hash_set(struct btree_trans *trans,
if (ret)
return ret;
for_each_btree_key_norestart(trans, iter, desc.btree_id,
for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
SPOS(inum.inum,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (iter.pos.inode != inum.inum)
break;
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;

View File

@@ -607,23 +607,32 @@ STORE(bch2_fs_opts_dir)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int ret, id = opt - bch2_opt_table;
int ret = size, id = opt - bch2_opt_table;
char *tmp;
u64 v;
/*
* We don't need to take c->writes for correctness, but it eliminates an
* unsightly error message in the dmesg log when we're RO:
*/
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp)
return -ENOMEM;
if (!tmp) {
ret = -ENOMEM;
goto err;
}
ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
kfree(tmp);
if (ret < 0)
return ret;
goto err;
ret = bch2_opt_check_may_set(c, id, v);
if (ret < 0)
return ret;
goto err;
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
@@ -633,8 +642,9 @@ STORE(bch2_fs_opts_dir)
bch2_rebalance_add_work(c, S64_MAX);
rebalance_wakeup(c);
}
return size;
err:
percpu_ref_put(&c->writes);
return ret;
}
SYSFS_OPS(bch2_fs_opts_dir);
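The rewritten store path now holds a reference on c->writes across the whole operation, so every exit, success or failure, funnels through the same put. A condensed sketch of the pattern; do_parse_and_apply() is a hypothetical stand-in for the parsing and option-setting steps above:

	if (unlikely(!percpu_ref_tryget(&c->writes)))
		return -EROFS;

	ret = do_parse_and_apply(c, buf);	/* hypothetical stand-in */

	percpu_ref_put(&c->writes);
	return ret < 0 ? ret : size;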

View File

@@ -15,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c)
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_extents,
POS_MIN, SPOS_MAX,
BTREE_ITER_ALL_SNAPSHOTS,
SPOS(0, 0, U32_MAX), SPOS_MAX,
0,
NULL);
BUG_ON(ret);
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
POS_MIN, SPOS_MAX,
BTREE_ITER_ALL_SNAPSHOTS,
NULL);
SPOS(0, 0, U32_MAX), SPOS_MAX,
0, NULL);
BUG_ON(ret);
}
@@ -814,9 +813,8 @@ static int seq_delete(struct bch_fs *c, u64 nr)
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
POS_MIN, SPOS_MAX,
BTREE_ITER_ALL_SNAPSHOTS,
NULL);
SPOS(0, 0, U32_MAX), SPOS_MAX,
0, NULL);
if (ret)
bch_err(c, "error in seq_delete: %i", ret);
return ret;

View File

@@ -372,7 +372,7 @@ static inline void pr_time(struct printbuf *out, u64 _time)
#ifdef __KERNEL__
static inline void uuid_unparse_lower(u8 *uuid, char *out)
{
sprintf(out, "%plU", uuid);
sprintf(out, "%pUb", uuid);
}
#else
#include <uuid/uuid.h>

View File

@@ -311,13 +311,9 @@ retry:
if (ret)
goto err;
for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
SPOS(inum, offset, snapshot), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
break;
for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
SPOS(inum, offset, snapshot),
POS(inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_xattr)
continue;

View File

@@ -0,0 +1,157 @@
// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
* Copyright (c) Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/zstd.h>
#define ZSTD_FORWARD_IF_ERR(ret) \
do { \
size_t const __ret = (ret); \
if (ZSTD_isError(__ret)) \
return __ret; \
} while (0)
static size_t zstd_cctx_init(zstd_cctx *cctx, const zstd_parameters *parameters,
unsigned long long pledged_src_size)
{
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_reset(
cctx, ZSTD_reset_session_and_parameters));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setPledgedSrcSize(
cctx, pledged_src_size));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_windowLog, parameters->cParams.windowLog));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_hashLog, parameters->cParams.hashLog));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_chainLog, parameters->cParams.chainLog));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_searchLog, parameters->cParams.searchLog));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_minMatch, parameters->cParams.minMatch));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_targetLength, parameters->cParams.targetLength));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_strategy, parameters->cParams.strategy));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_contentSizeFlag, parameters->fParams.contentSizeFlag));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_checksumFlag, parameters->fParams.checksumFlag));
ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
cctx, ZSTD_c_dictIDFlag, !parameters->fParams.noDictIDFlag));
return 0;
}
int zstd_min_clevel(void)
{
return ZSTD_minCLevel();
}
EXPORT_SYMBOL(zstd_min_clevel);
int zstd_max_clevel(void)
{
return ZSTD_maxCLevel();
}
EXPORT_SYMBOL(zstd_max_clevel);
size_t zstd_compress_bound(size_t src_size)
{
return ZSTD_compressBound(src_size);
}
EXPORT_SYMBOL(zstd_compress_bound);
zstd_parameters zstd_get_params(int level,
unsigned long long estimated_src_size)
{
return ZSTD_getParams(level, estimated_src_size, 0);
}
EXPORT_SYMBOL(zstd_get_params);
size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams)
{
return ZSTD_estimateCCtxSize_usingCParams(*cparams);
}
EXPORT_SYMBOL(zstd_cctx_workspace_bound);
zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size)
{
if (workspace == NULL)
return NULL;
return ZSTD_initStaticCCtx(workspace, workspace_size);
}
EXPORT_SYMBOL(zstd_init_cctx);
size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
const void *src, size_t src_size, const zstd_parameters *parameters)
{
ZSTD_FORWARD_IF_ERR(zstd_cctx_init(cctx, parameters, src_size));
return ZSTD_compress2(cctx, dst, dst_capacity, src, src_size);
}
EXPORT_SYMBOL(zstd_compress_cctx);
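A minimal sketch of driving the wrapped one-shot API, assuming a kvmalloc'd workspace and a caller-supplied dst buffer sized with zstd_compress_bound():

	zstd_parameters params = zstd_get_params(3, src_size);
	size_t ws_size = zstd_cctx_workspace_bound(&params.cParams);
	void *ws = kvmalloc(ws_size, GFP_KERNEL);
	zstd_cctx *cctx;
	size_t n;

	if (!ws)
		return -ENOMEM;

	cctx = zstd_init_cctx(ws, ws_size);	/* NULL only if ws were NULL */
	n = zstd_compress_cctx(cctx, dst, dst_capacity, src, src_size, &params);
	if (zstd_is_error(n))
		pr_err("zstd: %s\n", zstd_get_error_name(n));

	kvfree(ws);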
size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams)
{
return ZSTD_estimateCStreamSize_usingCParams(*cparams);
}
EXPORT_SYMBOL(zstd_cstream_workspace_bound);
zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
unsigned long long pledged_src_size, void *workspace, size_t workspace_size)
{
zstd_cstream *cstream;
if (workspace == NULL)
return NULL;
cstream = ZSTD_initStaticCStream(workspace, workspace_size);
if (cstream == NULL)
return NULL;
/* 0 means "unknown" in the kernel zstd API, but means a zero-length source in the upstream API */
if (pledged_src_size == 0)
pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN;
if (ZSTD_isError(zstd_cctx_init(cstream, parameters, pledged_src_size)))
return NULL;
return cstream;
}
EXPORT_SYMBOL(zstd_init_cstream);
size_t zstd_reset_cstream(zstd_cstream *cstream,
unsigned long long pledged_src_size)
{
return ZSTD_resetCStream(cstream, pledged_src_size);
}
EXPORT_SYMBOL(zstd_reset_cstream);
size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
zstd_in_buffer *input)
{
return ZSTD_compressStream(cstream, output, input);
}
EXPORT_SYMBOL(zstd_compress_stream);
size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output)
{
return ZSTD_flushStream(cstream, output);
}
EXPORT_SYMBOL(zstd_flush_stream);
size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output)
{
return ZSTD_endStream(cstream, output);
}
EXPORT_SYMBOL(zstd_end_stream);
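For the streaming entry points the shape is similar; a hedged sketch, reusing params/ws/ws_size from the one-shot example above and assuming dst is large enough that a single zstd_end_stream() call drains everything (it returns the number of bytes still buffered, 0 when done):

	zstd_in_buffer in = { .src = src, .size = src_size, .pos = 0 };
	zstd_out_buffer out = { .dst = dst, .size = dst_capacity, .pos = 0 };
	zstd_cstream *cs = zstd_init_cstream(&params, src_size, ws, ws_size);

	if (!cs)
		return -EINVAL;
	while (in.pos < in.size)
		if (zstd_is_error(zstd_compress_stream(cs, &out, &in)))
			return -EIO;
	if (zstd_end_stream(cs, &out) != 0)
		return -EIO;
	/* out.pos now holds the compressed length */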
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("Zstd Compressor");

View File

@@ -0,0 +1,103 @@
// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
* Copyright (c) Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/zstd.h>
/* Common symbols. zstd_compress must depend on zstd_decompress. */
unsigned int zstd_is_error(size_t code)
{
return ZSTD_isError(code);
}
EXPORT_SYMBOL(zstd_is_error);
zstd_error_code zstd_get_error_code(size_t code)
{
return ZSTD_getErrorCode(code);
}
EXPORT_SYMBOL(zstd_get_error_code);
const char *zstd_get_error_name(size_t code)
{
return ZSTD_getErrorName(code);
}
EXPORT_SYMBOL(zstd_get_error_name);
/* Decompression symbols. */
size_t zstd_dctx_workspace_bound(void)
{
return ZSTD_estimateDCtxSize();
}
EXPORT_SYMBOL(zstd_dctx_workspace_bound);
zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size)
{
if (workspace == NULL)
return NULL;
return ZSTD_initStaticDCtx(workspace, workspace_size);
}
EXPORT_SYMBOL(zstd_init_dctx);
size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
const void *src, size_t src_size)
{
return ZSTD_decompressDCtx(dctx, dst, dst_capacity, src, src_size);
}
EXPORT_SYMBOL(zstd_decompress_dctx);
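Mirror image on the decompression side; again a sketch assuming a kvmalloc'd workspace:

	size_t ws_size = zstd_dctx_workspace_bound();
	void *ws = kvmalloc(ws_size, GFP_KERNEL);
	zstd_dctx *dctx;
	size_t n;

	if (!ws)
		return -ENOMEM;

	dctx = zstd_init_dctx(ws, ws_size);
	n = zstd_decompress_dctx(dctx, dst, dst_capacity, src, src_size);
	if (zstd_is_error(n))
		pr_err("zstd: %s\n", zstd_get_error_name(n));

	kvfree(ws);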
size_t zstd_dstream_workspace_bound(size_t max_window_size)
{
return ZSTD_estimateDStreamSize(max_window_size);
}
EXPORT_SYMBOL(zstd_dstream_workspace_bound);
zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
size_t workspace_size)
{
if (workspace == NULL)
return NULL;
(void)max_window_size;
return ZSTD_initStaticDStream(workspace, workspace_size);
}
EXPORT_SYMBOL(zstd_init_dstream);
size_t zstd_reset_dstream(zstd_dstream *dstream)
{
return ZSTD_resetDStream(dstream);
}
EXPORT_SYMBOL(zstd_reset_dstream);
size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
zstd_in_buffer *input)
{
return ZSTD_decompressStream(dstream, output, input);
}
EXPORT_SYMBOL(zstd_decompress_stream);
size_t zstd_find_frame_compressed_size(const void *src, size_t src_size)
{
return ZSTD_findFrameCompressedSize(src, src_size);
}
EXPORT_SYMBOL(zstd_find_frame_compressed_size);
size_t zstd_get_frame_header(zstd_frame_header *header, const void *src,
size_t src_size)
{
return ZSTD_getFrameHeader(header, src, src_size);
}
EXPORT_SYMBOL(zstd_get_frame_header);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("Zstd Decompressor");