mirror of https://github.com/koverstreet/bcachefs-tools.git
synced 2025-04-11 00:00:03 +03:00

Update bcachefs sources to 717b356d1d bcachefs: Convert journal validation to bkey_invalid_flags

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

parent caeeba5152
commit 15b24c7327

Changed files:
.bcachefs_revision  cmd_device.c  cmd_dump.c  cmd_kill_btree_node.c
libbcachefs/: alloc_foreground.h bcachefs.h bkey.c bkey.h bkey_methods.h btree_cache.c btree_gc.c btree_gc.h btree_io.c btree_iter.c btree_iter.h btree_journal_iter.c btree_journal_iter.h btree_trans_commit.c btree_update.c btree_update_interior.c buckets.h checksum.c checksum.h disk_groups.c errcode.h extents.c extents.h fs-io-buffered.c fs-io-buffered.h fs-io-direct.c fs-io-direct.h fs-io-pagecache.c fs-io-pagecache.h fs-io.c fs-io.h fs.c fsck.c inode.c journal_io.c journal_io.h journal_reclaim.c movinggc.c recovery.c recovery.h sb-clean.c sb-clean.h sb-members.c sb-members.h super-io.c super-io.h super.c super.h
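
The upstream change being pulled in converts the bkey/bset validation paths from returning error strings to reporting through a printbuf plus an enum bkey_invalid_flags argument. For orientation, the shape of the conversion as it appears in the hunks below (both prototypes are copied from the bkey.h diff, nothing here is new code):

	/* old convention: NULL on success, static error string on failure */
	const char *bch2_bkey_format_validate(struct bkey_format *);

	/* new convention: 0 or -BCH_ERR_invalid, human-readable detail written to *err */
	int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
				     enum bkey_invalid_flags, struct printbuf *);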
.bcachefs_revision
@@ -1 +1 @@
-5b8c4a1366df20bc043404cb882230ce86296590
+717b356d1dfdf178ac46e217c81bb710b7e77032
cmd_device.c
@@ -16,6 +16,7 @@
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/journal.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super-io.h"
 #include "cmds.h"
 #include "libbcachefs.h"
cmd_dump.c
@@ -13,6 +13,7 @@
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/error.h"
 #include "libbcachefs/extents.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super.h"
 
 static void dump_usage(void)
cmd_kill_btree_node.c
@@ -11,6 +11,7 @@
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/error.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super.h"
 
 static void kill_btree_node_usage(void)
libbcachefs/alloc_foreground.h
@@ -5,7 +5,7 @@
 #include "bcachefs.h"
 #include "alloc_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
 
 #include <linux/hash.h>
 
libbcachefs/bcachefs.h
@@ -294,8 +294,8 @@ do {								\
 
 #define bch_err_fn(_c, _ret)					\
 	bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret))
-#define bch_err_msg(_c, _ret, _msg)				\
-	bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret))
+#define bch_err_msg(_c, _ret, _msg, ...)			\
+	bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret))
 
 #define bch_verbose(c, fmt, ...)				\
 do {								\
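
The variadic bch_err_msg() lets callers format extra context directly into the error message. A hypothetical call site, only to illustrate the new form (bch2_do_something() and dev_idx are made-up names, not part of this diff):

	int ret = bch2_do_something(c, dev_idx);
	if (ret)
		bch_err_msg(c, ret, "while processing device %u", dev_idx);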
@@ -995,6 +995,7 @@ struct bch_fs {
 	enum bch_recovery_pass	curr_recovery_pass;
 	/* bitmap of explicitly enabled recovery passes: */
 	u64			recovery_passes_explicit;
+	u64			recovery_passes_complete;
 
 	/* DEBUG JUNK */
 	struct dentry		*fs_debug_dir;
@@ -1139,22 +1140,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
 	return dev < c->sb.nr_devices && c->devs[dev];
 }
 
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
-						  enum bch_recovery_pass pass)
-{
-	c->recovery_passes_explicit |= BIT_ULL(pass);
-
-	if (c->curr_recovery_pass >= pass) {
-		c->curr_recovery_pass = pass;
-		return -BCH_ERR_restart_recovery;
-	} else {
-		return 0;
-	}
-}
-
 #define BKEY_PADDED_ONSTACK(key, pad)				\
 	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
libbcachefs/bkey.c
@@ -7,14 +7,6 @@
 #include "bset.h"
 #include "util.h"
 
-#undef EBUG_ON
-
-#ifdef DEBUG_BKEYS
-#define EBUG_ON(cond)		BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
 const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
 
 void bch2_bkey_packed_to_binary_text(struct printbuf *out,
@@ -184,6 +176,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field)
 	return v + offset;
 }
 
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+	unsigned bits = state->format->bits_per_field[field];
+
+	if (bits) {
+		if (bits > state->bits) {
+			bits -= state->bits;
+			/* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+			state->w |= (v >> 1) >> (bits - 1);
+
+			*state->p = state->w;
+			state->p = next_word(state->p);
+			state->w = 0;
+			state->bits = 64;
+		}
+
+		state->bits -= bits;
+		state->w |= v << state->bits;
+	}
+}
+
 __always_inline
 static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
 {
@@ -198,20 +212,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
 	if (fls64(v) > bits)
 		return false;
 
-	if (bits > state->bits) {
-		bits -= state->bits;
-		/* avoid shift by 64 if bits is 0 - bits is never 64 here: */
-		state->w |= (v >> 1) >> (bits - 1);
-
-		*state->p = state->w;
-		state->p = next_word(state->p);
-		state->w = 0;
-		state->bits = 64;
-	}
-
-	state->bits -= bits;
-	state->w |= v << state->bits;
-
+	__set_inc_field(state, field, v);
 	return true;
 }
 
@@ -380,19 +381,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
 		ret = false;
 	}
 
-	if (bits > state->bits) {
-		bits -= state->bits;
-		state->w |= (v >> 1) >> (bits - 1);
-
-		*state->p = state->w;
-		state->p = next_word(state->p);
-		state->w = 0;
-		state->bits = 64;
-	}
-
-	state->bits -= bits;
-	state->w |= v << state->bits;
-
+	__set_inc_field(state, field, v);
 	return ret;
 }
 
@@ -435,6 +424,24 @@ static bool bkey_packed_successor(struct bkey_packed *out,
 
 	return false;
 }
 
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+	for (unsigned i = 0; i < f->nr_fields; i++) {
+		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+		u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+		u64 packed_max = f->bits_per_field[i]
+			? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+			: 0;
+		u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+		if (packed_max + field_offset < packed_max ||
+		    packed_max + field_offset > unpacked_max)
+			return true;
+	}
+
+	return false;
+}
 #endif
 
 /*
@@ -515,7 +522,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
 
 	BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
 	BUG_ON(bkey_packed_successor(&successor, b, *out) &&
-	       bkey_cmp_left_packed(b, &successor, &orig) < 0);
+	       bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+	       !bkey_format_has_too_big_fields(f));
 }
 #endif
 
@@ -604,40 +612,74 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
 		}
 	}
 
-	EBUG_ON(bch2_bkey_format_validate(&ret));
+#ifdef CONFIG_BCACHEFS_DEBUG
+	{
+		struct printbuf buf = PRINTBUF;
+
+		BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+		printbuf_exit(&buf);
+	}
+#endif
 	return ret;
 }
 
-const char *bch2_bkey_format_validate(struct bkey_format *f)
+int bch2_bkey_format_invalid(struct bch_fs *c,
+			     struct bkey_format *f,
+			     enum bkey_invalid_flags flags,
+			     struct printbuf *err)
 {
 	unsigned i, bits = KEY_PACKED_BITS_START;
 
-	if (f->nr_fields != BKEY_NR_FIELDS)
-		return "incorrect number of fields";
+	if (f->nr_fields != BKEY_NR_FIELDS) {
+		prt_printf(err, "incorrect number of fields: got %u, should be %u",
+			   f->nr_fields, BKEY_NR_FIELDS);
+		return -BCH_ERR_invalid;
+	}
 
 	/*
 	 * Verify that the packed format can't represent fields larger than the
 	 * unpacked format:
 	 */
 	for (i = 0; i < f->nr_fields; i++) {
-		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-		u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-		u64 packed_max = f->bits_per_field[i]
-			? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
-			: 0;
-		u64 field_offset = le64_to_cpu(f->field_offset[i]);
+		if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+			unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+			u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+			u64 packed_max = f->bits_per_field[i]
+				? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+				: 0;
+			u64 field_offset = le64_to_cpu(f->field_offset[i]);
 
-		if (packed_max + field_offset < packed_max ||
-		    packed_max + field_offset > unpacked_max)
-			return "field too large";
+			if (packed_max + field_offset < packed_max ||
+			    packed_max + field_offset > unpacked_max) {
+				prt_printf(err, "field %u too large: %llu + %llu > %llu",
+					   i, packed_max, field_offset, unpacked_max);
+				return -BCH_ERR_invalid;
+			}
+		}
 
 		bits += f->bits_per_field[i];
 	}
 
-	if (f->key_u64s != DIV_ROUND_UP(bits, 64))
-		return "incorrect key_u64s";
+	if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+		prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+			   f->key_u64s, DIV_ROUND_UP(bits, 64));
+		return -BCH_ERR_invalid;
+	}
 
-	return NULL;
+	return 0;
 }
 
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+	prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+	for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+		if (i)
+			prt_str(out, ", ");
+		prt_printf(out, "%u:%llu",
+			   f->bits_per_field[i],
+			   le64_to_cpu(f->field_offset[i]));
+	}
+}
+
 /*
libbcachefs/bkey.h
@@ -9,6 +9,12 @@
 #include "util.h"
 #include "vstructs.h"
 
+enum bkey_invalid_flags {
+	BKEY_INVALID_WRITE		= (1U << 0),
+	BKEY_INVALID_COMMIT		= (1U << 1),
+	BKEY_INVALID_JOURNAL		= (1U << 2),
+};
+
 #if 0
 
 /*
@@ -769,6 +775,8 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s
 
 void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
 struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-const char *bch2_bkey_format_validate(struct bkey_format *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+			     enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
 
 #endif /* _BCACHEFS_BKEY_H */
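
A minimal caller-side sketch of the new helper, modeled on the debug check added to bch2_bkey_format_done() above; it assumes a (possibly NULL) struct bch_fs *c and a struct bkey_format f, and passes 0 for the flags as that check does:

	struct printbuf buf = PRINTBUF;

	if (bch2_bkey_format_invalid(c, &f, 0, &buf))
		/* on failure buf.buf holds the reason, e.g. "field 2 too large: ..." */
		bch_err(c, "invalid bkey format: %s", buf.buf);
	printbuf_exit(&buf);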
libbcachefs/bkey_methods.h
@@ -13,12 +13,6 @@ enum btree_node_type;
 extern const char * const bch2_bkey_types[];
 extern const struct bkey_ops bch2_bkey_null_ops;
 
-enum bkey_invalid_flags {
-	BKEY_INVALID_WRITE		= (1U << 0),
-	BKEY_INVALID_COMMIT		= (1U << 1),
-	BKEY_INVALID_JOURNAL		= (1U << 2),
-};
-
 /*
  * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
  * invalid, entire key will be deleted.
libbcachefs/btree_cache.c
@@ -1214,7 +1214,6 @@ wait_on_io:
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 			     const struct btree *b)
 {
-	const struct bkey_format *f = &b->format;
 	struct bset_stats stats;
 
 	memset(&stats, 0, sizeof(stats));
@@ -1228,9 +1227,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 	prt_printf(out, ":\n"
 		   " ptrs: ");
 	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+	prt_newline(out);
 
-	prt_printf(out, "\n"
-		   " format: u64s %u fields %u %u %u %u %u\n"
+	prt_printf(out,
+		   " format: ");
+	bch2_bkey_format_to_text(out, &b->format);
+
+	prt_printf(out,
 		   " unpack fn len: %u\n"
 		   " bytes used %zu/%zu (%zu%% full)\n"
 		   " sib u64s: %u, %u (merge threshold %u)\n"
@@ -1238,12 +1241,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 		   " nr unpacked keys %u\n"
 		   " floats %zu\n"
 		   " failed unpacked %zu\n",
-		   f->key_u64s,
-		   f->bits_per_field[0],
-		   f->bits_per_field[1],
-		   f->bits_per_field[2],
-		   f->bits_per_field[3],
-		   f->bits_per_field[4],
 		   b->unpack_fn_len,
 		   b->nr.live_u64s * sizeof(u64),
 		   btree_bytes(c) - sizeof(struct btree_node),
libbcachefs/btree_gc.c
@@ -9,6 +9,7 @@
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "bkey_buf.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
@@ -43,7 +44,7 @@
 static bool should_restart_for_topology_repair(struct bch_fs *c)
 {
 	return c->opts.fix_errors != FSCK_FIX_no &&
-		!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
+		!(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
 }
 
 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
libbcachefs/btree_gc.h
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_BTREE_GC_H
 #define _BCACHEFS_BTREE_GC_H
 
+#include "bkey.h"
 #include "btree_types.h"
 
 int bch2_check_topology(struct bch_fs *);
@ -17,6 +17,7 @@
|
||||
#include "io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "journal_seq_blacklist.h"
|
||||
#include "recovery.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
@ -543,31 +544,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
|
||||
prt_str(out, ": ");
|
||||
}
|
||||
|
||||
enum btree_err_type {
|
||||
/*
|
||||
* We can repair this locally, and we're after the checksum check so
|
||||
* there's no need to try another replica:
|
||||
*/
|
||||
BTREE_ERR_FIXABLE,
|
||||
/*
|
||||
* We can repair this if we have to, but we should try reading another
|
||||
* replica if we can:
|
||||
*/
|
||||
BTREE_ERR_WANT_RETRY,
|
||||
/*
|
||||
* Read another replica if we have one, otherwise consider the whole
|
||||
* node bad:
|
||||
*/
|
||||
BTREE_ERR_MUST_RETRY,
|
||||
BTREE_ERR_BAD_NODE,
|
||||
BTREE_ERR_INCOMPATIBLE,
|
||||
};
|
||||
|
||||
enum btree_validate_ret {
|
||||
BTREE_RETRY_READ = 64,
|
||||
};
|
||||
|
||||
static int __btree_err(enum btree_err_type type,
|
||||
static int __btree_err(int ret,
|
||||
struct bch_fs *c,
|
||||
struct bch_dev *ca,
|
||||
struct btree *b,
|
||||
@ -578,7 +555,6 @@ static int __btree_err(enum btree_err_type type,
|
||||
{
|
||||
struct printbuf out = PRINTBUF;
|
||||
va_list args;
|
||||
int ret = -BCH_ERR_fsck_fix;
|
||||
|
||||
btree_err_msg(&out, c, ca, b, i, b->written, write);
|
||||
|
||||
@ -594,27 +570,26 @@ static int __btree_err(enum btree_err_type type,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!have_retry && type == BTREE_ERR_WANT_RETRY)
|
||||
type = BTREE_ERR_FIXABLE;
|
||||
if (!have_retry && type == BTREE_ERR_MUST_RETRY)
|
||||
type = BTREE_ERR_BAD_NODE;
|
||||
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
|
||||
ret = -BCH_ERR_btree_node_read_err_fixable;
|
||||
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
|
||||
ret = -BCH_ERR_btree_node_read_err_bad_node;
|
||||
|
||||
switch (type) {
|
||||
case BTREE_ERR_FIXABLE:
|
||||
switch (ret) {
|
||||
case -BCH_ERR_btree_node_read_err_fixable:
|
||||
mustfix_fsck_err(c, "%s", out.buf);
|
||||
ret = -BCH_ERR_fsck_fix;
|
||||
break;
|
||||
case BTREE_ERR_WANT_RETRY:
|
||||
case BTREE_ERR_MUST_RETRY:
|
||||
case -BCH_ERR_btree_node_read_err_want_retry:
|
||||
case -BCH_ERR_btree_node_read_err_must_retry:
|
||||
bch2_print_string_as_lines(KERN_ERR, out.buf);
|
||||
ret = BTREE_RETRY_READ;
|
||||
break;
|
||||
case BTREE_ERR_BAD_NODE:
|
||||
case -BCH_ERR_btree_node_read_err_bad_node:
|
||||
bch2_print_string_as_lines(KERN_ERR, out.buf);
|
||||
bch2_topology_error(c);
|
||||
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
|
||||
break;
|
||||
case BTREE_ERR_INCOMPATIBLE:
|
||||
case -BCH_ERR_btree_node_read_err_incompatible:
|
||||
bch2_print_string_as_lines(KERN_ERR, out.buf);
|
||||
ret = -BCH_ERR_fsck_errors_not_fixed;
|
||||
break;
|
||||
@ -631,8 +606,11 @@ fsck_err:
|
||||
({ \
|
||||
int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
|
||||
\
|
||||
if (_ret != -BCH_ERR_fsck_fix) \
|
||||
if (_ret != -BCH_ERR_fsck_fix) { \
|
||||
ret = _ret; \
|
||||
goto fsck_err; \
|
||||
} \
|
||||
\
|
||||
*saw_error = true; \
|
||||
})
|
||||
|
||||
@ -696,19 +674,18 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
int write, bool have_retry, bool *saw_error)
|
||||
{
|
||||
unsigned version = le16_to_cpu(i->version);
|
||||
const char *err;
|
||||
struct printbuf buf1 = PRINTBUF;
|
||||
struct printbuf buf2 = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
btree_err_on(!bch2_version_compatible(version),
|
||||
BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
|
||||
"unsupported bset version %u.%u",
|
||||
BCH_VERSION_MAJOR(version),
|
||||
BCH_VERSION_MINOR(version));
|
||||
|
||||
if (btree_err_on(version < c->sb.version_min,
|
||||
BTREE_ERR_FIXABLE, c, NULL, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
|
||||
"bset version %u older than superblock version_min %u",
|
||||
version, c->sb.version_min)) {
|
||||
mutex_lock(&c->sb_lock);
|
||||
@ -719,7 +696,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
|
||||
if (btree_err_on(BCH_VERSION_MAJOR(version) >
|
||||
BCH_VERSION_MAJOR(c->sb.version),
|
||||
BTREE_ERR_FIXABLE, c, NULL, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
|
||||
"bset version %u newer than superblock version %u",
|
||||
version, c->sb.version)) {
|
||||
mutex_lock(&c->sb_lock);
|
||||
@ -729,11 +706,11 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
}
|
||||
|
||||
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
|
||||
BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
|
||||
"BSET_SEPARATE_WHITEOUTS no longer supported");
|
||||
|
||||
if (btree_err_on(offset + sectors > btree_sectors(c),
|
||||
BTREE_ERR_FIXABLE, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
|
||||
"bset past end of btree node")) {
|
||||
i->u64s = 0;
|
||||
ret = 0;
|
||||
@ -741,12 +718,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
}
|
||||
|
||||
btree_err_on(offset && !i->u64s,
|
||||
BTREE_ERR_FIXABLE, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
|
||||
"empty bset");
|
||||
|
||||
btree_err_on(BSET_OFFSET(i) &&
|
||||
BSET_OFFSET(i) != offset,
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
|
||||
"bset at wrong sector offset");
|
||||
|
||||
if (!offset) {
|
||||
@ -760,16 +737,16 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
|
||||
/* XXX endianness */
|
||||
btree_err_on(bp->seq != bn->keys.seq,
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
|
||||
"incorrect sequence number (wrong btree node)");
|
||||
}
|
||||
|
||||
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
|
||||
"incorrect btree id");
|
||||
|
||||
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
|
||||
"incorrect level");
|
||||
|
||||
if (!write)
|
||||
@ -786,7 +763,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
}
|
||||
|
||||
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
|
||||
"incorrect min_key: got %s should be %s",
|
||||
(printbuf_reset(&buf1),
|
||||
bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
|
||||
@ -795,7 +772,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
}
|
||||
|
||||
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
|
||||
"incorrect max key %s",
|
||||
(printbuf_reset(&buf1),
|
||||
bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
|
||||
@ -804,10 +781,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
||||
compat_btree_node(b->c.level, b->c.btree_id, version,
|
||||
BSET_BIG_ENDIAN(i), write, bn);
|
||||
|
||||
err = bch2_bkey_format_validate(&bn->format);
|
||||
btree_err_on(err,
|
||||
BTREE_ERR_BAD_NODE, c, ca, b, i,
|
||||
"invalid bkey format: %s", err);
|
||||
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
|
||||
-BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i,
|
||||
"invalid bkey format: %s\n %s", buf1.buf,
|
||||
(printbuf_reset(&buf2),
|
||||
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
|
||||
printbuf_reset(&buf1);
|
||||
|
||||
compat_bformat(b->c.level, b->c.btree_id, version,
|
||||
BSET_BIG_ENDIAN(i), write,
|
||||
@ -847,14 +826,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
|
||||
struct bkey tmp;
|
||||
|
||||
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
|
||||
BTREE_ERR_FIXABLE, c, NULL, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
|
||||
"key extends past end of bset")) {
|
||||
i->u64s = cpu_to_le16((u64 *) k - i->_data);
|
||||
break;
|
||||
}
|
||||
|
||||
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
|
||||
BTREE_ERR_FIXABLE, c, NULL, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
|
||||
"invalid bkey format %u", k->format)) {
|
||||
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
||||
memmove_u64s_down(k, bkey_p_next(k),
|
||||
@ -878,7 +857,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
|
||||
prt_printf(&buf, "\n ");
|
||||
bch2_bkey_val_to_text(&buf, c, u.s_c);
|
||||
|
||||
btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
|
||||
btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
|
||||
|
||||
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
||||
memmove_u64s_down(k, bkey_p_next(k),
|
||||
@ -902,7 +881,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
|
||||
|
||||
bch2_dump_bset(c, b, i, 0);
|
||||
|
||||
if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
|
||||
if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) {
|
||||
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
||||
memmove_u64s_down(k, bkey_p_next(k),
|
||||
(u64 *) vstruct_end(i) - (u64 *) k);
|
||||
@ -945,16 +924,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
iter->size = (btree_blocks(c) + 1) * 2;
|
||||
|
||||
if (bch2_meta_read_fault("btree"))
|
||||
btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
||||
btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
|
||||
"dynamic fault");
|
||||
|
||||
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
|
||||
"bad magic: want %llx, got %llx",
|
||||
bset_magic(c), le64_to_cpu(b->data->magic));
|
||||
|
||||
btree_err_on(!b->data->keys.seq,
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
|
||||
"bad btree header: seq 0");
|
||||
|
||||
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
|
||||
@ -962,7 +941,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
&bkey_i_to_btree_ptr_v2(&b->key)->v;
|
||||
|
||||
btree_err_on(b->data->keys.seq != bp->seq,
|
||||
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
|
||||
"got wrong btree node (seq %llx want %llx)",
|
||||
b->data->keys.seq, bp->seq);
|
||||
}
|
||||
@ -977,7 +956,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
i = &b->data->keys;
|
||||
|
||||
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
|
||||
"unknown checksum type %llu",
|
||||
BSET_CSUM_TYPE(i));
|
||||
|
||||
@ -985,7 +964,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
|
||||
|
||||
btree_err_on(bch2_crc_cmp(csum, b->data->csum),
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
|
||||
"invalid checksum");
|
||||
|
||||
ret = bset_encrypt(c, i, b->written << 9);
|
||||
@ -995,7 +974,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
|
||||
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
|
||||
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
|
||||
BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL,
|
||||
"btree node does not have NEW_EXTENT_OVERWRITE set");
|
||||
|
||||
sectors = vstruct_sectors(b->data, c->block_bits);
|
||||
@ -1007,7 +986,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
break;
|
||||
|
||||
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
|
||||
"unknown checksum type %llu",
|
||||
BSET_CSUM_TYPE(i));
|
||||
|
||||
@ -1015,7 +994,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
|
||||
|
||||
btree_err_on(bch2_crc_cmp(csum, bne->csum),
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
|
||||
"invalid checksum");
|
||||
|
||||
ret = bset_encrypt(c, i, b->written << 9);
|
||||
@ -1048,12 +1027,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
true);
|
||||
|
||||
btree_err_on(blacklisted && first,
|
||||
BTREE_ERR_FIXABLE, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
|
||||
"first btree node bset has blacklisted journal seq (%llu)",
|
||||
le64_to_cpu(i->journal_seq));
|
||||
|
||||
btree_err_on(blacklisted && ptr_written,
|
||||
BTREE_ERR_FIXABLE, c, ca, b, i,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
|
||||
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
|
||||
le64_to_cpu(i->journal_seq),
|
||||
b->written, b->written + sectors, ptr_written);
|
||||
@ -1072,7 +1051,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
|
||||
if (ptr_written) {
|
||||
btree_err_on(b->written < ptr_written,
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
|
||||
"btree node data missing: expected %u sectors, found %u",
|
||||
ptr_written, b->written);
|
||||
} else {
|
||||
@ -1083,7 +1062,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
!bch2_journal_seq_is_blacklisted(c,
|
||||
le64_to_cpu(bne->keys.journal_seq),
|
||||
true),
|
||||
BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
|
||||
"found bset signature after last bset");
|
||||
|
||||
/*
|
||||
@ -1137,7 +1116,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
||||
prt_printf(&buf, "\n ");
|
||||
bch2_bkey_val_to_text(&buf, c, u.s_c);
|
||||
|
||||
btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
|
||||
btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
|
||||
|
||||
btree_keys_account_key_drop(&b->nr, 0, k);
|
||||
|
||||
@ -1177,7 +1156,8 @@ out:
|
||||
printbuf_exit(&buf);
|
||||
return retry_read;
|
||||
fsck_err:
|
||||
if (ret == BTREE_RETRY_READ)
|
||||
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
|
||||
ret == -BCH_ERR_btree_node_read_err_must_retry)
|
||||
retry_read = 1;
|
||||
else
|
||||
set_btree_node_read_error(b);
|
||||
@ -1363,14 +1343,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
|
||||
}
|
||||
|
||||
written2 = btree_node_sectors_written(c, ra->buf[i]);
|
||||
if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
|
||||
if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
|
||||
"btree node sectors written mismatch: %u != %u",
|
||||
written, written2) ||
|
||||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
|
||||
BTREE_ERR_FIXABLE, c, NULL, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
|
||||
"found bset signature after last bset") ||
|
||||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
|
||||
BTREE_ERR_FIXABLE, c, NULL, b, NULL,
|
||||
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
|
||||
"btree node replicas content mismatch"))
|
||||
dump_bset_maps = true;
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "bkey_buf.h"
|
||||
#include "btree_cache.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "btree_key_cache.h"
|
||||
#include "btree_locking.h"
|
||||
#include "btree_update.h"
|
||||
@ -12,7 +13,6 @@
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
#include "journal.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
#include "subvolume.h"
|
||||
#include "trace.h"
|
||||
|
@ -221,6 +221,22 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo
|
||||
unsigned, unsigned, unsigned, unsigned long);
|
||||
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
|
||||
|
||||
/*
|
||||
* bch2_btree_path_peek_slot() for a cached iterator might return a key in a
|
||||
* different snapshot:
|
||||
*/
|
||||
static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
|
||||
{
|
||||
struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
|
||||
|
||||
if (k.k && bpos_eq(path->pos, k.k->p))
|
||||
return k;
|
||||
|
||||
bkey_init(u);
|
||||
u->p = path->pos;
|
||||
return (struct bkey_s_c) { u, NULL };
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
|
||||
struct btree_iter *, struct bpos);
|
||||
|
||||
|
libbcachefs/btree_journal_iter.c (new file, 531 lines)
@@ -0,0 +1,531 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "bset.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "journal_io.h"
|
||||
|
||||
#include <linux/sort.h>
|
||||
|
||||
/*
|
||||
* For managing keys we read from the journal: until journal replay works normal
|
||||
* btree lookups need to be able to find and return keys from the journal where
|
||||
* they overwrite what's in the btree, so we have a special iterator and
|
||||
* operations for the regular btree iter code to use:
|
||||
*/
|
||||
|
||||
static int __journal_key_cmp(enum btree_id l_btree_id,
|
||||
unsigned l_level,
|
||||
struct bpos l_pos,
|
||||
const struct journal_key *r)
|
||||
{
|
||||
return (cmp_int(l_btree_id, r->btree_id) ?:
|
||||
cmp_int(l_level, r->level) ?:
|
||||
bpos_cmp(l_pos, r->k->k.p));
|
||||
}
|
||||
|
||||
static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
|
||||
{
|
||||
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
|
||||
}
|
||||
|
||||
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
|
||||
{
|
||||
size_t gap_size = keys->size - keys->nr;
|
||||
|
||||
if (idx >= keys->gap)
|
||||
idx += gap_size;
|
||||
return idx;
|
||||
}
|
||||
|
||||
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
|
||||
{
|
||||
return keys->d + idx_to_pos(keys, idx);
|
||||
}
|
||||
|
||||
static size_t __bch2_journal_key_search(struct journal_keys *keys,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
size_t l = 0, r = keys->nr, m;
|
||||
|
||||
while (l < r) {
|
||||
m = l + ((r - l) >> 1);
|
||||
if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
|
||||
l = m + 1;
|
||||
else
|
||||
r = m;
|
||||
}
|
||||
|
||||
BUG_ON(l < keys->nr &&
|
||||
__journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
|
||||
|
||||
BUG_ON(l &&
|
||||
__journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
static size_t bch2_journal_key_search(struct journal_keys *keys,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
|
||||
unsigned level, struct bpos pos,
|
||||
struct bpos end_pos, size_t *idx)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
unsigned iters = 0;
|
||||
struct journal_key *k;
|
||||
search:
|
||||
if (!*idx)
|
||||
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
|
||||
|
||||
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
||||
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
|
||||
return NULL;
|
||||
|
||||
if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
|
||||
!k->overwritten)
|
||||
return k->k;
|
||||
|
||||
(*idx)++;
|
||||
iters++;
|
||||
if (iters == 10) {
|
||||
*idx = 0;
|
||||
goto search;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
size_t idx = 0;
|
||||
|
||||
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
|
||||
}
|
||||
|
||||
static void journal_iters_fix(struct bch_fs *c)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
/* The key we just inserted is immediately before the gap: */
|
||||
size_t gap_end = keys->gap + (keys->size - keys->nr);
|
||||
struct btree_and_journal_iter *iter;
|
||||
|
||||
/*
|
||||
* If an iterator points one after the key we just inserted, decrement
|
||||
* the iterator so it points at the key we just inserted - if the
|
||||
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
|
||||
* handle that:
|
||||
*/
|
||||
list_for_each_entry(iter, &c->journal_iters, journal.list)
|
||||
if (iter->journal.idx == gap_end)
|
||||
iter->journal.idx = keys->gap - 1;
|
||||
}
|
||||
|
||||
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
struct journal_iter *iter;
|
||||
size_t gap_size = keys->size - keys->nr;
|
||||
|
||||
list_for_each_entry(iter, &c->journal_iters, list) {
|
||||
if (iter->idx > old_gap)
|
||||
iter->idx -= gap_size;
|
||||
if (iter->idx >= new_gap)
|
||||
iter->idx += gap_size;
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
|
||||
unsigned level, struct bkey_i *k)
|
||||
{
|
||||
struct journal_key n = {
|
||||
.btree_id = id,
|
||||
.level = level,
|
||||
.k = k,
|
||||
.allocated = true,
|
||||
/*
|
||||
* Ensure these keys are done last by journal replay, to unblock
|
||||
* journal reclaim:
|
||||
*/
|
||||
.journal_seq = U32_MAX,
|
||||
};
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
|
||||
|
||||
BUG_ON(test_bit(BCH_FS_RW, &c->flags));
|
||||
|
||||
if (idx < keys->size &&
|
||||
journal_key_cmp(&n, &keys->d[idx]) == 0) {
|
||||
if (keys->d[idx].allocated)
|
||||
kfree(keys->d[idx].k);
|
||||
keys->d[idx] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (idx > keys->gap)
|
||||
idx -= keys->size - keys->nr;
|
||||
|
||||
if (keys->nr == keys->size) {
|
||||
struct journal_keys new_keys = {
|
||||
.nr = keys->nr,
|
||||
.size = max_t(size_t, keys->size, 8) * 2,
|
||||
};
|
||||
|
||||
new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
|
||||
if (!new_keys.d) {
|
||||
bch_err(c, "%s: error allocating new key array (size %zu)",
|
||||
__func__, new_keys.size);
|
||||
return -BCH_ERR_ENOMEM_journal_key_insert;
|
||||
}
|
||||
|
||||
/* Since @keys was full, there was no gap: */
|
||||
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
|
||||
kvfree(keys->d);
|
||||
*keys = new_keys;
|
||||
|
||||
/* And now the gap is at the end: */
|
||||
keys->gap = keys->nr;
|
||||
}
|
||||
|
||||
journal_iters_move_gap(c, keys->gap, idx);
|
||||
|
||||
move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
|
||||
keys->gap = idx;
|
||||
|
||||
keys->nr++;
|
||||
keys->d[keys->gap++] = n;
|
||||
|
||||
journal_iters_fix(c);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Can only be used from the recovery thread while we're still RO - can't be
|
||||
* used once we've got RW, as journal_keys is at that point used by multiple
|
||||
* threads:
|
||||
*/
|
||||
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
|
||||
unsigned level, struct bkey_i *k)
|
||||
{
|
||||
struct bkey_i *n;
|
||||
int ret;
|
||||
|
||||
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
|
||||
if (!n)
|
||||
return -BCH_ERR_ENOMEM_journal_key_insert;
|
||||
|
||||
bkey_copy(n, k);
|
||||
ret = bch2_journal_key_insert_take(c, id, level, n);
|
||||
if (ret)
|
||||
kfree(n);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
struct bkey_i whiteout;
|
||||
|
||||
bkey_init(&whiteout.k);
|
||||
whiteout.k.p = pos;
|
||||
|
||||
return bch2_journal_key_insert(c, id, level, &whiteout);
|
||||
}
|
||||
|
||||
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
|
||||
|
||||
if (idx < keys->size &&
|
||||
keys->d[idx].btree_id == btree &&
|
||||
keys->d[idx].level == level &&
|
||||
bpos_eq(keys->d[idx].k->k.p, pos))
|
||||
keys->d[idx].overwritten = true;
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_advance(struct journal_iter *iter)
|
||||
{
|
||||
if (iter->idx < iter->keys->size) {
|
||||
iter->idx++;
|
||||
if (iter->idx == iter->keys->gap)
|
||||
iter->idx += iter->keys->size - iter->keys->nr;
|
||||
}
|
||||
}
|
||||
|
||||
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
|
||||
{
|
||||
struct journal_key *k = iter->keys->d + iter->idx;
|
||||
|
||||
while (k < iter->keys->d + iter->keys->size &&
|
||||
k->btree_id == iter->btree_id &&
|
||||
k->level == iter->level) {
|
||||
if (!k->overwritten)
|
||||
return bkey_i_to_s_c(k->k);
|
||||
|
||||
bch2_journal_iter_advance(iter);
|
||||
k = iter->keys->d + iter->idx;
|
||||
}
|
||||
|
||||
return bkey_s_c_null;
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_exit(struct journal_iter *iter)
|
||||
{
|
||||
list_del(&iter->list);
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_init(struct bch_fs *c,
|
||||
struct journal_iter *iter,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
iter->btree_id = id;
|
||||
iter->level = level;
|
||||
iter->keys = &c->journal_keys;
|
||||
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
|
||||
}
|
||||
|
||||
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
|
||||
iter->b, &iter->unpacked);
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
if (bpos_eq(iter->pos, SPOS_MAX))
|
||||
iter->at_end = true;
|
||||
else
|
||||
iter->pos = bpos_successor(iter->pos);
|
||||
}
|
||||
|
||||
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
struct bkey_s_c btree_k, journal_k, ret;
|
||||
again:
|
||||
if (iter->at_end)
|
||||
return bkey_s_c_null;
|
||||
|
||||
while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
|
||||
bpos_lt(btree_k.k->p, iter->pos))
|
||||
bch2_journal_iter_advance_btree(iter);
|
||||
|
||||
while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
|
||||
bpos_lt(journal_k.k->p, iter->pos))
|
||||
bch2_journal_iter_advance(&iter->journal);
|
||||
|
||||
ret = journal_k.k &&
|
||||
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
|
||||
? journal_k
|
||||
: btree_k;
|
||||
|
||||
if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
|
||||
ret = bkey_s_c_null;
|
||||
|
||||
if (ret.k) {
|
||||
iter->pos = ret.k->p;
|
||||
if (bkey_deleted(ret.k)) {
|
||||
bch2_btree_and_journal_iter_advance(iter);
|
||||
goto again;
|
||||
}
|
||||
} else {
|
||||
iter->pos = SPOS_MAX;
|
||||
iter->at_end = true;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
bch2_journal_iter_exit(&iter->journal);
|
||||
}
|
||||
|
||||
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
|
||||
struct bch_fs *c,
|
||||
struct btree *b,
|
||||
struct btree_node_iter node_iter,
|
||||
struct bpos pos)
|
||||
{
|
||||
memset(iter, 0, sizeof(*iter));
|
||||
|
||||
iter->b = b;
|
||||
iter->node_iter = node_iter;
|
||||
bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
|
||||
INIT_LIST_HEAD(&iter->journal.list);
|
||||
iter->pos = b->data->min_key;
|
||||
iter->at_end = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* this version is used by btree_gc before filesystem has gone RW and
|
||||
* multithreaded, so uses the journal_iters list:
|
||||
*/
|
||||
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
|
||||
struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
struct btree_node_iter node_iter;
|
||||
|
||||
bch2_btree_node_iter_init_from_start(&node_iter, b);
|
||||
__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
|
||||
list_add(&iter->journal.list, &c->journal_iters);
|
||||
}
|
||||
|
||||
/* sort and dedup all keys in the journal: */
|
||||
|
||||
void bch2_journal_entries_free(struct bch_fs *c)
|
||||
{
|
||||
struct journal_replay **i;
|
||||
struct genradix_iter iter;
|
||||
|
||||
genradix_for_each(&c->journal_entries, iter, i)
|
||||
if (*i)
|
||||
kvpfree(*i, offsetof(struct journal_replay, j) +
|
||||
vstruct_bytes(&(*i)->j));
|
||||
genradix_free(&c->journal_entries);
|
||||
}
|
||||
|
||||
/*
|
||||
* When keys compare equal, oldest compares first:
|
||||
*/
|
||||
static int journal_sort_key_cmp(const void *_l, const void *_r)
|
||||
{
|
||||
const struct journal_key *l = _l;
|
||||
const struct journal_key *r = _r;
|
||||
|
||||
return journal_key_cmp(l, r) ?:
|
||||
cmp_int(l->journal_seq, r->journal_seq) ?:
|
||||
cmp_int(l->journal_offset, r->journal_offset);
|
||||
}
|
||||
|
||||
void bch2_journal_keys_free(struct journal_keys *keys)
|
||||
{
|
||||
struct journal_key *i;
|
||||
|
||||
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
|
||||
keys->gap = keys->nr;
|
||||
|
||||
for (i = keys->d; i < keys->d + keys->nr; i++)
|
||||
if (i->allocated)
|
||||
kfree(i->k);
|
||||
|
||||
kvfree(keys->d);
|
||||
keys->d = NULL;
|
||||
keys->nr = keys->gap = keys->size = 0;
|
||||
}
|
||||
|
||||
static void __journal_keys_sort(struct journal_keys *keys)
|
||||
{
|
||||
struct journal_key *src, *dst;
|
||||
|
||||
sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
|
||||
|
||||
src = dst = keys->d;
|
||||
while (src < keys->d + keys->nr) {
|
||||
while (src + 1 < keys->d + keys->nr &&
|
||||
src[0].btree_id == src[1].btree_id &&
|
||||
src[0].level == src[1].level &&
|
||||
bpos_eq(src[0].k->k.p, src[1].k->k.p))
|
||||
src++;
|
||||
|
||||
*dst++ = *src++;
|
||||
}
|
||||
|
||||
keys->nr = dst - keys->d;
|
||||
}
|
||||
|
||||
int bch2_journal_keys_sort(struct bch_fs *c)
|
||||
{
|
||||
struct genradix_iter iter;
|
||||
struct journal_replay *i, **_i;
|
||||
struct jset_entry *entry;
|
||||
struct bkey_i *k;
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
size_t nr_keys = 0, nr_read = 0;
|
||||
|
||||
genradix_for_each(&c->journal_entries, iter, _i) {
|
||||
i = *_i;
|
||||
|
||||
if (!i || i->ignore)
|
||||
continue;
|
||||
|
||||
for_each_jset_key(k, entry, &i->j)
|
||||
nr_keys++;
|
||||
}
|
||||
|
||||
if (!nr_keys)
|
||||
return 0;
|
||||
|
||||
keys->size = roundup_pow_of_two(nr_keys);
|
||||
|
||||
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
|
||||
if (!keys->d) {
|
||||
bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
|
||||
nr_keys);
|
||||
|
||||
do {
|
||||
keys->size >>= 1;
|
||||
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
|
||||
} while (!keys->d && keys->size > nr_keys / 8);
|
||||
|
||||
if (!keys->d) {
|
||||
bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
|
||||
keys->size);
|
||||
return -BCH_ERR_ENOMEM_journal_keys_sort;
|
||||
}
|
||||
}
|
||||
|
||||
genradix_for_each(&c->journal_entries, iter, _i) {
|
||||
i = *_i;
|
||||
|
||||
if (!i || i->ignore)
|
||||
continue;
|
||||
|
||||
cond_resched();
|
||||
|
||||
for_each_jset_key(k, entry, &i->j) {
|
||||
if (keys->nr == keys->size) {
|
||||
__journal_keys_sort(keys);
|
||||
|
||||
if (keys->nr > keys->size * 7 / 8) {
|
||||
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
|
||||
keys->nr, keys->size, nr_read, nr_keys);
|
||||
return -BCH_ERR_ENOMEM_journal_keys_sort;
|
||||
}
|
||||
}
|
||||
|
||||
keys->d[keys->nr++] = (struct journal_key) {
|
||||
.btree_id = entry->btree_id,
|
||||
.level = entry->level,
|
||||
.k = k,
|
||||
.journal_seq = le64_to_cpu(i->j.seq),
|
||||
.journal_offset = k->_data - i->j._data,
|
||||
};
|
||||
|
||||
nr_read++;
|
||||
}
|
||||
}
|
||||
|
||||
__journal_keys_sort(keys);
|
||||
keys->gap = keys->nr;
|
||||
|
||||
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
|
||||
return 0;
|
||||
}
|
libbcachefs/btree_journal_iter.h (new file, 57 lines)
@@ -0,0 +1,57 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_H

struct journal_iter {
	struct list_head	list;
	enum btree_id		btree_id;
	unsigned		level;
	size_t			idx;
	struct journal_keys	*keys;
};

/*
 * Iterate over keys in the btree, with keys from the journal overlaid on top:
 */

struct btree_and_journal_iter {
	struct btree		*b;
	struct btree_node_iter	node_iter;
	struct bkey		unpacked;

	struct journal_iter	journal;
	struct bpos		pos;
	bool			at_end;
};

struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
				unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
					   unsigned, struct bpos);

int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
				 unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
			    unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
			    unsigned, struct bpos);
void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
				  unsigned, struct bpos);

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
				struct bch_fs *, struct btree *,
				struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
						struct bch_fs *,
						struct btree *);

void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct bch_fs *);

int bch2_journal_keys_sort(struct bch_fs *);

#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
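
A rough usage sketch for the iterator this header exposes, following the declarations above and the way btree_gc walks nodes before the filesystem goes RW: initialize over a btree node, peek/advance through the node's keys with journal keys overlaid, then tear down. The loop body is illustrative only; c and b are assumed to be a struct bch_fs * and a struct btree *:

	struct btree_and_journal_iter iter;
	struct bkey_s_c k;

	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		/* inspect k: journal keys here override what is in the node */
		bch2_btree_and_journal_iter_advance(&iter);
	}

	bch2_btree_and_journal_iter_exit(&iter);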
@ -1,45 +1,22 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_io.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "btree_key_cache.h"
|
||||
#include "btree_locking.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_write_buffer.h"
|
||||
#include "buckets.h"
|
||||
#include "debug.h"
|
||||
#include "errcode.h"
|
||||
#include "error.h"
|
||||
#include "extent_update.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "recovery.h"
|
||||
#include "subvolume.h"
|
||||
#include "replicas.h"
|
||||
#include "trace.h"
|
||||
#include "subvolume.h"
|
||||
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/sort.h>
|
||||
|
||||
/*
|
||||
* bch2_btree_path_peek_slot() for a cached iterator might return a key in a
|
||||
* different snapshot:
|
||||
*/
|
||||
static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
|
||||
{
|
||||
struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
|
||||
|
||||
if (k.k && bpos_eq(path->pos, k.k->p))
|
||||
return k;
|
||||
|
||||
bkey_init(u);
|
||||
u->p = path->pos;
|
||||
return (struct bkey_s_c) { u, NULL };
|
||||
}
|
||||
|
||||
static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
|
||||
{
|
||||
@ -64,20 +41,6 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
|
||||
#endif
|
||||
}
|
||||
|
||||
static int __must_check
|
||||
bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
|
||||
struct bkey_i *, enum btree_update_flags,
|
||||
unsigned long ip);
|
||||
|
||||
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
|
||||
const struct btree_insert_entry *r)
|
||||
{
|
||||
return cmp_int(l->btree_id, r->btree_id) ?:
|
||||
cmp_int(l->cached, r->cached) ?:
|
||||
-cmp_int(l->level, r->level) ?:
|
||||
bpos_cmp(l->k->k.p, r->k->k.p);
|
||||
}
|
||||
|
||||
static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
|
||||
{
|
||||
return i->path->l + i->level;
|
||||
@ -1191,917 +1154,3 @@ err:
|
||||
|
||||
goto retry;
|
||||
}
|
||||
|
||||
static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
|
||||
enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, id, pos,
|
||||
BTREE_ITER_NOT_EXTENTS|
|
||||
BTREE_ITER_ALL_SNAPSHOTS);
|
||||
while (1) {
|
||||
k = bch2_btree_iter_prev(&iter);
|
||||
ret = bkey_err(k);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (!k.k)
|
||||
break;
|
||||
|
||||
if (!bkey_eq(pos, k.k->p))
|
||||
break;
|
||||
|
||||
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
|
||||
enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
if (!btree_type_has_snapshots(id) ||
|
||||
bch2_snapshot_is_leaf(trans->c, pos.snapshot))
|
||||
return 0;
|
||||
|
||||
return __check_pos_snapshot_overwritten(trans, id, pos);
|
||||
}
|
||||
|
||||
943
libbcachefs/btree_update.c
Normal file
@ -0,0 +1,943 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "btree_locking.h"
|
||||
#include "buckets.h"
|
||||
#include "debug.h"
|
||||
#include "errcode.h"
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
#include "keylist.h"
|
||||
#include "subvolume.h"
|
||||
#include "trace.h"
|
||||
|
||||
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
|
||||
const struct btree_insert_entry *r)
|
||||
{
|
||||
return cmp_int(l->btree_id, r->btree_id) ?:
|
||||
cmp_int(l->cached, r->cached) ?:
|
||||
-cmp_int(l->level, r->level) ?:
|
||||
bpos_cmp(l->k->k.p, r->k->k.p);
|
||||
}
|
||||
|
||||
static int __must_check
|
||||
bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
|
||||
struct bkey_i *, enum btree_update_flags,
|
||||
unsigned long ip);
|
||||
|
||||
static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
|
||||
enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, id, pos,
|
||||
BTREE_ITER_NOT_EXTENTS|
|
||||
BTREE_ITER_ALL_SNAPSHOTS);
|
||||
while (1) {
|
||||
k = bch2_btree_iter_prev(&iter);
|
||||
ret = bkey_err(k);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (!k.k)
|
||||
break;
|
||||
|
||||
if (!bkey_eq(pos, k.k->p))
|
||||
break;
|
||||
|
||||
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
|
||||
enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
if (!btree_type_has_snapshots(id) ||
|
||||
bch2_snapshot_is_leaf(trans->c, pos.snapshot))
|
||||
return 0;
|
||||
|
||||
return __check_pos_snapshot_overwritten(trans, id, pos);
|
||||
}
|
||||
|
||||
static noinline int extent_front_merge(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k,
|
||||
struct bkey_i **insert,
|
||||
enum btree_update_flags flags)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_i *update;
|
||||
int ret;
|
||||
|
||||
update = bch2_bkey_make_mut_noupdate(trans, k);
|
||||
ret = PTR_ERR_OR_ZERO(update);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
|
||||
return 0;
|
||||
|
||||
ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
|
||||
check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret)
|
||||
return 0;
|
||||
|
||||
ret = bch2_btree_delete_at(trans, iter, flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
*insert = update;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static noinline int extent_back_merge(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_i *insert,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
int ret;
|
||||
|
||||
ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
|
||||
check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret)
|
||||
return 0;
|
||||
|
||||
bch2_bkey_merge(c, bkey_i_to_s(insert), k);
|
||||
return 0;
|
||||
}
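Both merge helpers chain their snapshot checks with the GNU `?:` (elvis) operator: the right-hand call only runs when the left-hand one returned 0, and the first non-zero value (an error or a "yes, overwritten" result) short-circuits the rest. A minimal standalone sketch of that idiom, with hypothetical step1/step2 helpers (requires gcc or clang, since `?:` without a middle operand is a GNU extension):

#include <stdio.h>

static int step1(void) { return 0; }   /* 0 = success */
static int step2(void) { return -22; } /* fails with -EINVAL */

int main(void)
{
	/* a ?: b evaluates b only when a is zero */
	int ret = step1() ?: step2();

	printf("ret = %d\n", ret); /* prints -22: step1 succeeded, step2 did not */
	return 0;
}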
|
||||
|
||||
/*
|
||||
* When deleting, check if we need to emit a whiteout (because we're overwriting
|
||||
* something in an ancestor snapshot)
|
||||
*/
|
||||
static int need_whiteout_for_snapshot(struct btree_trans *trans,
|
||||
enum btree_id btree_id, struct bpos pos)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
u32 snapshot = pos.snapshot;
|
||||
int ret;
|
||||
|
||||
if (!bch2_snapshot_parent(trans->c, pos.snapshot))
|
||||
return 0;
|
||||
|
||||
pos.snapshot++;
|
||||
|
||||
for_each_btree_key_norestart(trans, iter, btree_id, pos,
|
||||
BTREE_ITER_ALL_SNAPSHOTS|
|
||||
BTREE_ITER_NOPRESERVE, k, ret) {
|
||||
if (!bkey_eq(k.k->p, pos))
|
||||
break;
|
||||
|
||||
if (bch2_snapshot_is_ancestor(trans->c, snapshot,
|
||||
k.k->p.snapshot)) {
|
||||
ret = !bkey_whiteout(k.k);
|
||||
break;
|
||||
}
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
return ret;
|
||||
}
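The check above only forces a whiteout when a live (non-whiteout) key still exists in an ancestor snapshot, because that key would otherwise become visible again after the delete. A toy standalone model of that ancestry walk, with snapshots as a parent[] table; this is an illustration only, not the real bch2_snapshot_is_ancestor, which uses precomputed ancestry data:

#include <stdbool.h>
#include <stdio.h>

/* parent[id] == 0 means "no parent" (root snapshot) */
static const unsigned parent[] = { 0, 0, 1, 1, 2 };

static bool is_ancestor(unsigned id, unsigned ancestor)
{
	/* walk up from id until we reach ancestor or fall off the root */
	while (id && id != ancestor)
		id = parent[id];
	return id == ancestor;
}

int main(void)
{
	/* deleting in snapshot 4: a live key in snapshot 1 (an ancestor)
	 * would still be visible there, so a whiteout is needed */
	printf("4 descends from 1: %d\n", is_ancestor(4, 1)); /* 1 */
	printf("4 descends from 3: %d\n", is_ancestor(4, 3)); /* 0 */
	return 0;
}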
|
||||
|
||||
int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
|
||||
enum btree_id id,
|
||||
struct bpos old_pos,
|
||||
struct bpos new_pos)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter old_iter, new_iter = { NULL };
|
||||
struct bkey_s_c old_k, new_k;
|
||||
snapshot_id_list s;
|
||||
struct bkey_i *update;
|
||||
int ret;
|
||||
|
||||
if (!bch2_snapshot_has_children(c, old_pos.snapshot))
|
||||
return 0;
|
||||
|
||||
darray_init(&s);
|
||||
|
||||
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
|
||||
BTREE_ITER_NOT_EXTENTS|
|
||||
BTREE_ITER_ALL_SNAPSHOTS);
|
||||
while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
|
||||
!(ret = bkey_err(old_k)) &&
|
||||
bkey_eq(old_pos, old_k.k->p)) {
|
||||
struct bpos whiteout_pos =
|
||||
SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
|
||||
|
||||
if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
|
||||
snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
|
||||
continue;
|
||||
|
||||
new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
|
||||
BTREE_ITER_NOT_EXTENTS|
|
||||
BTREE_ITER_INTENT);
|
||||
ret = bkey_err(new_k);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (new_k.k->type == KEY_TYPE_deleted) {
|
||||
update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
|
||||
ret = PTR_ERR_OR_ZERO(update);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
bkey_init(&update->k);
|
||||
update->k.p = whiteout_pos;
|
||||
update->k.type = KEY_TYPE_whiteout;
|
||||
|
||||
ret = bch2_trans_update(trans, &new_iter, update,
|
||||
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &new_iter);
|
||||
|
||||
ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &new_iter);
|
||||
bch2_trans_iter_exit(trans, &old_iter);
|
||||
darray_exit(&s);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
enum btree_update_flags flags,
|
||||
struct bkey_s_c old,
|
||||
struct bkey_s_c new)
|
||||
{
|
||||
enum btree_id btree_id = iter->btree_id;
|
||||
struct bkey_i *update;
|
||||
struct bpos new_start = bkey_start_pos(new.k);
|
||||
bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
|
||||
bool back_split = bkey_gt(old.k->p, new.k->p);
|
||||
int ret = 0, compressed_sectors;
|
||||
|
||||
/*
|
||||
* If we're going to be splitting a compressed extent, note it
|
||||
* so that __bch2_trans_commit() can increase our disk
|
||||
* reservation:
|
||||
*/
|
||||
if (((front_split && back_split) ||
|
||||
((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
|
||||
(compressed_sectors = bch2_bkey_sectors_compressed(old)))
|
||||
trans->extra_journal_res += compressed_sectors;
|
||||
|
||||
if (front_split) {
|
||||
update = bch2_bkey_make_mut_noupdate(trans, old);
|
||||
if ((ret = PTR_ERR_OR_ZERO(update)))
|
||||
return ret;
|
||||
|
||||
bch2_cut_back(new_start, update);
|
||||
|
||||
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
|
||||
old.k->p, update->k.p) ?:
|
||||
bch2_btree_insert_nonextent(trans, btree_id, update,
|
||||
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If we're overwriting in a different snapshot - middle split: */
|
||||
if (old.k->p.snapshot != new.k->p.snapshot &&
|
||||
(front_split || back_split)) {
|
||||
update = bch2_bkey_make_mut_noupdate(trans, old);
|
||||
if ((ret = PTR_ERR_OR_ZERO(update)))
|
||||
return ret;
|
||||
|
||||
bch2_cut_front(new_start, update);
|
||||
bch2_cut_back(new.k->p, update);
|
||||
|
||||
ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
|
||||
old.k->p, update->k.p) ?:
|
||||
bch2_btree_insert_nonextent(trans, btree_id, update,
|
||||
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (bkey_le(old.k->p, new.k->p)) {
|
||||
update = bch2_trans_kmalloc(trans, sizeof(*update));
|
||||
if ((ret = PTR_ERR_OR_ZERO(update)))
|
||||
return ret;
|
||||
|
||||
bkey_init(&update->k);
|
||||
update->k.p = old.k->p;
|
||||
update->k.p.snapshot = new.k->p.snapshot;
|
||||
|
||||
if (new.k->p.snapshot != old.k->p.snapshot) {
|
||||
update->k.type = KEY_TYPE_whiteout;
|
||||
} else if (btree_type_has_snapshots(btree_id)) {
|
||||
ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret)
|
||||
update->k.type = KEY_TYPE_whiteout;
|
||||
}
|
||||
|
||||
ret = bch2_btree_insert_nonextent(trans, btree_id, update,
|
||||
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (back_split) {
|
||||
update = bch2_bkey_make_mut_noupdate(trans, old);
|
||||
if ((ret = PTR_ERR_OR_ZERO(update)))
|
||||
return ret;
|
||||
|
||||
bch2_cut_front(new.k->p, update);
|
||||
|
||||
ret = bch2_trans_update_by_path(trans, iter->path, update,
|
||||
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
|
||||
flags, _RET_IP_);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
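The overwrite path above can carve the old extent into up to three pieces: a front split, a back split, and (when snapshots differ) a middle whiteout. The same range arithmetic on plain integer offsets, as a standalone sketch; real extents compare struct bpos and snapshot IDs, and an extent key's position is its end:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; }; /* half-open [start, end) */

static void classify(struct range old, struct range new)
{
	bool front_split = old.start < new.start; /* old data left of the insert  */
	bool back_split  = old.end   > new.end;   /* old data right of the insert */

	printf("old [%lu,%lu) vs new [%lu,%lu): front=%d back=%d\n",
	       old.start, old.end, new.start, new.end, front_split, back_split);
}

int main(void)
{
	classify((struct range){ 0, 100 },  (struct range){ 20, 80 });  /* both splits */
	classify((struct range){ 0, 100 },  (struct range){ 0, 80 });   /* back only   */
	classify((struct range){ 50, 100 }, (struct range){ 0, 100 });  /* neither     */
	return 0;
}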
|
||||
|
||||
static int bch2_trans_update_extent(struct btree_trans *trans,
|
||||
struct btree_iter *orig_iter,
|
||||
struct bkey_i *insert,
|
||||
enum btree_update_flags flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
enum btree_id btree_id = orig_iter->btree_id;
|
||||
int ret = 0;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
|
||||
BTREE_ITER_INTENT|
|
||||
BTREE_ITER_WITH_UPDATES|
|
||||
BTREE_ITER_NOT_EXTENTS);
|
||||
k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
|
||||
if ((ret = bkey_err(k)))
|
||||
goto err;
|
||||
if (!k.k)
|
||||
goto out;
|
||||
|
||||
if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
|
||||
if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
|
||||
ret = extent_front_merge(trans, &iter, k, &insert, flags);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
goto next;
|
||||
}
|
||||
|
||||
while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
|
||||
bool done = bkey_lt(insert->k.p, k.k->p);
|
||||
|
||||
ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (done)
|
||||
goto out;
|
||||
next:
|
||||
bch2_btree_iter_advance(&iter);
|
||||
k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
|
||||
if ((ret = bkey_err(k)))
|
||||
goto err;
|
||||
if (!k.k)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
|
||||
ret = extent_back_merge(trans, &iter, insert, k);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
out:
|
||||
if (!bkey_deleted(&insert->k))
|
||||
ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static noinline int flush_new_cached_update(struct btree_trans *trans,
|
||||
struct btree_path *path,
|
||||
struct btree_insert_entry *i,
|
||||
enum btree_update_flags flags,
|
||||
unsigned long ip)
|
||||
{
|
||||
struct btree_path *btree_path;
|
||||
struct bkey k;
|
||||
int ret;
|
||||
|
||||
btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
|
||||
BTREE_ITER_INTENT, _THIS_IP_);
|
||||
ret = bch2_btree_path_traverse(trans, btree_path, 0);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* The old key in the insert entry might actually refer to an existing
|
||||
* key in the btree that has been deleted from cache and not yet
|
||||
* flushed. Check for this and skip the flush so we don't run triggers
|
||||
* against a stale key.
|
||||
*/
|
||||
bch2_btree_path_peek_slot_exact(btree_path, &k);
|
||||
if (!bkey_deleted(&k))
|
||||
goto out;
|
||||
|
||||
i->key_cache_already_flushed = true;
|
||||
i->flags |= BTREE_TRIGGER_NORUN;
|
||||
|
||||
btree_path_set_should_be_locked(btree_path);
|
||||
ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
|
||||
out:
|
||||
bch2_path_put(trans, btree_path, true);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __must_check
|
||||
bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
|
||||
struct bkey_i *k, enum btree_update_flags flags,
|
||||
unsigned long ip)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_insert_entry *i, n;
|
||||
u64 seq = 0;
|
||||
int cmp;
|
||||
|
||||
EBUG_ON(!path->should_be_locked);
|
||||
EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
|
||||
EBUG_ON(!bpos_eq(k->k.p, path->pos));
|
||||
|
||||
/*
|
||||
* The transaction journal res hasn't been allocated at this point.
|
||||
* That occurs at commit time. Reuse the seq field to pass in the seq
|
||||
* of a prejournaled key.
|
||||
*/
|
||||
if (flags & BTREE_UPDATE_PREJOURNAL)
|
||||
seq = trans->journal_res.seq;
|
||||
|
||||
n = (struct btree_insert_entry) {
|
||||
.flags = flags,
|
||||
.bkey_type = __btree_node_type(path->level, path->btree_id),
|
||||
.btree_id = path->btree_id,
|
||||
.level = path->level,
|
||||
.cached = path->cached,
|
||||
.path = path,
|
||||
.k = k,
|
||||
.seq = seq,
|
||||
.ip_allocated = ip,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
trans_for_each_update(trans, i)
|
||||
BUG_ON(i != trans->updates &&
|
||||
btree_insert_entry_cmp(i - 1, i) >= 0);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Pending updates are kept sorted: first, find position of new update,
|
||||
* then delete/trim any updates the new update overwrites:
|
||||
*/
|
||||
trans_for_each_update(trans, i) {
|
||||
cmp = btree_insert_entry_cmp(&n, i);
|
||||
if (cmp <= 0)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!cmp && i < trans->updates + trans->nr_updates) {
|
||||
EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
|
||||
|
||||
bch2_path_put(trans, i->path, true);
|
||||
i->flags = n.flags;
|
||||
i->cached = n.cached;
|
||||
i->k = n.k;
|
||||
i->path = n.path;
|
||||
i->seq = n.seq;
|
||||
i->ip_allocated = n.ip_allocated;
|
||||
} else {
|
||||
array_insert_item(trans->updates, trans->nr_updates,
|
||||
i - trans->updates, n);
|
||||
|
||||
i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
|
||||
i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
|
||||
|
||||
if (unlikely(trans->journal_replay_not_finished)) {
|
||||
struct bkey_i *j_k =
|
||||
bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
|
||||
|
||||
if (j_k) {
|
||||
i->old_k = j_k->k;
|
||||
i->old_v = &j_k->v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__btree_path_get(i->path, true);
|
||||
|
||||
/*
|
||||
* If a key is present in the key cache, it must also exist in the
|
||||
* btree - this is necessary for cache coherency. When iterating over
|
||||
* a btree that's cached in the key cache, the btree iter code checks
|
||||
* the key cache - but the key has to exist in the btree for that to
|
||||
* work:
|
||||
*/
|
||||
if (path->cached && bkey_deleted(&i->old_k))
|
||||
return flush_new_cached_update(trans, path, i, flags, ip);
|
||||
|
||||
return 0;
|
||||
}
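bch2_trans_update_by_path keeps the pending updates sorted: it scans for the insert position, reuses the slot when an equal entry already exists, and otherwise shifts the tail up by one, which is what array_insert_item does. A self-contained sketch of that insert-or-replace pattern on a plain int array; the names here are illustrative, not the bcachefs macros:

#include <stdio.h>
#include <string.h>

static int vals[8];
static unsigned nr;

static void insert_sorted(int v)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		if (v <= vals[i])
			break;

	if (i < nr && vals[i] == v)
		return;		/* same position: reuse the existing entry */

	/* shift the tail up by one and insert at the found position */
	memmove(&vals[i + 1], &vals[i], (nr - i) * sizeof(vals[0]));
	vals[i] = v;
	nr++;
}

int main(void)
{
	int in[] = { 5, 1, 3, 3, 9 };

	for (unsigned i = 0; i < sizeof(in) / sizeof(in[0]); i++)
		insert_sorted(in[i]);
	for (unsigned i = 0; i < nr; i++)
		printf("%d ", vals[i]);	/* 1 3 5 9 */
	printf("\n");
	return 0;
}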
|
||||
|
||||
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
|
||||
struct bkey_i *k, enum btree_update_flags flags)
|
||||
{
|
||||
struct btree_path *path = iter->update_path ?: iter->path;
|
||||
struct bkey_cached *ck;
|
||||
int ret;
|
||||
|
||||
if (iter->flags & BTREE_ITER_IS_EXTENTS)
|
||||
return bch2_trans_update_extent(trans, iter, k, flags);
|
||||
|
||||
if (bkey_deleted(&k->k) &&
|
||||
!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
|
||||
(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
|
||||
ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
|
||||
if (unlikely(ret < 0))
|
||||
return ret;
|
||||
|
||||
if (ret)
|
||||
k->k.type = KEY_TYPE_whiteout;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that updates to cached btrees go to the key cache:
|
||||
*/
|
||||
if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
|
||||
!path->cached &&
|
||||
!path->level &&
|
||||
btree_id_cached(trans->c, path->btree_id)) {
|
||||
if (!iter->key_cache_path ||
|
||||
!iter->key_cache_path->should_be_locked ||
|
||||
!bpos_eq(iter->key_cache_path->pos, k->k.p)) {
|
||||
if (!iter->key_cache_path)
|
||||
iter->key_cache_path =
|
||||
bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
|
||||
BTREE_ITER_INTENT|
|
||||
BTREE_ITER_CACHED, _THIS_IP_);
|
||||
|
||||
iter->key_cache_path =
|
||||
bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
|
||||
iter->flags & BTREE_ITER_INTENT,
|
||||
_THIS_IP_);
|
||||
|
||||
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
|
||||
BTREE_ITER_CACHED);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
ck = (void *) iter->key_cache_path->l[0].b;
|
||||
|
||||
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
||||
trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
|
||||
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
|
||||
}
|
||||
|
||||
btree_path_set_should_be_locked(iter->key_cache_path);
|
||||
}
|
||||
|
||||
path = iter->key_cache_path;
|
||||
}
|
||||
|
||||
return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a transaction update for a key that has already been journaled.
|
||||
*/
|
||||
int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
|
||||
struct btree_iter *iter, struct bkey_i *k,
|
||||
enum btree_update_flags flags)
|
||||
{
|
||||
trans->journal_res.seq = seq;
|
||||
return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
|
||||
BTREE_UPDATE_PREJOURNAL);
|
||||
}
|
||||
|
||||
int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
|
||||
enum btree_id btree,
|
||||
struct bkey_i *k)
|
||||
{
|
||||
struct btree_write_buffered_key *i;
|
||||
int ret;
|
||||
|
||||
EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
|
||||
EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
|
||||
|
||||
trans_for_each_wb_update(trans, i) {
|
||||
if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
|
||||
bkey_copy(&i->k, k);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (!trans->wb_updates ||
|
||||
trans->nr_wb_updates == trans->wb_updates_size) {
|
||||
struct btree_write_buffered_key *u;
|
||||
|
||||
if (trans->nr_wb_updates == trans->wb_updates_size) {
|
||||
struct btree_transaction_stats *s = btree_trans_stats(trans);
|
||||
|
||||
BUG_ON(trans->wb_updates_size > U8_MAX / 2);
|
||||
trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
|
||||
if (s)
|
||||
s->wb_updates_size = trans->wb_updates_size;
|
||||
}
|
||||
|
||||
u = bch2_trans_kmalloc_nomemzero(trans,
|
||||
trans->wb_updates_size *
|
||||
sizeof(struct btree_write_buffered_key));
|
||||
ret = PTR_ERR_OR_ZERO(u);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (trans->nr_wb_updates)
|
||||
memcpy(u, trans->wb_updates, trans->nr_wb_updates *
|
||||
sizeof(struct btree_write_buffered_key));
|
||||
trans->wb_updates = u;
|
||||
}
|
||||
|
||||
trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
|
||||
.btree = btree,
|
||||
};
|
||||
|
||||
bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
|
||||
trans->nr_wb_updates++;
|
||||
|
||||
return 0;
|
||||
}
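When the transaction's write-buffer array fills up, the code above grows it by doubling and copies the existing entries into the new allocation (bch2_trans_kmalloc memory lives until the transaction is torn down, so the old buffer is simply abandoned). A standalone sketch of the same grow-by-doubling pattern using malloc, for illustration only:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int *buf;
static unsigned nr, capacity;

static int push(int v)
{
	if (nr == capacity) {
		unsigned new_cap = capacity ? capacity * 2 : 1;
		int *n = malloc(new_cap * sizeof(*n));

		if (!n)
			return -ENOMEM;
		if (nr)
			memcpy(n, buf, nr * sizeof(*n));
		free(buf);	/* the transaction allocator makes this step unnecessary in-kernel */
		buf = n;
		capacity = new_cap;
	}
	buf[nr++] = v;
	return 0;
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		push(i);
	printf("nr=%u capacity=%u\n", nr, capacity); /* nr=10 capacity=16 */
	return 0;
}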
|
||||
|
||||
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
|
||||
enum btree_id btree, struct bpos end)
|
||||
{
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
|
||||
k = bch2_btree_iter_prev(iter);
|
||||
ret = bkey_err(k);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch2_btree_iter_advance(iter);
|
||||
k = bch2_btree_iter_peek_slot(iter);
|
||||
ret = bkey_err(k);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
BUG_ON(k.k->type != KEY_TYPE_deleted);
|
||||
|
||||
if (bkey_gt(k.k->p, end)) {
|
||||
ret = -BCH_ERR_ENOSPC_btree_slot;
|
||||
goto err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_trans_commit_hook(struct btree_trans *trans,
|
||||
struct btree_trans_commit_hook *h)
|
||||
{
|
||||
h->next = trans->hooks;
|
||||
trans->hooks = h;
|
||||
}
|
||||
|
||||
int bch2_btree_insert_nonextent(struct btree_trans *trans,
|
||||
enum btree_id btree, struct bkey_i *k,
|
||||
enum btree_update_flags flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
int ret;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, btree, k->k.p,
|
||||
BTREE_ITER_NOT_EXTENTS|
|
||||
BTREE_ITER_INTENT);
|
||||
ret = bch2_btree_iter_traverse(&iter) ?:
|
||||
bch2_trans_update(trans, &iter, k, flags);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
|
||||
struct bkey_i *k, enum btree_update_flags flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
int ret;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
|
||||
BTREE_ITER_CACHED|
|
||||
BTREE_ITER_INTENT);
|
||||
ret = bch2_btree_iter_traverse(&iter) ?:
|
||||
bch2_trans_update(trans, &iter, k, flags);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
* bch2_btree_insert - insert a key into a btree, in its own transaction
* @c: pointer to struct bch_fs
* @id: btree to insert into
* @k: key to insert
* @disk_res: disk reservation, may be NULL
* @journal_seq: if non-NULL, filled in with the journal sequence number of the commit
* @flags: transaction commit flags
*/
|
||||
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
|
||||
struct bkey_i *k,
|
||||
struct disk_reservation *disk_res,
|
||||
u64 *journal_seq, int flags)
|
||||
{
|
||||
return bch2_trans_do(c, disk_res, journal_seq, flags,
|
||||
__bch2_btree_insert(&trans, id, k, 0));
|
||||
}
|
||||
|
||||
int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
|
||||
unsigned len, unsigned update_flags)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
|
||||
k = bch2_trans_kmalloc(trans, sizeof(*k));
|
||||
if (IS_ERR(k))
|
||||
return PTR_ERR(k);
|
||||
|
||||
bkey_init(&k->k);
|
||||
k->k.p = iter->pos;
|
||||
bch2_key_resize(&k->k, len);
|
||||
return bch2_trans_update(trans, iter, k, update_flags);
|
||||
}
|
||||
|
||||
int bch2_btree_delete_at(struct btree_trans *trans,
|
||||
struct btree_iter *iter, unsigned update_flags)
|
||||
{
|
||||
return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
|
||||
}
|
||||
|
||||
int bch2_btree_delete_at_buffered(struct btree_trans *trans,
|
||||
enum btree_id btree, struct bpos pos)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
|
||||
k = bch2_trans_kmalloc(trans, sizeof(*k));
|
||||
if (IS_ERR(k))
|
||||
return PTR_ERR(k);
|
||||
|
||||
bkey_init(&k->k);
|
||||
k->k.p = pos;
|
||||
return bch2_trans_update_buffered(trans, btree, k);
|
||||
}
|
||||
|
||||
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
|
||||
struct bpos start, struct bpos end,
|
||||
unsigned update_flags,
|
||||
u64 *journal_seq)
|
||||
{
|
||||
u32 restart_count = trans->restart_count;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
|
||||
while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
|
||||
struct disk_reservation disk_res =
|
||||
bch2_disk_reservation_init(trans->c, 0);
|
||||
struct bkey_i delete;
|
||||
|
||||
ret = bkey_err(k);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bkey_init(&delete.k);
|
||||
|
||||
/*
|
||||
* This could probably be more efficient for extents:
|
||||
*/
|
||||
|
||||
/*
|
||||
* For extents, iter.pos won't necessarily be the same as
|
||||
* bkey_start_pos(k.k) (for non extents they always will be the
|
||||
* same). It's important that we delete starting from iter.pos
|
||||
* because the range we want to delete could start in the middle
|
||||
* of k.
|
||||
*
|
||||
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
|
||||
* bkey_start_pos(k.k)).
|
||||
*/
|
||||
delete.k.p = iter.pos;
|
||||
|
||||
if (iter.flags & BTREE_ITER_IS_EXTENTS)
|
||||
bch2_key_resize(&delete.k,
|
||||
bpos_min(end, k.k->p).offset -
|
||||
iter.pos.offset);
|
||||
|
||||
ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
|
||||
bch2_trans_commit(trans, &disk_res, journal_seq,
|
||||
BTREE_INSERT_NOFAIL);
|
||||
bch2_disk_reservation_put(trans->c, &disk_res);
|
||||
err:
|
||||
/*
|
||||
* the bch2_trans_begin() call is in a weird place because we
|
||||
* need to call it after every transaction commit, to avoid path
|
||||
* overflow, but don't want to call it if the delete operation
|
||||
* is a no-op and we have no work to do:
|
||||
*/
|
||||
bch2_trans_begin(trans);
|
||||
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
ret = 0;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (!ret && trans_was_restarted(trans, restart_count))
|
||||
ret = -BCH_ERR_transaction_restart_nested;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
* bch2_btree_delete_range - delete everything within a given range
*
* Range is a half open interval - [start, end)
*/
|
||||
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
|
||||
struct bpos start, struct bpos end,
|
||||
unsigned update_flags,
|
||||
u64 *journal_seq)
|
||||
{
|
||||
int ret = bch2_trans_run(c,
|
||||
bch2_btree_delete_range_trans(&trans, id, start, end,
|
||||
update_flags, journal_seq));
|
||||
if (ret == -BCH_ERR_transaction_restart_nested)
|
||||
ret = 0;
|
||||
return ret;
|
||||
}
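Since the range is half-open, the end position itself is never deleted. A rough caller sketch, using the SPOS() and flag conventions seen elsewhere in this file; the inode number, snapshot ID and offsets are hypothetical, and real callers run inside the usual in-tree context with their own flags and error handling:

/* delete the extents of inode 42, snapshot 1, in [0, 1000) -- sketch only */
static int drop_some_extents(struct bch_fs *c)
{
	return bch2_btree_delete_range(c, BTREE_ID_extents,
				       SPOS(42, 0,    1U),
				       SPOS(42, 1000, 1U),
				       0, NULL);
}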
|
||||
|
||||
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
|
||||
struct bpos pos, bool set)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
int ret = 0;
|
||||
|
||||
k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
|
||||
ret = PTR_ERR_OR_ZERO(k);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
bkey_init(&k->k);
|
||||
k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
|
||||
k->k.p = pos;
|
||||
|
||||
return bch2_trans_update_buffered(trans, btree, k);
|
||||
}
|
||||
|
||||
static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
|
||||
{
|
||||
struct printbuf buf = PRINTBUF;
|
||||
struct jset_entry_log *l;
|
||||
unsigned u64s;
|
||||
int ret;
|
||||
|
||||
prt_vprintf(&buf, fmt, args);
|
||||
ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
|
||||
|
||||
ret = darray_make_room(entries, jset_u64s(u64s));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
l = (void *) &darray_top(*entries);
|
||||
l->entry.u64s = cpu_to_le16(u64s);
|
||||
l->entry.btree_id = 0;
|
||||
l->entry.level = 1;
|
||||
l->entry.type = BCH_JSET_ENTRY_log;
|
||||
l->entry.pad[0] = 0;
|
||||
l->entry.pad[1] = 0;
|
||||
l->entry.pad[2] = 0;
|
||||
memcpy(l->d, buf.buf, buf.pos);
|
||||
while (buf.pos & 7)
|
||||
l->d[buf.pos++] = '\0';
|
||||
|
||||
entries->nr += jset_u64s(u64s);
|
||||
err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
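The log entry above rounds the message up to a whole number of u64 words and zero-pads the tail so the journal entry stays 8-byte aligned. The same arithmetic in standalone form, with DIV_ROUND_UP written out by hand:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char msg[] = "starting mark and sweep";
	size_t len = strlen(msg);
	size_t u64s = (len + 7) / 8;        /* DIV_ROUND_UP(len, sizeof(u64)) */
	char out[64] = { 0 };

	memcpy(out, msg, len);
	while (len & 7)                     /* pad to the next 8-byte boundary */
		out[len++] = '\0';

	printf("%zu bytes -> %zu u64s (%zu bytes)\n", strlen(msg), u64s, u64s * 8);
	/* 23 bytes -> 3 u64s (24 bytes) */
	return 0;
}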
|
||||
|
||||
static int
|
||||
__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
|
||||
va_list args)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
|
||||
ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
|
||||
} else {
|
||||
ret = bch2_trans_do(c, NULL, NULL,
|
||||
BTREE_INSERT_LAZY_RW|commit_flags,
|
||||
__bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
|
||||
{
|
||||
va_list args;
|
||||
int ret;
|
||||
|
||||
va_start(args, fmt);
|
||||
ret = __bch2_fs_log_msg(c, 0, fmt, args);
|
||||
va_end(args);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use for logging messages during recovery to enable reserved space and avoid
|
||||
* blocking.
|
||||
*/
|
||||
int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
|
||||
{
|
||||
va_list args;
|
||||
int ret;
|
||||
|
||||
va_start(args, fmt);
|
||||
ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
|
||||
va_end(args);
|
||||
return ret;
|
||||
}
|
@ -5,6 +5,7 @@
|
||||
#include "bkey_methods.h"
|
||||
#include "btree_cache.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
@ -17,7 +18,6 @@
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
@ -10,7 +10,31 @@
|
||||
|
||||
#include "buckets_types.h"
|
||||
#include "extents.h"
|
||||
#include "super.h"
|
||||
#include "sb-members.h"
|
||||
|
||||
static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
|
||||
{
|
||||
return div_u64(s, ca->mi.bucket_size);
|
||||
}
|
||||
|
||||
static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
|
||||
{
|
||||
return ((sector_t) b) * ca->mi.bucket_size;
|
||||
}
|
||||
|
||||
static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
|
||||
{
|
||||
u32 remainder;
|
||||
|
||||
div_u64_rem(s, ca->mi.bucket_size, &remainder);
|
||||
return remainder;
|
||||
}
|
||||
|
||||
static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
|
||||
u32 *offset)
|
||||
{
|
||||
return div_u64_rem(s, ca->mi.bucket_size, offset);
|
||||
}
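These helpers are plain division and remainder by the device's run-time bucket size. A quick standalone illustration, assuming an example 2048-sector (1 MiB with 512-byte sectors) bucket size:

#include <stdio.h>

int main(void)
{
	unsigned long long sector = 10000, bucket_size = 2048;

	unsigned long long bucket    = sector / bucket_size; /* sector_to_bucket */
	unsigned long long remainder = sector % bucket_size; /* bucket_remainder */
	unsigned long long start     = bucket * bucket_size; /* bucket_to_sector */

	printf("sector %llu -> bucket %llu, offset %llu (bucket starts at %llu)\n",
	       sector, bucket, remainder, start);
	/* sector 10000 -> bucket 4, offset 1808 (bucket starts at 8192) */
	return 0;
}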
|
||||
|
||||
#define for_each_bucket(_b, _buckets) \
|
||||
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
|
||||
@ -292,6 +316,27 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
|
||||
size_t, enum bch_data_type, unsigned);
|
||||
int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
|
||||
{
|
||||
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
|
||||
u64 b_offset = bucket_to_sector(ca, b);
|
||||
u64 b_end = bucket_to_sector(ca, b + 1);
|
||||
unsigned i;
|
||||
|
||||
if (!b)
|
||||
return true;
|
||||
|
||||
for (i = 0; i < layout->nr_superblocks; i++) {
|
||||
u64 offset = le64_to_cpu(layout->sb_offset[i]);
|
||||
u64 end = offset + (1 << layout->sb_max_size_bits);
|
||||
|
||||
if (!(offset >= b_end || end <= b_offset))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
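The superblock check relies on the standard half-open interval overlap test: two ranges [a_start, a_end) and [b_start, b_end) overlap unless one ends at or before the point where the other starts. A standalone version of that predicate:

#include <stdbool.h>
#include <stdio.h>

static bool ranges_overlap(unsigned long long a_start, unsigned long long a_end,
			   unsigned long long b_start, unsigned long long b_end)
{
	/* same condition as above: !(offset >= b_end || end <= b_offset) */
	return !(a_start >= b_end || a_end <= b_start);
}

int main(void)
{
	printf("%d\n", ranges_overlap(0, 8, 4, 12)); /* 1: they share [4, 8)    */
	printf("%d\n", ranges_overlap(0, 8, 8, 12)); /* 0: touching, no overlap */
	return 0;
}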
|
||||
|
||||
/* disk reservations: */
|
||||
|
||||
static inline void bch2_disk_reservation_put(struct bch_fs *c,
|
||||
|
@ -360,7 +360,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
|
||||
|
||||
state.type = type;
|
||||
bch2_checksum_init(&state);
|
||||
state.seed = a.lo;
|
||||
state.seed = (u64 __force) a.lo;
|
||||
|
||||
BUG_ON(!bch2_checksum_mergeable(type));
|
||||
|
||||
@ -371,7 +371,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
|
||||
page_address(ZERO_PAGE(0)), b);
|
||||
b_len -= b;
|
||||
}
|
||||
a.lo = bch2_checksum_final(&state);
|
||||
a.lo = (__le64 __force) bch2_checksum_final(&state);
|
||||
a.lo ^= b.lo;
|
||||
a.hi ^= b.hi;
|
||||
return a;
|
||||
@ -426,7 +426,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
||||
merged = bch2_checksum_bio(c, crc_old.csum_type,
|
||||
extent_nonce(version, crc_old), bio);
|
||||
|
||||
if (bch2_crc_cmp(merged, crc_old.csum)) {
|
||||
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
|
||||
bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
|
||||
"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
|
||||
crc_old.csum.hi,
|
||||
@ -458,6 +458,48 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* BCH_SB_FIELD_crypt: */
|
||||
|
||||
static int bch2_sb_crypt_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f,
|
||||
struct printbuf *err)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
|
||||
|
||||
if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
|
||||
prt_printf(err, "wrong size (got %zu should be %zu)",
|
||||
vstruct_bytes(&crypt->field), sizeof(*crypt));
|
||||
return -BCH_ERR_invalid_sb_crypt;
|
||||
}
|
||||
|
||||
if (BCH_CRYPT_KDF_TYPE(crypt)) {
|
||||
prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
|
||||
return -BCH_ERR_invalid_sb_crypt;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
|
||||
|
||||
prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
|
||||
prt_newline(out);
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
|
||||
.validate = bch2_sb_crypt_validate,
|
||||
.to_text = bch2_sb_crypt_to_text,
|
||||
};
|
||||
|
||||
#ifdef __KERNEL__
|
||||
static int __bch2_request_key(char *key_description, struct bch_key *key)
|
||||
{
|
||||
@ -597,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
crypt->key.magic = BCH_KEY_MAGIC;
|
||||
crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
|
||||
crypt->key.key = key;
|
||||
|
||||
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
|
||||
@ -625,7 +667,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
key.magic = BCH_KEY_MAGIC;
|
||||
key.magic = cpu_to_le64(BCH_KEY_MAGIC);
|
||||
get_random_bytes(&key.key, sizeof(key.key));
|
||||
|
||||
if (keyed) {
|
||||
|
@ -72,6 +72,8 @@ static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
: 0;
|
||||
}
|
||||
|
||||
extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
|
||||
|
||||
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
||||
struct bch_key *);
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "disk_groups.h"
|
||||
#include "sb-members.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/sort.h>
|
||||
|
@ -213,6 +213,12 @@
|
||||
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
|
||||
x(BCH_ERR_invalid, invalid_bkey) \
|
||||
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
|
||||
x(EIO, btree_node_read_err) \
|
||||
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
|
||||
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
|
||||
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
|
||||
x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
|
||||
x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible)
|
||||
|
||||
enum bch_errcode {
|
||||
BCH_ERR_START = 2048,
|
||||
|
@ -517,13 +517,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
|
||||
switch (type) {
|
||||
case BCH_EXTENT_ENTRY_crc32:
|
||||
set_common_fields(dst->crc32, src);
|
||||
memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum));
|
||||
dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
|
||||
break;
|
||||
case BCH_EXTENT_ENTRY_crc64:
|
||||
set_common_fields(dst->crc64, src);
|
||||
dst->crc64.nonce = src.nonce;
|
||||
dst->crc64.csum_lo = src.csum.lo;
|
||||
dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
|
||||
dst->crc64.csum_lo = (u64 __force) src.csum.lo;
|
||||
dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
|
||||
break;
|
||||
case BCH_EXTENT_ENTRY_crc128:
|
||||
set_common_fields(dst->crc128, src);
|
||||
|
@ -155,7 +155,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
|
||||
common_fields(crc->crc32),
|
||||
};
|
||||
|
||||
memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum));
|
||||
*((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
|
||||
return ret;
|
||||
}
|
||||
case BCH_EXTENT_ENTRY_crc64: {
|
||||
@ -165,8 +165,8 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
|
||||
.csum.lo = (__force __le64) crc->crc64.csum_lo,
|
||||
};
|
||||
|
||||
u16 hi = crc->crc64.csum_hi;
|
||||
memcpy(&ret.csum.hi, &hi, sizeof(hi));
|
||||
*((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
|
||||
|
||||
return ret;
|
||||
}
|
||||
case BCH_EXTENT_ENTRY_crc128: {
|
||||
|
1102
libbcachefs/fs-io-buffered.c
Normal file
1102
libbcachefs/fs-io-buffered.c
Normal file
File diff suppressed because it is too large
Load Diff
28
libbcachefs/fs-io-buffered.h
Normal file
28
libbcachefs/fs-io-buffered.h
Normal file
@ -0,0 +1,28 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_FS_IO_BUFFERED_H
|
||||
#define _BCACHEFS_FS_IO_BUFFERED_H
|
||||
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
int bch2_read_single_folio(struct folio *, struct address_space *);
|
||||
int bch2_read_folio(struct file *, struct folio *);
|
||||
|
||||
int bch2_writepages(struct address_space *, struct writeback_control *);
|
||||
void bch2_readahead(struct readahead_control *);
|
||||
|
||||
int bch2_write_begin(struct file *, struct address_space *, loff_t,
|
||||
unsigned, struct page **, void **);
|
||||
int bch2_write_end(struct file *, struct address_space *, loff_t,
|
||||
unsigned, unsigned, struct page *, void *);
|
||||
|
||||
ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
|
||||
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
|
||||
|
||||
void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
|
||||
int bch2_fs_fs_io_buffered_init(struct bch_fs *);
|
||||
#else
|
||||
static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
|
678
libbcachefs/fs-io-direct.c
Normal file
678
libbcachefs/fs-io-direct.c
Normal file
@ -0,0 +1,678 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc_foreground.h"
|
||||
#include "fs.h"
|
||||
#include "fs-io.h"
|
||||
#include "fs-io-direct.h"
|
||||
#include "fs-io-pagecache.h"
|
||||
#include "io.h"
|
||||
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
|
||||
/* O_DIRECT reads */
|
||||
|
||||
struct dio_read {
|
||||
struct closure cl;
|
||||
struct kiocb *req;
|
||||
long ret;
|
||||
bool should_dirty;
|
||||
struct bch_read_bio rbio;
|
||||
};
|
||||
|
||||
static void bio_check_or_release(struct bio *bio, bool check_dirty)
|
||||
{
|
||||
if (check_dirty) {
|
||||
bio_check_pages_dirty(bio);
|
||||
} else {
|
||||
bio_release_pages(bio, false);
|
||||
bio_put(bio);
|
||||
}
|
||||
}
|
||||
|
||||
static void bch2_dio_read_complete(struct closure *cl)
|
||||
{
|
||||
struct dio_read *dio = container_of(cl, struct dio_read, cl);
|
||||
|
||||
dio->req->ki_complete(dio->req, dio->ret);
|
||||
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
|
||||
}
|
||||
|
||||
static void bch2_direct_IO_read_endio(struct bio *bio)
|
||||
{
|
||||
struct dio_read *dio = bio->bi_private;
|
||||
|
||||
if (bio->bi_status)
|
||||
dio->ret = blk_status_to_errno(bio->bi_status);
|
||||
|
||||
closure_put(&dio->cl);
|
||||
}
|
||||
|
||||
static void bch2_direct_IO_read_split_endio(struct bio *bio)
|
||||
{
|
||||
struct dio_read *dio = bio->bi_private;
|
||||
bool should_dirty = dio->should_dirty;
|
||||
|
||||
bch2_direct_IO_read_endio(bio);
|
||||
bio_check_or_release(bio, should_dirty);
|
||||
}
|
||||
|
||||
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
|
||||
{
|
||||
struct file *file = req->ki_filp;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct bch_io_opts opts;
|
||||
struct dio_read *dio;
|
||||
struct bio *bio;
|
||||
loff_t offset = req->ki_pos;
|
||||
bool sync = is_sync_kiocb(req);
|
||||
size_t shorten;
|
||||
ssize_t ret;
|
||||
|
||||
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
|
||||
|
||||
if ((offset|iter->count) & (block_bytes(c) - 1))
|
||||
return -EINVAL;
|
||||
|
||||
ret = min_t(loff_t, iter->count,
|
||||
max_t(loff_t, 0, i_size_read(&inode->v) - offset));
|
||||
|
||||
if (!ret)
|
||||
return ret;
|
||||
|
||||
shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
|
||||
iter->count -= shorten;
|
||||
|
||||
bio = bio_alloc_bioset(NULL,
|
||||
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
|
||||
REQ_OP_READ,
|
||||
GFP_KERNEL,
|
||||
&c->dio_read_bioset);
|
||||
|
||||
bio->bi_end_io = bch2_direct_IO_read_endio;
|
||||
|
||||
dio = container_of(bio, struct dio_read, rbio.bio);
|
||||
closure_init(&dio->cl, NULL);
|
||||
|
||||
/*
|
||||
* this is a _really_ horrible hack just to avoid an atomic sub at the
|
||||
* end:
|
||||
*/
|
||||
if (!sync) {
|
||||
set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
|
||||
atomic_set(&dio->cl.remaining,
|
||||
CLOSURE_REMAINING_INITIALIZER -
|
||||
CLOSURE_RUNNING +
|
||||
CLOSURE_DESTRUCTOR);
|
||||
} else {
|
||||
atomic_set(&dio->cl.remaining,
|
||||
CLOSURE_REMAINING_INITIALIZER + 1);
|
||||
}
|
||||
|
||||
dio->req = req;
|
||||
dio->ret = ret;
|
||||
/*
|
||||
* This is one of the sketchier things I've encountered: we have to skip
|
||||
* the dirtying of requests that are internal from the kernel (i.e. from
|
||||
* loopback), because we'll deadlock on page_lock.
|
||||
*/
|
||||
dio->should_dirty = iter_is_iovec(iter);
|
||||
|
||||
goto start;
|
||||
while (iter->count) {
|
||||
bio = bio_alloc_bioset(NULL,
|
||||
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
|
||||
REQ_OP_READ,
|
||||
GFP_KERNEL,
|
||||
&c->bio_read);
|
||||
bio->bi_end_io = bch2_direct_IO_read_split_endio;
|
||||
start:
|
||||
bio->bi_opf = REQ_OP_READ|REQ_SYNC;
|
||||
bio->bi_iter.bi_sector = offset >> 9;
|
||||
bio->bi_private = dio;
|
||||
|
||||
ret = bio_iov_iter_get_pages(bio, iter);
|
||||
if (ret < 0) {
|
||||
/* XXX: fault inject this path */
|
||||
bio->bi_status = BLK_STS_RESOURCE;
|
||||
bio_endio(bio);
|
||||
break;
|
||||
}
|
||||
|
||||
offset += bio->bi_iter.bi_size;
|
||||
|
||||
if (dio->should_dirty)
|
||||
bio_set_pages_dirty(bio);
|
||||
|
||||
if (iter->count)
|
||||
closure_get(&dio->cl);
|
||||
|
||||
bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
|
||||
}
|
||||
|
||||
iter->count += shorten;
|
||||
|
||||
if (sync) {
|
||||
closure_sync(&dio->cl);
|
||||
closure_debug_destroy(&dio->cl);
|
||||
ret = dio->ret;
|
||||
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
|
||||
return ret;
|
||||
} else {
|
||||
return -EIOCBQUEUED;
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
size_t count = iov_iter_count(iter);
|
||||
ssize_t ret;
|
||||
|
||||
if (!count)
|
||||
return 0; /* skip atime */
|
||||
|
||||
if (iocb->ki_flags & IOCB_DIRECT) {
|
||||
struct blk_plug plug;
|
||||
|
||||
if (unlikely(mapping->nrpages)) {
|
||||
ret = filemap_write_and_wait_range(mapping,
|
||||
iocb->ki_pos,
|
||||
iocb->ki_pos + count - 1);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
file_accessed(file);
|
||||
|
||||
blk_start_plug(&plug);
|
||||
ret = bch2_direct_IO_read(iocb, iter);
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
if (ret >= 0)
|
||||
iocb->ki_pos += ret;
|
||||
} else {
|
||||
bch2_pagecache_add_get(inode);
|
||||
ret = generic_file_read_iter(iocb, iter);
|
||||
bch2_pagecache_add_put(inode);
|
||||
}
|
||||
out:
|
||||
return bch2_err_class(ret);
|
||||
}
|
||||
|
||||
/* O_DIRECT writes */
|
||||
|
||||
struct dio_write {
|
||||
struct kiocb *req;
|
||||
struct address_space *mapping;
|
||||
struct bch_inode_info *inode;
|
||||
struct mm_struct *mm;
|
||||
unsigned loop:1,
|
||||
extending:1,
|
||||
sync:1,
|
||||
flush:1,
|
||||
free_iov:1;
|
||||
struct quota_res quota_res;
|
||||
u64 written;
|
||||
|
||||
struct iov_iter iter;
|
||||
struct iovec inline_vecs[2];
|
||||
|
||||
/* must be last: */
|
||||
struct bch_write_op op;
|
||||
};
|
||||
|
||||
static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
|
||||
u64 offset, u64 size,
|
||||
unsigned nr_replicas, bool compressed)
|
||||
{
|
||||
struct btree_trans trans;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
u64 end = offset + size;
|
||||
u32 snapshot;
|
||||
bool ret = true;
|
||||
int err;
|
||||
|
||||
bch2_trans_init(&trans, c, 0, 0);
|
||||
retry:
|
||||
bch2_trans_begin(&trans);
|
||||
|
||||
err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
|
||||
SPOS(inum.inum, offset, snapshot),
|
||||
BTREE_ITER_SLOTS, k, err) {
|
||||
if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
|
||||
break;
|
||||
|
||||
if (k.k->p.snapshot != snapshot ||
|
||||
nr_replicas > bch2_bkey_replicas(c, k) ||
|
||||
(!compressed && bch2_bkey_sectors_compressed(k))) {
|
||||
ret = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
offset = iter.pos.offset;
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
err:
|
||||
if (bch2_err_matches(err, BCH_ERR_transaction_restart))
|
||||
goto retry;
|
||||
bch2_trans_exit(&trans);
|
||||
|
||||
return err ? false : ret;
|
||||
}
|
||||
|
||||
static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
|
||||
{
|
||||
struct bch_fs *c = dio->op.c;
|
||||
struct bch_inode_info *inode = dio->inode;
|
||||
struct bio *bio = &dio->op.wbio.bio;
|
||||
|
||||
return bch2_check_range_allocated(c, inode_inum(inode),
|
||||
dio->op.pos.offset, bio_sectors(bio),
|
||||
dio->op.opts.data_replicas,
|
||||
dio->op.opts.compression != 0);
|
||||
}
|
||||
|
||||
static void bch2_dio_write_loop_async(struct bch_write_op *);
|
||||
static __always_inline long bch2_dio_write_done(struct dio_write *dio);
|
||||
|
||||
/*
|
||||
* We're going to return -EIOCBQUEUED, but we haven't finished consuming the
|
||||
* iov_iter yet, so we need to stash a copy of the iovec: it might be on the
|
||||
* caller's stack, we're not guaranteed that it will live for the duration of
|
||||
* the IO:
|
||||
*/
|
||||
static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
|
||||
{
|
||||
struct iovec *iov = dio->inline_vecs;
|
||||
|
||||
/*
|
||||
* iov_iter has a single embedded iovec - nothing to do:
|
||||
*/
|
||||
if (iter_is_ubuf(&dio->iter))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* We don't currently handle non-iovec iov_iters here - return an error,
|
||||
* and we'll fall back to doing the IO synchronously:
|
||||
*/
|
||||
if (!iter_is_iovec(&dio->iter))
|
||||
return -1;
|
||||
|
||||
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
|
||||
iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
|
||||
GFP_KERNEL);
|
||||
if (unlikely(!iov))
|
||||
return -ENOMEM;
|
||||
|
||||
dio->free_iov = true;
|
||||
}
|
||||
|
||||
memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
|
||||
dio->iter.__iov = iov;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_dio_write_flush_done(struct closure *cl)
|
||||
{
|
||||
struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
|
||||
struct bch_fs *c = dio->op.c;
|
||||
|
||||
closure_debug_destroy(cl);
|
||||
|
||||
dio->op.error = bch2_journal_error(&c->journal);
|
||||
|
||||
bch2_dio_write_done(dio);
|
||||
}
|
||||
|
||||
static noinline void bch2_dio_write_flush(struct dio_write *dio)
|
||||
{
|
||||
struct bch_fs *c = dio->op.c;
|
||||
struct bch_inode_unpacked inode;
|
||||
int ret;
|
||||
|
||||
dio->flush = 0;
|
||||
|
||||
closure_init(&dio->op.cl, NULL);
|
||||
|
||||
if (!dio->op.error) {
|
||||
ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
|
||||
if (ret) {
|
||||
dio->op.error = ret;
|
||||
} else {
|
||||
bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
|
||||
bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
|
||||
}
|
||||
}
|
||||
|
||||
if (dio->sync) {
|
||||
closure_sync(&dio->op.cl);
|
||||
closure_debug_destroy(&dio->op.cl);
|
||||
} else {
|
||||
continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
|
||||
{
|
||||
struct kiocb *req = dio->req;
|
||||
struct bch_inode_info *inode = dio->inode;
|
||||
bool sync = dio->sync;
|
||||
long ret;
|
||||
|
||||
if (unlikely(dio->flush)) {
|
||||
bch2_dio_write_flush(dio);
|
||||
if (!sync)
|
||||
return -EIOCBQUEUED;
|
||||
}
|
||||
|
||||
bch2_pagecache_block_put(inode);
|
||||
|
||||
if (dio->free_iov)
|
||||
kfree(dio->iter.__iov);
|
||||
|
||||
ret = dio->op.error ?: ((long) dio->written << 9);
|
||||
bio_put(&dio->op.wbio.bio);
|
||||
|
||||
/* inode->i_dio_count is our ref on inode and thus bch_fs */
|
||||
inode_dio_end(&inode->v);
|
||||
|
||||
if (ret < 0)
|
||||
ret = bch2_err_class(ret);
|
||||
|
||||
if (!sync) {
|
||||
req->ki_complete(req, ret);
|
||||
ret = -EIOCBQUEUED;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __always_inline void bch2_dio_write_end(struct dio_write *dio)
|
||||
{
|
||||
struct bch_fs *c = dio->op.c;
|
||||
struct kiocb *req = dio->req;
|
||||
struct bch_inode_info *inode = dio->inode;
|
||||
struct bio *bio = &dio->op.wbio.bio;
|
||||
|
||||
req->ki_pos += (u64) dio->op.written << 9;
|
||||
dio->written += dio->op.written;
|
||||
|
||||
if (dio->extending) {
|
||||
spin_lock(&inode->v.i_lock);
|
||||
if (req->ki_pos > inode->v.i_size)
|
||||
i_size_write(&inode->v, req->ki_pos);
|
||||
spin_unlock(&inode->v.i_lock);
|
||||
}
|
||||
|
||||
if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
|
||||
mutex_lock(&inode->ei_quota_lock);
|
||||
__bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
|
||||
__bch2_quota_reservation_put(c, inode, &dio->quota_res);
|
||||
mutex_unlock(&inode->ei_quota_lock);
|
||||
}
|
||||
|
||||
bio_release_pages(bio, false);
|
||||
|
||||
if (unlikely(dio->op.error))
|
||||
set_bit(EI_INODE_ERROR, &inode->ei_flags);
|
||||
}
|
||||
|
||||
static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
|
||||
{
|
||||
struct bch_fs *c = dio->op.c;
|
||||
struct kiocb *req = dio->req;
|
||||
struct address_space *mapping = dio->mapping;
|
||||
struct bch_inode_info *inode = dio->inode;
|
||||
struct bch_io_opts opts;
|
||||
struct bio *bio = &dio->op.wbio.bio;
|
||||
unsigned unaligned, iter_count;
|
||||
bool sync = dio->sync, dropped_locks;
|
||||
long ret;
|
||||
|
||||
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
|
||||
|
||||
while (1) {
|
||||
iter_count = dio->iter.count;
|
||||
|
||||
EBUG_ON(current->faults_disabled_mapping);
|
||||
current->faults_disabled_mapping = mapping;
|
||||
|
||||
ret = bio_iov_iter_get_pages(bio, &dio->iter);
|
||||
|
||||
dropped_locks = fdm_dropped_locks();
|
||||
|
||||
current->faults_disabled_mapping = NULL;
|
||||
|
||||
/*
|
||||
* If the fault handler returned an error but also signalled
|
||||
* that it dropped & retook ei_pagecache_lock, we just need to
|
||||
* re-shoot down the page cache and retry:
|
||||
*/
|
||||
if (dropped_locks && ret)
|
||||
ret = 0;
|
||||
|
||||
if (unlikely(ret < 0))
|
||||
goto err;
|
||||
|
||||
if (unlikely(dropped_locks)) {
|
||||
ret = bch2_write_invalidate_inode_pages_range(mapping,
|
||||
req->ki_pos,
|
||||
req->ki_pos + iter_count - 1);
|
||||
if (unlikely(ret))
|
||||
goto err;
|
||||
|
||||
if (!bio->bi_iter.bi_size)
|
||||
continue;
|
||||
}
|
||||
|
||||
unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
|
||||
bio->bi_iter.bi_size -= unaligned;
|
||||
iov_iter_revert(&dio->iter, unaligned);
|
||||
|
||||
if (!bio->bi_iter.bi_size) {
|
||||
/*
|
||||
* bio_iov_iter_get_pages was only able to get <
|
||||
* blocksize worth of pages:
|
||||
*/
|
||||
ret = -EFAULT;
|
||||
goto err;
|
||||
}
|
||||
|
||||
bch2_write_op_init(&dio->op, c, opts);
|
||||
dio->op.end_io = sync
|
||||
? NULL
|
||||
: bch2_dio_write_loop_async;
|
||||
dio->op.target = dio->op.opts.foreground_target;
|
||||
dio->op.write_point = writepoint_hashed((unsigned long) current);
|
||||
dio->op.nr_replicas = dio->op.opts.data_replicas;
|
||||
dio->op.subvol = inode->ei_subvol;
|
||||
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
|
||||
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
|
||||
|
||||
if (sync)
|
||||
dio->op.flags |= BCH_WRITE_SYNC;
|
||||
dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
|
||||
|
||||
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
|
||||
bio_sectors(bio), true);
|
||||
if (unlikely(ret))
|
||||
goto err;
|
||||
|
||||
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
|
||||
dio->op.opts.data_replicas, 0);
|
||||
if (unlikely(ret) &&
|
||||
!bch2_dio_write_check_allocated(dio))
|
||||
goto err;
|
||||
|
||||
task_io_account_write(bio->bi_iter.bi_size);
|
||||
|
||||
if (unlikely(dio->iter.count) &&
|
||||
!dio->sync &&
|
||||
!dio->loop &&
|
||||
bch2_dio_write_copy_iov(dio))
|
||||
dio->sync = sync = true;
|
||||
|
||||
dio->loop = true;
|
||||
closure_call(&dio->op.cl, bch2_write, NULL, NULL);
|
||||
|
||||
if (!sync)
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
bch2_dio_write_end(dio);
|
||||
|
||||
if (likely(!dio->iter.count) || dio->op.error)
|
||||
break;
|
||||
|
||||
bio_reset(bio, NULL, REQ_OP_WRITE);
|
||||
}
|
||||
out:
|
||||
return bch2_dio_write_done(dio);
|
||||
err:
|
||||
dio->op.error = ret;
|
||||
|
||||
bio_release_pages(bio, false);
|
||||
|
||||
bch2_quota_reservation_put(c, inode, &dio->quota_res);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
|
||||
{
|
||||
struct mm_struct *mm = dio->mm;
|
||||
|
||||
bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
|
||||
|
||||
if (mm)
|
||||
kthread_use_mm(mm);
|
||||
bch2_dio_write_loop(dio);
|
||||
if (mm)
|
||||
kthread_unuse_mm(mm);
|
||||
}
|
||||
|
||||
static void bch2_dio_write_loop_async(struct bch_write_op *op)
|
||||
{
|
||||
struct dio_write *dio = container_of(op, struct dio_write, op);
|
||||
|
||||
bch2_dio_write_end(dio);
|
||||
|
||||
if (likely(!dio->iter.count) || dio->op.error)
|
||||
bch2_dio_write_done(dio);
|
||||
else
|
||||
bch2_dio_write_continue(dio);
|
||||
}
|
||||
|
||||
ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
|
||||
{
|
||||
struct file *file = req->ki_filp;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct dio_write *dio;
|
||||
struct bio *bio;
|
||||
bool locked = true, extending;
|
||||
ssize_t ret;
|
||||
|
||||
prefetch(&c->opts);
|
||||
prefetch((void *) &c->opts + 64);
|
||||
prefetch(&inode->ei_inode);
|
||||
prefetch((void *) &inode->ei_inode + 64);
|
||||
|
||||
inode_lock(&inode->v);
|
||||
|
||||
ret = generic_write_checks(req, iter);
|
||||
if (unlikely(ret <= 0))
|
||||
goto err;
|
||||
|
||||
ret = file_remove_privs(file);
|
||||
if (unlikely(ret))
|
||||
goto err;
|
||||
|
||||
ret = file_update_time(file);
|
||||
if (unlikely(ret))
|
||||
goto err;
|
||||
|
||||
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
|
||||
goto err;
|
||||
|
||||
inode_dio_begin(&inode->v);
|
||||
bch2_pagecache_block_get(inode);
|
||||
|
||||
extending = req->ki_pos + iter->count > inode->v.i_size;
|
||||
if (!extending) {
|
||||
inode_unlock(&inode->v);
|
||||
locked = false;
|
||||
}
|
||||
|
||||
bio = bio_alloc_bioset(NULL,
|
||||
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
|
||||
REQ_OP_WRITE,
|
||||
GFP_KERNEL,
|
||||
&c->dio_write_bioset);
|
||||
dio = container_of(bio, struct dio_write, op.wbio.bio);
|
||||
dio->req = req;
|
||||
dio->mapping = mapping;
|
||||
dio->inode = inode;
|
||||
dio->mm = current->mm;
|
||||
dio->loop = false;
|
||||
dio->extending = extending;
|
||||
dio->sync = is_sync_kiocb(req) || extending;
|
||||
dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
|
||||
dio->free_iov = false;
|
||||
dio->quota_res.sectors = 0;
|
||||
dio->written = 0;
|
||||
dio->iter = *iter;
|
||||
dio->op.c = c;
|
||||
|
||||
if (unlikely(mapping->nrpages)) {
|
||||
ret = bch2_write_invalidate_inode_pages_range(mapping,
|
||||
req->ki_pos,
|
||||
req->ki_pos + iter->count - 1);
|
||||
if (unlikely(ret))
|
||||
goto err_put_bio;
|
||||
}
|
||||
|
||||
ret = bch2_dio_write_loop(dio);
|
||||
err:
|
||||
if (locked)
|
||||
inode_unlock(&inode->v);
|
||||
return ret;
|
||||
err_put_bio:
|
||||
bch2_pagecache_block_put(inode);
|
||||
bio_put(bio);
|
||||
inode_dio_end(&inode->v);
|
||||
goto err;
|
||||
}
|
||||
|
||||
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
|
||||
{
|
||||
bioset_exit(&c->dio_write_bioset);
|
||||
bioset_exit(&c->dio_read_bioset);
|
||||
}
|
||||
|
||||
int bch2_fs_fs_io_direct_init(struct bch_fs *c)
|
||||
{
|
||||
if (bioset_init(&c->dio_read_bioset,
|
||||
4, offsetof(struct dio_read, rbio.bio),
|
||||
BIOSET_NEED_BVECS))
|
||||
return -BCH_ERR_ENOMEM_dio_read_bioset_init;
|
||||
|
||||
if (bioset_init(&c->dio_write_bioset,
|
||||
4, offsetof(struct dio_write, op.wbio.bio),
|
||||
BIOSET_NEED_BVECS))
|
||||
return -BCH_ERR_ENOMEM_dio_write_bioset_init;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* NO_BCACHEFS_FS */
|
15
libbcachefs/fs-io-direct.h
Normal file
15
libbcachefs/fs-io-direct.h
Normal file
@ -0,0 +1,15 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_FS_IO_DIRECT_H
|
||||
#define _BCACHEFS_FS_IO_DIRECT_H
|
||||
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
|
||||
|
||||
void bch2_fs_fs_io_direct_exit(struct bch_fs *);
|
||||
int bch2_fs_fs_io_direct_init(struct bch_fs *);
|
||||
#else
|
||||
static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_FS_IO_DIRECT_H */
|
777
libbcachefs/fs-io-pagecache.c
Normal file
777
libbcachefs/fs-io-pagecache.c
Normal file
@ -0,0 +1,777 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_iter.h"
|
||||
#include "extents.h"
|
||||
#include "fs-io.h"
|
||||
#include "fs-io-pagecache.h"
|
||||
#include "subvolume.h"
|
||||
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/writeback.h>
|
||||
|
||||
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
|
||||
loff_t start, u64 end,
|
||||
int fgp_flags, gfp_t gfp,
|
||||
folios *folios)
|
||||
{
|
||||
struct folio *f;
|
||||
u64 pos = start;
|
||||
int ret = 0;
|
||||
|
||||
while (pos < end) {
|
||||
if ((u64) pos >= (u64) start + (1ULL << 20))
|
||||
fgp_flags &= ~FGP_CREAT;
|
||||
|
||||
ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
|
||||
if (IS_ERR_OR_NULL(f))
|
||||
break;
|
||||
|
||||
BUG_ON(folios->nr && folio_pos(f) != pos);
|
||||
|
||||
pos = folio_end_pos(f);
|
||||
darray_push(folios, f);
|
||||
}
|
||||
|
||||
if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
|
||||
ret = -ENOMEM;
|
||||
|
||||
return folios->nr ? 0 : ret;
|
||||
}
|
||||
|
||||
/* pagecache_block must be held */
|
||||
int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
|
||||
loff_t start, loff_t end)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* XXX: the way this is currently implemented, we can spin if a process
|
||||
* is continually redirtying a specific page
|
||||
*/
|
||||
do {
|
||||
if (!mapping->nrpages)
|
||||
return 0;
|
||||
|
||||
ret = filemap_write_and_wait_range(mapping, start, end);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (!mapping->nrpages)
|
||||
return 0;
|
||||
|
||||
ret = invalidate_inode_pages2_range(mapping,
|
||||
start >> PAGE_SHIFT,
|
||||
end >> PAGE_SHIFT);
|
||||
} while (ret == -EBUSY);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char * const bch2_folio_sector_states[] = {
|
||||
#define x(n) #n,
|
||||
BCH_FOLIO_SECTOR_STATE()
|
||||
#undef x
|
||||
NULL
|
||||
};
|
||||
|
||||
static inline enum bch_folio_sector_state
|
||||
folio_sector_dirty(enum bch_folio_sector_state state)
|
||||
{
|
||||
switch (state) {
|
||||
case SECTOR_unallocated:
|
||||
return SECTOR_dirty;
|
||||
case SECTOR_reserved:
|
||||
return SECTOR_dirty_reserved;
|
||||
default:
|
||||
return state;
|
||||
}
|
||||
}
|
||||
|
||||
static inline enum bch_folio_sector_state
|
||||
folio_sector_undirty(enum bch_folio_sector_state state)
|
||||
{
|
||||
switch (state) {
|
||||
case SECTOR_dirty:
|
||||
return SECTOR_unallocated;
|
||||
case SECTOR_dirty_reserved:
|
||||
return SECTOR_reserved;
|
||||
default:
|
||||
return state;
|
||||
}
|
||||
}
|
||||
|
||||
static inline enum bch_folio_sector_state
|
||||
folio_sector_reserve(enum bch_folio_sector_state state)
|
||||
{
|
||||
switch (state) {
|
||||
case SECTOR_unallocated:
|
||||
return SECTOR_reserved;
|
||||
case SECTOR_dirty:
|
||||
return SECTOR_dirty_reserved;
|
||||
default:
|
||||
return state;
|
||||
}
|
||||
}
|
||||
|
||||
/* for newly allocated folios: */
|
||||
struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
|
||||
{
|
||||
struct bch_folio *s;
|
||||
|
||||
s = kzalloc(sizeof(*s) +
|
||||
sizeof(struct bch_folio_sector) *
|
||||
folio_sectors(folio), gfp);
|
||||
if (!s)
|
||||
return NULL;
|
||||
|
||||
spin_lock_init(&s->lock);
|
||||
folio_attach_private(folio, s);
|
||||
return s;
|
||||
}
|
||||
|
||||
struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
|
||||
{
|
||||
return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
|
||||
}
|
||||
|
||||
static unsigned bkey_to_sector_state(struct bkey_s_c k)
|
||||
{
|
||||
if (bkey_extent_is_reservation(k))
|
||||
return SECTOR_reserved;
|
||||
if (bkey_extent_is_allocation(k.k))
|
||||
return SECTOR_allocated;
|
||||
return SECTOR_unallocated;
|
||||
}
|
||||
|
||||
static void __bch2_folio_set(struct folio *folio,
|
||||
unsigned pg_offset, unsigned pg_len,
|
||||
unsigned nr_ptrs, unsigned state)
|
||||
{
|
||||
struct bch_folio *s = bch2_folio(folio);
|
||||
unsigned i, sectors = folio_sectors(folio);
|
||||
|
||||
BUG_ON(pg_offset >= sectors);
|
||||
BUG_ON(pg_offset + pg_len > sectors);
|
||||
|
||||
spin_lock(&s->lock);
|
||||
|
||||
for (i = pg_offset; i < pg_offset + pg_len; i++) {
|
||||
s->s[i].nr_replicas = nr_ptrs;
|
||||
bch2_folio_sector_set(folio, s, i, state);
|
||||
}
|
||||
|
||||
if (i == sectors)
|
||||
s->uptodate = true;
|
||||
|
||||
spin_unlock(&s->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
|
||||
* extents btree:
|
||||
*/
|
||||
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
|
||||
struct folio **folios, unsigned nr_folios)
|
||||
{
|
||||
struct btree_trans trans;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
struct bch_folio *s;
|
||||
u64 offset = folio_sector(folios[0]);
|
||||
unsigned folio_idx;
|
||||
u32 snapshot;
|
||||
bool need_set = false;
|
||||
int ret;
|
||||
|
||||
for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
|
||||
s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
|
||||
need_set |= !s->uptodate;
|
||||
}
|
||||
|
||||
if (!need_set)
|
||||
return 0;
|
||||
|
||||
folio_idx = 0;
|
||||
bch2_trans_init(&trans, c, 0, 0);
|
||||
retry:
|
||||
bch2_trans_begin(&trans);
|
||||
|
||||
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
|
||||
SPOS(inum.inum, offset, snapshot),
|
||||
BTREE_ITER_SLOTS, k, ret) {
|
||||
unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
|
||||
unsigned state = bkey_to_sector_state(k);
|
||||
|
||||
while (folio_idx < nr_folios) {
|
||||
struct folio *folio = folios[folio_idx];
|
||||
u64 folio_start = folio_sector(folio);
|
||||
u64 folio_end = folio_end_sector(folio);
|
||||
unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
|
||||
unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;
|
||||
|
||||
BUG_ON(k.k->p.offset < folio_start);
|
||||
BUG_ON(bkey_start_offset(k.k) > folio_end);
|
||||
|
||||
if (!bch2_folio(folio)->uptodate)
|
||||
__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
|
||||
|
||||
if (k.k->p.offset < folio_end)
|
||||
break;
|
||||
folio_idx++;
|
||||
}
|
||||
|
||||
if (folio_idx == nr_folios)
|
||||
break;
|
||||
}
|
||||
|
||||
offset = iter.pos.offset;
|
||||
bch2_trans_iter_exit(&trans, &iter);
|
||||
err:
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
goto retry;
|
||||
bch2_trans_exit(&trans);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
|
||||
{
|
||||
struct bvec_iter iter;
|
||||
struct folio_vec fv;
|
||||
unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
|
||||
? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
|
||||
unsigned state = bkey_to_sector_state(k);
|
||||
|
||||
bio_for_each_folio(fv, bio, iter)
|
||||
__bch2_folio_set(fv.fv_folio,
|
||||
fv.fv_offset >> 9,
|
||||
fv.fv_len >> 9,
|
||||
nr_ptrs, state);
|
||||
}
|
||||
|
||||
void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
|
||||
u64 start, u64 end)
|
||||
{
|
||||
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
|
||||
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
|
||||
struct folio_batch fbatch;
|
||||
unsigned i, j;
|
||||
|
||||
if (end <= start)
|
||||
return;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
|
||||
while (filemap_get_folios(inode->v.i_mapping,
|
||||
&index, end_index, &fbatch)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
u64 folio_start = folio_sector(folio);
|
||||
u64 folio_end = folio_end_sector(folio);
|
||||
unsigned folio_offset = max(start, folio_start) - folio_start;
|
||||
unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
|
||||
struct bch_folio *s;
|
||||
|
||||
BUG_ON(end <= folio_start);
|
||||
|
||||
folio_lock(folio);
|
||||
s = bch2_folio(folio);
|
||||
|
||||
if (s) {
|
||||
spin_lock(&s->lock);
|
||||
for (j = folio_offset; j < folio_offset + folio_len; j++)
|
||||
s->s[j].nr_replicas = 0;
|
||||
spin_unlock(&s->lock);
|
||||
}
|
||||
|
||||
folio_unlock(folio);
|
||||
}
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
|
||||
u64 start, u64 end)
|
||||
{
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
|
||||
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
|
||||
struct folio_batch fbatch;
|
||||
s64 i_sectors_delta = 0;
|
||||
unsigned i, j;
|
||||
|
||||
if (end <= start)
|
||||
return;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
|
||||
while (filemap_get_folios(inode->v.i_mapping,
|
||||
&index, end_index, &fbatch)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
u64 folio_start = folio_sector(folio);
|
||||
u64 folio_end = folio_end_sector(folio);
|
||||
unsigned folio_offset = max(start, folio_start) - folio_start;
|
||||
unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
|
||||
struct bch_folio *s;
|
||||
|
||||
BUG_ON(end <= folio_start);
|
||||
|
||||
folio_lock(folio);
|
||||
s = bch2_folio(folio);
|
||||
|
||||
if (s) {
|
||||
spin_lock(&s->lock);
|
||||
for (j = folio_offset; j < folio_offset + folio_len; j++) {
|
||||
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
|
||||
bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
|
||||
}
|
||||
spin_unlock(&s->lock);
|
||||
}
|
||||
|
||||
folio_unlock(folio);
|
||||
}
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
|
||||
}
|
||||
|
||||
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
|
||||
unsigned nr_replicas)
|
||||
{
|
||||
return max(0, (int) nr_replicas -
|
||||
s->nr_replicas -
|
||||
s->replicas_reserved);
|
||||
}
|
||||
|
||||
int bch2_get_folio_disk_reservation(struct bch_fs *c,
|
||||
struct bch_inode_info *inode,
|
||||
struct folio *folio, bool check_enospc)
|
||||
{
|
||||
struct bch_folio *s = bch2_folio_create(folio, 0);
|
||||
unsigned nr_replicas = inode_nr_replicas(c, inode);
|
||||
struct disk_reservation disk_res = { 0 };
|
||||
unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
|
||||
int ret;
|
||||
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < sectors; i++)
|
||||
disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
|
||||
|
||||
if (!disk_res_sectors)
|
||||
return 0;
|
||||
|
||||
ret = bch2_disk_reservation_get(c, &disk_res,
|
||||
disk_res_sectors, 1,
|
||||
!check_enospc
|
||||
? BCH_DISK_RESERVATION_NOFAIL
|
||||
: 0);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
for (i = 0; i < sectors; i++)
|
||||
s->s[i].replicas_reserved +=
|
||||
sectors_to_reserve(&s->s[i], nr_replicas);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_folio_reservation_put(struct bch_fs *c,
|
||||
struct bch_inode_info *inode,
|
||||
struct bch2_folio_reservation *res)
|
||||
{
|
||||
bch2_disk_reservation_put(c, &res->disk);
|
||||
bch2_quota_reservation_put(c, inode, &res->quota);
|
||||
}
|
||||
|
||||
int bch2_folio_reservation_get(struct bch_fs *c,
|
||||
struct bch_inode_info *inode,
|
||||
struct folio *folio,
|
||||
struct bch2_folio_reservation *res,
|
||||
unsigned offset, unsigned len)
|
||||
{
|
||||
struct bch_folio *s = bch2_folio_create(folio, 0);
|
||||
unsigned i, disk_sectors = 0, quota_sectors = 0;
|
||||
int ret;
|
||||
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
|
||||
BUG_ON(!s->uptodate);
|
||||
|
||||
for (i = round_down(offset, block_bytes(c)) >> 9;
|
||||
i < round_up(offset + len, block_bytes(c)) >> 9;
|
||||
i++) {
|
||||
disk_sectors += sectors_to_reserve(&s->s[i],
|
||||
res->disk.nr_replicas);
|
||||
quota_sectors += s->s[i].state == SECTOR_unallocated;
|
||||
}
|
||||
|
||||
if (disk_sectors) {
|
||||
ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (quota_sectors) {
|
||||
ret = bch2_quota_reservation_add(c, inode, &res->quota,
|
||||
quota_sectors, true);
|
||||
if (unlikely(ret)) {
|
||||
struct disk_reservation tmp = {
|
||||
.sectors = disk_sectors
|
||||
};
|
||||
|
||||
bch2_disk_reservation_put(c, &tmp);
|
||||
res->disk.sectors -= disk_sectors;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_clear_folio_bits(struct folio *folio)
|
||||
{
|
||||
struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct bch_folio *s = bch2_folio(folio);
|
||||
struct disk_reservation disk_res = { 0 };
|
||||
int i, sectors = folio_sectors(folio), dirty_sectors = 0;
|
||||
|
||||
if (!s)
|
||||
return;
|
||||
|
||||
EBUG_ON(!folio_test_locked(folio));
|
||||
EBUG_ON(folio_test_writeback(folio));
|
||||
|
||||
for (i = 0; i < sectors; i++) {
|
||||
disk_res.sectors += s->s[i].replicas_reserved;
|
||||
s->s[i].replicas_reserved = 0;
|
||||
|
||||
dirty_sectors -= s->s[i].state == SECTOR_dirty;
|
||||
bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
|
||||
}
|
||||
|
||||
bch2_disk_reservation_put(c, &disk_res);
|
||||
|
||||
bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
|
||||
|
||||
bch2_folio_release(folio);
|
||||
}
|
||||
|
||||
void bch2_set_folio_dirty(struct bch_fs *c,
|
||||
struct bch_inode_info *inode,
|
||||
struct folio *folio,
|
||||
struct bch2_folio_reservation *res,
|
||||
unsigned offset, unsigned len)
|
||||
{
|
||||
struct bch_folio *s = bch2_folio(folio);
|
||||
unsigned i, dirty_sectors = 0;
|
||||
|
||||
WARN_ON((u64) folio_pos(folio) + offset + len >
|
||||
round_up((u64) i_size_read(&inode->v), block_bytes(c)));
|
||||
|
||||
BUG_ON(!s->uptodate);
|
||||
|
||||
spin_lock(&s->lock);
|
||||
|
||||
for (i = round_down(offset, block_bytes(c)) >> 9;
|
||||
i < round_up(offset + len, block_bytes(c)) >> 9;
|
||||
i++) {
|
||||
unsigned sectors = sectors_to_reserve(&s->s[i],
|
||||
res->disk.nr_replicas);
|
||||
|
||||
/*
|
||||
* This can happen if we race with the error path in
|
||||
* bch2_writepage_io_done():
|
||||
*/
|
||||
sectors = min_t(unsigned, sectors, res->disk.sectors);
|
||||
|
||||
s->s[i].replicas_reserved += sectors;
|
||||
res->disk.sectors -= sectors;
|
||||
|
||||
dirty_sectors += s->s[i].state == SECTOR_unallocated;
|
||||
|
||||
bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
|
||||
}
|
||||
|
||||
spin_unlock(&s->lock);
|
||||
|
||||
bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
|
||||
|
||||
if (!folio_test_dirty(folio))
|
||||
filemap_dirty_folio(inode->v.i_mapping, folio);
|
||||
}
|
||||
|
||||
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
|
||||
{
|
||||
struct file *file = vmf->vma->vm_file;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct address_space *fdm = faults_disabled_mapping();
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
vm_fault_t ret;
|
||||
|
||||
if (fdm == mapping)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
/* Lock ordering: */
|
||||
if (fdm > mapping) {
|
||||
struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
|
||||
|
||||
if (bch2_pagecache_add_tryget(inode))
|
||||
goto got_lock;
|
||||
|
||||
bch2_pagecache_block_put(fdm_host);
|
||||
|
||||
bch2_pagecache_add_get(inode);
|
||||
bch2_pagecache_add_put(inode);
|
||||
|
||||
bch2_pagecache_block_get(fdm_host);
|
||||
|
||||
/* Signal that lock has been dropped: */
|
||||
set_fdm_dropped_locks();
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
|
||||
bch2_pagecache_add_get(inode);
|
||||
got_lock:
|
||||
ret = filemap_fault(vmf);
|
||||
bch2_pagecache_add_put(inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
|
||||
{
|
||||
struct folio *folio = page_folio(vmf->page);
|
||||
struct file *file = vmf->vma->vm_file;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct bch2_folio_reservation res;
|
||||
unsigned len;
|
||||
loff_t isize;
|
||||
vm_fault_t ret;
|
||||
|
||||
bch2_folio_reservation_init(c, inode, &res);
|
||||
|
||||
sb_start_pagefault(inode->v.i_sb);
|
||||
file_update_time(file);
|
||||
|
||||
/*
|
||||
* Not strictly necessary, but helps avoid dio writes livelocking in
|
||||
* bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
|
||||
* a bch2_write_invalidate_inode_pages_range() that works without dropping
|
||||
* page lock before invalidating page
|
||||
*/
|
||||
bch2_pagecache_add_get(inode);
|
||||
|
||||
folio_lock(folio);
|
||||
isize = i_size_read(&inode->v);
|
||||
|
||||
if (folio->mapping != mapping || folio_pos(folio) >= isize) {
|
||||
folio_unlock(folio);
|
||||
ret = VM_FAULT_NOPAGE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
|
||||
|
||||
if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
|
||||
bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
|
||||
folio_unlock(folio);
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out;
|
||||
}
|
||||
|
||||
bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
|
||||
bch2_folio_reservation_put(c, inode, &res);
|
||||
|
||||
folio_wait_stable(folio);
|
||||
ret = VM_FAULT_LOCKED;
|
||||
out:
|
||||
bch2_pagecache_add_put(inode);
|
||||
sb_end_pagefault(inode->v.i_sb);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
|
||||
{
|
||||
if (offset || length < folio_size(folio))
|
||||
return;
|
||||
|
||||
bch2_clear_folio_bits(folio);
|
||||
}
|
||||
|
||||
bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
|
||||
{
|
||||
if (folio_test_dirty(folio) || folio_test_writeback(folio))
|
||||
return false;
|
||||
|
||||
bch2_clear_folio_bits(folio);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* fseek: */
|
||||
|
||||
static int folio_data_offset(struct folio *folio, loff_t pos,
|
||||
unsigned min_replicas)
|
||||
{
|
||||
struct bch_folio *s = bch2_folio(folio);
|
||||
unsigned i, sectors = folio_sectors(folio);
|
||||
|
||||
if (s)
|
||||
for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
|
||||
if (s->s[i].state >= SECTOR_dirty &&
|
||||
s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
|
||||
return i << SECTOR_SHIFT;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
loff_t bch2_seek_pagecache_data(struct inode *vinode,
|
||||
loff_t start_offset,
|
||||
loff_t end_offset,
|
||||
unsigned min_replicas,
|
||||
bool nonblock)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
pgoff_t start_index = start_offset >> PAGE_SHIFT;
|
||||
pgoff_t end_index = end_offset >> PAGE_SHIFT;
|
||||
pgoff_t index = start_index;
|
||||
unsigned i;
|
||||
loff_t ret;
|
||||
int offset;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
|
||||
while (filemap_get_folios(vinode->i_mapping,
|
||||
&index, end_index, &fbatch)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
|
||||
if (!nonblock) {
|
||||
folio_lock(folio);
|
||||
} else if (!folio_trylock(folio)) {
|
||||
folio_batch_release(&fbatch);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
offset = folio_data_offset(folio,
|
||||
max(folio_pos(folio), start_offset),
|
||||
min_replicas);
|
||||
if (offset >= 0) {
|
||||
ret = clamp(folio_pos(folio) + offset,
|
||||
start_offset, end_offset);
|
||||
folio_unlock(folio);
|
||||
folio_batch_release(&fbatch);
|
||||
return ret;
|
||||
}
|
||||
folio_unlock(folio);
|
||||
}
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
return end_offset;
|
||||
}
|
||||
|
||||
static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
|
||||
unsigned min_replicas, bool nonblock)
|
||||
{
|
||||
struct folio *folio;
|
||||
struct bch_folio *s;
|
||||
unsigned i, sectors;
|
||||
bool ret = true;
|
||||
|
||||
folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
|
||||
FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
|
||||
if (folio == ERR_PTR(-EAGAIN))
|
||||
return -EAGAIN;
|
||||
if (IS_ERR_OR_NULL(folio))
|
||||
return true;
|
||||
|
||||
s = bch2_folio(folio);
|
||||
if (!s)
|
||||
goto unlock;
|
||||
|
||||
sectors = folio_sectors(folio);
|
||||
for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
|
||||
if (s->s[i].state < SECTOR_dirty ||
|
||||
s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
|
||||
*offset = max(*offset,
|
||||
folio_pos(folio) + (i << SECTOR_SHIFT));
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
*offset = folio_end_pos(folio);
|
||||
ret = false;
|
||||
unlock:
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
loff_t bch2_seek_pagecache_hole(struct inode *vinode,
|
||||
loff_t start_offset,
|
||||
loff_t end_offset,
|
||||
unsigned min_replicas,
|
||||
bool nonblock)
|
||||
{
|
||||
struct address_space *mapping = vinode->i_mapping;
|
||||
loff_t offset = start_offset;
|
||||
|
||||
while (offset < end_offset &&
|
||||
!folio_hole_offset(mapping, &offset, min_replicas, nonblock))
|
||||
;
|
||||
|
||||
return min(offset, end_offset);
|
||||
}
|
||||
|
||||
int bch2_clamp_data_hole(struct inode *inode,
|
||||
u64 *hole_start,
|
||||
u64 *hole_end,
|
||||
unsigned min_replicas,
|
||||
bool nonblock)
|
||||
{
|
||||
loff_t ret;
|
||||
|
||||
ret = bch2_seek_pagecache_hole(inode,
|
||||
*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
*hole_start = ret;
|
||||
|
||||
if (*hole_start == *hole_end)
|
||||
return 0;
|
||||
|
||||
ret = bch2_seek_pagecache_data(inode,
|
||||
*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
*hole_end = ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* NO_BCACHEFS_FS */
|
175
libbcachefs/fs-io-pagecache.h
Normal file
175
libbcachefs/fs-io-pagecache.h
Normal file
@ -0,0 +1,175 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
|
||||
#define _BCACHEFS_FS_IO_PAGECACHE_H
|
||||
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
typedef DARRAY(struct folio *) folios;
|
||||
|
||||
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
|
||||
u64, int, gfp_t, folios *);
|
||||
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
|
||||
|
||||
/*
|
||||
* Use u64 for the end pos and sector helpers because if the folio covers the
|
||||
* max supported range of the mapping, the start offset of the next folio
|
||||
* overflows loff_t. This breaks much of the range based processing in the
|
||||
* buffered write path.
|
||||
*/
|
||||
static inline u64 folio_end_pos(struct folio *folio)
|
||||
{
|
||||
return folio_pos(folio) + folio_size(folio);
|
||||
}
|
||||
|
||||
static inline size_t folio_sectors(struct folio *folio)
|
||||
{
|
||||
return PAGE_SECTORS << folio_order(folio);
|
||||
}
|
||||
|
||||
static inline loff_t folio_sector(struct folio *folio)
|
||||
{
|
||||
return folio_pos(folio) >> 9;
|
||||
}
|
||||
|
||||
static inline u64 folio_end_sector(struct folio *folio)
|
||||
{
|
||||
return folio_end_pos(folio) >> 9;
|
||||
}
|
||||
|
||||
#define BCH_FOLIO_SECTOR_STATE() \
|
||||
x(unallocated) \
|
||||
x(reserved) \
|
||||
x(dirty) \
|
||||
x(dirty_reserved) \
|
||||
x(allocated)
|
||||
|
||||
enum bch_folio_sector_state {
|
||||
#define x(n) SECTOR_##n,
|
||||
BCH_FOLIO_SECTOR_STATE()
|
||||
#undef x
|
||||
};
|
||||
|
||||
struct bch_folio_sector {
|
||||
/* Uncompressed, fully allocated replicas (or on disk reservation): */
|
||||
unsigned nr_replicas:4;
|
||||
|
||||
/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
|
||||
unsigned replicas_reserved:4;
|
||||
|
||||
/* i_sectors: */
|
||||
enum bch_folio_sector_state state:8;
|
||||
};
|
||||
|
||||
struct bch_folio {
|
||||
spinlock_t lock;
|
||||
atomic_t write_count;
|
||||
/*
|
||||
* Is the sector state up to date with the btree?
|
||||
* (Not the data itself)
|
||||
*/
|
||||
bool uptodate;
|
||||
struct bch_folio_sector s[];
|
||||
};
|
||||
|
||||
/* Helper for when we need to add debug instrumentation: */
|
||||
static inline void bch2_folio_sector_set(struct folio *folio,
|
||||
struct bch_folio *s,
|
||||
unsigned i, unsigned n)
|
||||
{
|
||||
s->s[i].state = n;
|
||||
}
|
||||
|
||||
/* file offset (to folio offset) to bch_folio_sector index */
|
||||
static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
|
||||
{
|
||||
u64 f_offset = pos - folio_pos(folio);
|
||||
BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
|
||||
return f_offset >> SECTOR_SHIFT;
|
||||
}
|
||||
|
||||
/* for newly allocated folios: */
|
||||
static inline void __bch2_folio_release(struct folio *folio)
|
||||
{
|
||||
kfree(folio_detach_private(folio));
|
||||
}
|
||||
|
||||
static inline void bch2_folio_release(struct folio *folio)
|
||||
{
|
||||
EBUG_ON(!folio_test_locked(folio));
|
||||
__bch2_folio_release(folio);
|
||||
}
|
||||
|
||||
static inline struct bch_folio *__bch2_folio(struct folio *folio)
|
||||
{
|
||||
return folio_has_private(folio)
|
||||
? (struct bch_folio *) folio_get_private(folio)
|
||||
: NULL;
|
||||
}
|
||||
|
||||
static inline struct bch_folio *bch2_folio(struct folio *folio)
|
||||
{
|
||||
EBUG_ON(!folio_test_locked(folio));
|
||||
|
||||
return __bch2_folio(folio);
|
||||
}
|
||||
|
||||
struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
|
||||
struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
|
||||
|
||||
struct bch2_folio_reservation {
|
||||
struct disk_reservation disk;
|
||||
struct quota_res quota;
|
||||
};
|
||||
|
||||
static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
|
||||
{
|
||||
/* XXX: this should not be open coded */
|
||||
return inode->ei_inode.bi_data_replicas
|
||||
? inode->ei_inode.bi_data_replicas - 1
|
||||
: c->opts.data_replicas;
|
||||
}
|
||||
|
||||
static inline void bch2_folio_reservation_init(struct bch_fs *c,
|
||||
struct bch_inode_info *inode,
|
||||
struct bch2_folio_reservation *res)
|
||||
{
|
||||
memset(res, 0, sizeof(*res));
|
||||
|
||||
res->disk.nr_replicas = inode_nr_replicas(c, inode);
|
||||
}
|
||||
|
||||
int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
|
||||
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
|
||||
|
||||
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
|
||||
void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
|
||||
|
||||
int bch2_get_folio_disk_reservation(struct bch_fs *,
|
||||
struct bch_inode_info *,
|
||||
struct folio *, bool);
|
||||
|
||||
void bch2_folio_reservation_put(struct bch_fs *,
|
||||
struct bch_inode_info *,
|
||||
struct bch2_folio_reservation *);
|
||||
int bch2_folio_reservation_get(struct bch_fs *,
|
||||
struct bch_inode_info *,
|
||||
struct folio *,
|
||||
struct bch2_folio_reservation *,
|
||||
unsigned, unsigned);
|
||||
|
||||
void bch2_set_folio_dirty(struct bch_fs *,
|
||||
struct bch_inode_info *,
|
||||
struct folio *,
|
||||
struct bch2_folio_reservation *,
|
||||
unsigned, unsigned);
|
||||
|
||||
vm_fault_t bch2_page_fault(struct vm_fault *);
|
||||
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
|
||||
void bch2_invalidate_folio(struct folio *, size_t, size_t);
|
||||
bool bch2_release_folio(struct folio *, gfp_t);
|
||||
|
||||
loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
|
||||
loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
|
||||
int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
|
||||
|
||||
#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
|
2812
libbcachefs/fs-io.c
2812
libbcachefs/fs-io.c
File diff suppressed because it is too large
Load Diff
@ -5,29 +5,164 @@
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
#include "buckets.h"
|
||||
#include "fs.h"
|
||||
#include "io_types.h"
|
||||
#include "quota.h"
|
||||
|
||||
#include <linux/uio.h>
|
||||
|
||||
struct quota_res;
|
||||
struct folio_vec {
|
||||
struct folio *fv_folio;
|
||||
size_t fv_offset;
|
||||
size_t fv_len;
|
||||
};
|
||||
|
||||
static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
|
||||
{
|
||||
|
||||
struct folio *folio = page_folio(bv.bv_page);
|
||||
size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
|
||||
bv.bv_offset;
|
||||
size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
|
||||
|
||||
return (struct folio_vec) {
|
||||
.fv_folio = folio,
|
||||
.fv_offset = offset,
|
||||
.fv_len = len,
|
||||
};
|
||||
}
|
||||
|
||||
static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
|
||||
struct bvec_iter iter)
|
||||
{
|
||||
return biovec_to_foliovec(bio_iter_iovec(bio, iter));
|
||||
}
|
||||
|
||||
#define __bio_for_each_folio(bvl, bio, iter, start) \
|
||||
for (iter = (start); \
|
||||
(iter).bi_size && \
|
||||
((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
|
||||
bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
|
||||
|
||||
/**
|
||||
* bio_for_each_folio - iterate over folios within a bio
|
||||
*
|
||||
* Like other non-_all versions, this iterates over what bio->bi_iter currently
|
||||
* points to. This version is for drivers, where the bio may have previously
|
||||
* been split or cloned.
|
||||
*/
|
||||
#define bio_for_each_folio(bvl, bio, iter) \
|
||||
__bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
|
||||
|
||||
struct quota_res {
	u64				sectors;
};

#ifdef CONFIG_BCACHEFS_QUOTA

static inline void __bch2_quota_reservation_put(struct bch_fs *c,
						struct bch_inode_info *inode,
						struct quota_res *res)
{
	BUG_ON(res->sectors > inode->ei_quota_reserved);

	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
	inode->ei_quota_reserved -= res->sectors;
	res->sectors = 0;
}

static inline void bch2_quota_reservation_put(struct bch_fs *c,
					       struct bch_inode_info *inode,
					       struct quota_res *res)
{
	if (res->sectors) {
		mutex_lock(&inode->ei_quota_lock);
		__bch2_quota_reservation_put(c, inode, res);
		mutex_unlock(&inode->ei_quota_lock);
	}
}

static inline int bch2_quota_reservation_add(struct bch_fs *c,
					     struct bch_inode_info *inode,
					     struct quota_res *res,
					     u64 sectors,
					     bool check_enospc)
{
	int ret;

	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
		return 0;

	mutex_lock(&inode->ei_quota_lock);
	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
	if (likely(!ret)) {
		inode->ei_quota_reserved += sectors;
		res->sectors += sectors;
	}
	mutex_unlock(&inode->ei_quota_lock);

	return ret;
}

#else

static void __bch2_quota_reservation_put(struct bch_fs *c,
					 struct bch_inode_info *inode,
					 struct quota_res *res) {}

static void bch2_quota_reservation_put(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       struct quota_res *res) {}

static int bch2_quota_reservation_add(struct bch_fs *c,
				      struct bch_inode_info *inode,
				      struct quota_res *res,
				      unsigned sectors,
				      bool check_enospc)
{
	return 0;
}

#endif
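A hedged usage sketch of the reserve/consume pattern above; example_reserve_for_write() and its call site are illustrative only, not this tree's actual buffered-write path.

	/*
	 * Illustrative only: take a quota reservation before dirtying pagecache,
	 * then release whatever is still reserved once the sectors have been
	 * accounted (e.g. via bch2_i_sectors_acct()).
	 */
	static int example_reserve_for_write(struct bch_fs *c,
					     struct bch_inode_info *inode,
					     u64 sectors)
	{
		struct quota_res quota_res = { 0 };
		int ret = bch2_quota_reservation_add(c, inode, &quota_res, sectors, true);

		if (ret)
			return ret;	/* e.g. quota exceeded when check_enospc is true */

		/* ... dirty folios, account the new sectors ... */

		bch2_quota_reservation_put(c, inode, &quota_res);
		return 0;
	}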

void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
			   struct quota_res *, s64);

static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
				       struct quota_res *quota_res, s64 sectors)
{
	if (sectors) {
		mutex_lock(&inode->ei_quota_lock);
		__bch2_i_sectors_acct(c, inode, quota_res, sectors);
		mutex_unlock(&inode->ei_quota_lock);
	}
}

static inline struct address_space *faults_disabled_mapping(void)
{
	return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
}

static inline void set_fdm_dropped_locks(void)
{
	current->faults_disabled_mapping =
		(void *) (((unsigned long) current->faults_disabled_mapping)|1);
}

static inline bool fdm_dropped_locks(void)
{
	return ((unsigned long) current->faults_disabled_mapping) & 1;
}
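The three helpers above stash a one-bit flag in the low bit of current->faults_disabled_mapping. A minimal stand-alone sketch of that pointer-tagging idea, with illustrative names:

	/* Illustrative: an aligned pointer's low bit is spare, so it can carry a flag. */
	static inline void *tag_ptr(void *p, bool flag)
	{
		return (void *) (((unsigned long) p & ~1UL) | flag);
	}

	static inline bool ptr_tag(const void *p)
	{
		return (unsigned long) p & 1;
	}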
|
||||
void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
|
||||
struct bch_inode_info *, struct closure *);
|
||||
|
||||
int __must_check bch2_write_inode_size(struct bch_fs *,
|
||||
struct bch_inode_info *,
|
||||
loff_t, unsigned);
|
||||
|
||||
int bch2_read_folio(struct file *, struct folio *);
|
||||
|
||||
int bch2_writepages(struct address_space *, struct writeback_control *);
|
||||
void bch2_readahead(struct readahead_control *);
|
||||
|
||||
int bch2_write_begin(struct file *, struct address_space *, loff_t,
|
||||
unsigned, struct page **, void **);
|
||||
int bch2_write_end(struct file *, struct address_space *, loff_t,
|
||||
unsigned, unsigned, struct page *, void *);
|
||||
|
||||
ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
|
||||
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
|
||||
|
||||
int bch2_fsync(struct file *, loff_t, loff_t, int);
|
||||
|
||||
int bch2_truncate(struct mnt_idmap *,
|
||||
@ -39,11 +174,6 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
|
||||
|
||||
loff_t bch2_llseek(struct file *, loff_t, int);
|
||||
|
||||
vm_fault_t bch2_page_fault(struct vm_fault *);
|
||||
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
|
||||
void bch2_invalidate_folio(struct folio *, size_t, size_t);
|
||||
bool bch2_release_folio(struct folio *, gfp_t);
|
||||
|
||||
void bch2_fs_fsio_exit(struct bch_fs *);
|
||||
int bch2_fs_fsio_init(struct bch_fs *);
|
||||
#else
|
||||
|
@ -14,6 +14,8 @@
|
||||
#include "fs-common.h"
|
||||
#include "fs-io.h"
|
||||
#include "fs-ioctl.h"
|
||||
#include "fs-io-buffered.h"
|
||||
#include "fs-io-pagecache.h"
|
||||
#include "fsck.h"
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
@ -203,7 +205,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
|
||||
|
||||
if (ret) {
|
||||
iget_failed(&inode->v);
|
||||
return ERR_PTR(ret);
|
||||
return ERR_PTR(bch2_err_class(ret));
|
||||
}
|
||||
|
||||
mutex_lock(&c->vfs_inodes_lock);
|
||||
@ -1000,11 +1002,16 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
|
||||
{
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
int ret;
|
||||
|
||||
if (!dir_emit_dots(file, ctx))
|
||||
return 0;
|
||||
|
||||
return bch2_readdir(c, inode_inum(inode), ctx);
|
||||
ret = bch2_readdir(c, inode_inum(inode), ctx);
|
||||
if (ret)
|
||||
bch_err_fn(c, ret);
|
||||
|
||||
return bch2_err_class(ret);
|
||||
}
|
||||
|
||||
static const struct file_operations bch_file_operations = {
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "fsck.h"
|
||||
#include "inode.h"
|
||||
#include "keylist.h"
|
||||
#include "recovery.h"
|
||||
#include "subvolume.h"
|
||||
#include "super.h"
|
||||
#include "xattr.h"
|
||||
|
@ -348,6 +348,8 @@ int bch2_inode_peek(struct btree_trans *trans,
|
||||
return 0;
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -520,23 +522,25 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
|
||||
__bch2_inode_unpacked_to_text(out, &inode);
|
||||
}
|
||||
|
||||
static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
|
||||
static inline u64 bkey_inode_flags(struct bkey_s_c k)
|
||||
{
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_inode:
|
||||
return bkey_s_c_to_inode(k).v->bi_flags &
|
||||
cpu_to_le32(BCH_INODE_UNLINKED);
|
||||
return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
|
||||
case KEY_TYPE_inode_v2:
|
||||
return bkey_s_c_to_inode_v2(k).v->bi_flags &
|
||||
cpu_to_le32(BCH_INODE_UNLINKED);
|
||||
return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
|
||||
case KEY_TYPE_inode_v3:
|
||||
return bkey_s_c_to_inode_v3(k).v->bi_flags &
|
||||
cpu_to_le64(BCH_INODE_UNLINKED);
|
||||
return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
|
||||
default:
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
|
||||
{
|
||||
return bkey_inode_flags(k) & BCH_INODE_UNLINKED;
|
||||
}
|
||||
|
||||
int bch2_trans_mark_inode(struct btree_trans *trans,
|
||||
enum btree_id btree_id, unsigned level,
|
||||
struct bkey_s_c old,
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "journal_reclaim.h"
|
||||
#include "journal_seq_blacklist.h"
|
||||
#include "replicas.h"
|
||||
#include "sb-clean.h"
|
||||
#include "trace.h"
|
||||
|
||||
static struct nonce journal_nonce(const struct jset *jset)
|
||||
@ -208,33 +209,41 @@ static void journal_entry_null_range(void *start, void *end)
|
||||
#define JOURNAL_ENTRY_BAD 7
|
||||
|
||||
static void journal_entry_err_msg(struct printbuf *out,
|
||||
u32 version,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry)
|
||||
{
|
||||
prt_str(out, "invalid journal entry ");
|
||||
if (entry)
|
||||
prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
|
||||
prt_str(out, "invalid journal entry, version=");
|
||||
bch2_version_to_text(out, version);
|
||||
|
||||
if (entry) {
|
||||
prt_str(out, " type=");
|
||||
prt_str(out, bch2_jset_entry_types[entry->type]);
|
||||
}
|
||||
|
||||
if (!jset) {
|
||||
prt_printf(out, " in superblock");
|
||||
} else {
|
||||
|
||||
prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
|
||||
|
||||
if (entry)
|
||||
prt_printf(out, " offset=%zi/%u",
|
||||
(u64 *) entry - jset->_data,
|
||||
le32_to_cpu(jset->u64s));
|
||||
}
|
||||
|
||||
if (!jset)
|
||||
prt_printf(out, "in superblock");
|
||||
else if (!entry)
|
||||
prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
|
||||
else
|
||||
prt_printf(out, "at offset %zi/%u seq %llu",
|
||||
(u64 *) entry - jset->_data,
|
||||
le32_to_cpu(jset->u64s),
|
||||
le64_to_cpu(jset->seq));
|
||||
prt_str(out, ": ");
|
||||
}
|
||||
|
||||
#define journal_entry_err(c, jset, entry, msg, ...) \
|
||||
#define journal_entry_err(c, version, jset, entry, msg, ...) \
|
||||
({ \
|
||||
struct printbuf buf = PRINTBUF; \
|
||||
\
|
||||
journal_entry_err_msg(&buf, jset, entry); \
|
||||
journal_entry_err_msg(&buf, version, jset, entry); \
|
||||
prt_printf(&buf, msg, ##__VA_ARGS__); \
|
||||
\
|
||||
switch (write) { \
|
||||
switch (flags & BKEY_INVALID_WRITE) { \
|
||||
case READ: \
|
||||
mustfix_fsck_err(c, "%s", buf.buf); \
|
||||
break; \
|
||||
@ -251,8 +260,8 @@ static void journal_entry_err_msg(struct printbuf *out,
|
||||
true; \
|
||||
})
|
||||
|
||||
#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \
|
||||
((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
|
||||
#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \
|
||||
((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false)
|
||||
|
||||
#define FSCK_DELETED_KEY 5
|
||||
|
||||
@ -261,13 +270,15 @@ static int journal_validate_key(struct bch_fs *c,
|
||||
struct jset_entry *entry,
|
||||
unsigned level, enum btree_id btree_id,
|
||||
struct bkey_i *k,
|
||||
unsigned version, int big_endian, int write)
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
int write = flags & BKEY_INVALID_WRITE;
|
||||
void *next = vstruct_next(entry);
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
|
||||
if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) {
|
||||
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
|
||||
journal_entry_null_range(vstruct_next(entry), next);
|
||||
return FSCK_DELETED_KEY;
|
||||
@ -275,7 +286,7 @@ static int journal_validate_key(struct bch_fs *c,
|
||||
|
||||
if (journal_entry_err_on((void *) bkey_next(k) >
|
||||
(void *) vstruct_next(entry),
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"extends past end of journal entry")) {
|
||||
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
|
||||
journal_entry_null_range(vstruct_next(entry), next);
|
||||
@ -283,7 +294,7 @@ static int journal_validate_key(struct bch_fs *c,
|
||||
}
|
||||
|
||||
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"bad format %u", k->k.format)) {
|
||||
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
|
||||
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
|
||||
@ -298,11 +309,7 @@ static int journal_validate_key(struct bch_fs *c,
|
||||
if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
|
||||
__btree_node_type(level, btree_id), write, &buf)) {
|
||||
printbuf_reset(&buf);
|
||||
prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
|
||||
bch2_jset_entry_types[entry->type],
|
||||
(u64 *) entry - jset->_data,
|
||||
le32_to_cpu(jset->u64s),
|
||||
le64_to_cpu(jset->seq));
|
||||
journal_entry_err_msg(&buf, version, jset, entry);
|
||||
prt_newline(&buf);
|
||||
printbuf_indent_add(&buf, 2);
|
||||
|
||||
@ -312,6 +319,7 @@ static int journal_validate_key(struct bch_fs *c,
|
||||
__btree_node_type(level, btree_id), write, &buf);
|
||||
|
||||
mustfix_fsck_err(c, "%s", buf.buf);
|
||||
BUG();
|
||||
|
||||
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
|
||||
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
|
||||
@ -330,9 +338,10 @@ fsck_err:
|
||||
}
|
||||
|
||||
static int journal_entry_btree_keys_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct bkey_i *k = entry->start;
|
||||
|
||||
@ -341,7 +350,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
|
||||
entry->level,
|
||||
entry->btree_id,
|
||||
k, version, big_endian,
|
||||
write|BKEY_INVALID_JOURNAL);
|
||||
flags|BKEY_INVALID_JOURNAL);
|
||||
if (ret == FSCK_DELETED_KEY)
|
||||
continue;
|
||||
|
||||
@ -369,16 +378,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
|
||||
}
|
||||
|
||||
static int journal_entry_btree_root_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct bkey_i *k = entry->start;
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(!entry->u64s ||
|
||||
le16_to_cpu(entry->u64s) != k->k.u64s,
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"invalid btree root journal entry: wrong number of keys")) {
|
||||
void *next = vstruct_next(entry);
|
||||
/*
|
||||
@ -392,7 +402,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
|
||||
}
|
||||
|
||||
return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
|
||||
version, big_endian, write);
|
||||
version, big_endian, flags);
|
||||
fsck_err:
|
||||
return ret;
|
||||
}
|
||||
@ -404,9 +414,10 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs
|
||||
}
|
||||
|
||||
static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
/* obsolete, don't care: */
|
||||
return 0;
|
||||
@ -418,14 +429,15 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs
|
||||
}
|
||||
|
||||
static int journal_entry_blacklist_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"invalid journal seq blacklist entry: bad size")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
}
|
||||
@ -443,15 +455,16 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs
|
||||
}
|
||||
|
||||
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct jset_entry_blacklist_v2 *bl_entry;
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"invalid journal seq blacklist entry: bad size")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
goto out;
|
||||
@ -461,7 +474,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
|
||||
|
||||
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
|
||||
le64_to_cpu(bl_entry->end),
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"invalid journal seq blacklist entry: start > end")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
}
|
||||
@ -482,9 +495,10 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_
|
||||
}
|
||||
|
||||
static int journal_entry_usage_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct jset_entry_usage *u =
|
||||
container_of(entry, struct jset_entry_usage, entry);
|
||||
@ -492,7 +506,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(bytes < sizeof(*u),
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"invalid journal entry usage: bad size")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
@ -514,9 +528,10 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
}
|
||||
|
||||
static int journal_entry_data_usage_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct jset_entry_data_usage *u =
|
||||
container_of(entry, struct jset_entry_data_usage, entry);
|
||||
@ -525,7 +540,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
|
||||
|
||||
if (journal_entry_err_on(bytes < sizeof(*u) ||
|
||||
bytes < sizeof(*u) + u->r.nr_devs,
|
||||
c, jset, entry,
|
||||
c, version, jset, entry,
|
||||
"invalid journal entry usage: bad size")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
@ -546,9 +561,10 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs
|
||||
}
|
||||
|
||||
static int journal_entry_clock_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct jset_entry_clock *clock =
|
||||
container_of(entry, struct jset_entry_clock, entry);
|
||||
@ -556,13 +572,13 @@ static int journal_entry_clock_validate(struct bch_fs *c,
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(bytes != sizeof(*clock),
|
||||
c, jset, entry, "bad size")) {
|
||||
c, version, jset, entry, "bad size")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (journal_entry_err_on(clock->rw > 1,
|
||||
c, jset, entry, "bad rw")) {
|
||||
c, version, jset, entry, "bad rw")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
}
|
||||
@ -581,9 +597,10 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
}
|
||||
|
||||
static int journal_entry_dev_usage_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct jset_entry_dev_usage *u =
|
||||
container_of(entry, struct jset_entry_dev_usage, entry);
|
||||
@ -593,7 +610,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
|
||||
int ret = 0;
|
||||
|
||||
if (journal_entry_err_on(bytes < expected,
|
||||
c, jset, entry, "bad size (%u < %u)",
|
||||
c, version, jset, entry, "bad size (%u < %u)",
|
||||
bytes, expected)) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
@ -602,13 +619,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
|
||||
dev = le32_to_cpu(u->dev);
|
||||
|
||||
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
|
||||
c, jset, entry, "bad dev")) {
|
||||
c, version, jset, entry, "bad dev")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (journal_entry_err_on(u->pad,
|
||||
c, jset, entry, "bad pad")) {
|
||||
c, version, jset, entry, "bad pad")) {
|
||||
journal_entry_null_range(entry, vstruct_next(entry));
|
||||
return ret;
|
||||
}
|
||||
@ -641,9 +658,10 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
|
||||
}
|
||||
|
||||
static int journal_entry_log_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@ -658,9 +676,10 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
}
|
||||
|
||||
static int journal_entry_overwrite_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
return journal_entry_btree_keys_validate(c, jset, entry,
|
||||
version, big_endian, READ);
|
||||
@ -674,7 +693,8 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
|
||||
|
||||
struct jset_entry_ops {
|
||||
int (*validate)(struct bch_fs *, struct jset *,
|
||||
struct jset_entry *, unsigned, int, int);
|
||||
struct jset_entry *, unsigned, int,
|
||||
enum bkey_invalid_flags);
|
||||
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
|
||||
};
|
||||
|
||||
@ -691,11 +711,12 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
|
||||
int bch2_journal_entry_validate(struct bch_fs *c,
|
||||
struct jset *jset,
|
||||
struct jset_entry *entry,
|
||||
unsigned version, int big_endian, int write)
|
||||
unsigned version, int big_endian,
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
return entry->type < BCH_JSET_ENTRY_NR
|
||||
? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
|
||||
version, big_endian, write)
|
||||
version, big_endian, flags)
|
||||
: 0;
|
||||
}
|
||||
|
||||
@ -711,22 +732,22 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
}
|
||||
|
||||
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
|
||||
int write)
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
struct jset_entry *entry;
|
||||
unsigned version = le32_to_cpu(jset->version);
|
||||
int ret = 0;
|
||||
|
||||
vstruct_for_each(jset, entry) {
|
||||
if (journal_entry_err_on(vstruct_next(entry) >
|
||||
vstruct_last(jset), c, jset, entry,
|
||||
if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
|
||||
c, version, jset, entry,
|
||||
"journal entry extends past end of jset")) {
|
||||
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
|
||||
break;
|
||||
}
|
||||
|
||||
ret = bch2_journal_entry_validate(c, jset, entry,
|
||||
le32_to_cpu(jset->version),
|
||||
JSET_BIG_ENDIAN(jset), write);
|
||||
version, JSET_BIG_ENDIAN(jset), flags);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
@ -737,7 +758,7 @@ fsck_err:
|
||||
static int jset_validate(struct bch_fs *c,
|
||||
struct bch_dev *ca,
|
||||
struct jset *jset, u64 sector,
|
||||
int write)
|
||||
enum bkey_invalid_flags flags)
|
||||
{
|
||||
unsigned version;
|
||||
int ret = 0;
|
||||
@ -746,7 +767,8 @@ static int jset_validate(struct bch_fs *c,
|
||||
return JOURNAL_ENTRY_NONE;
|
||||
|
||||
version = le32_to_cpu(jset->version);
|
||||
if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
|
||||
if (journal_entry_err_on(!bch2_version_compatible(version),
|
||||
c, version, jset, NULL,
|
||||
"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
|
||||
ca ? ca->name : c->name,
|
||||
sector, le64_to_cpu(jset->seq),
|
||||
@ -757,7 +779,7 @@ static int jset_validate(struct bch_fs *c,
|
||||
}
|
||||
|
||||
if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
|
||||
c, jset, NULL,
|
||||
c, version, jset, NULL,
|
||||
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
|
||||
ca ? ca->name : c->name,
|
||||
sector, le64_to_cpu(jset->seq),
|
||||
@ -767,7 +789,7 @@ static int jset_validate(struct bch_fs *c,
|
||||
/* last_seq is ignored when JSET_NO_FLUSH is true */
|
||||
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
|
||||
le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
|
||||
c, jset, NULL,
|
||||
c, version, jset, NULL,
|
||||
"invalid journal entry: last_seq > seq (%llu > %llu)",
|
||||
le64_to_cpu(jset->last_seq),
|
||||
le64_to_cpu(jset->seq))) {
|
||||
@ -775,7 +797,7 @@ static int jset_validate(struct bch_fs *c,
|
||||
return JOURNAL_ENTRY_BAD;
|
||||
}
|
||||
|
||||
ret = jset_validate_entries(c, jset, write);
|
||||
ret = jset_validate_entries(c, jset, flags);
|
||||
fsck_err:
|
||||
return ret;
|
||||
}
|
||||
@ -788,14 +810,15 @@ static int jset_validate_early(struct bch_fs *c,
|
||||
{
|
||||
size_t bytes = vstruct_bytes(jset);
|
||||
unsigned version;
|
||||
int write = READ;
|
||||
enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
|
||||
int ret = 0;
|
||||
|
||||
if (le64_to_cpu(jset->magic) != jset_magic(c))
|
||||
return JOURNAL_ENTRY_NONE;
|
||||
|
||||
version = le32_to_cpu(jset->version);
|
||||
if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
|
||||
if (journal_entry_err_on(!bch2_version_compatible(version),
|
||||
c, version, jset, NULL,
|
||||
"%s sector %llu seq %llu: unknown journal entry version %u.%u",
|
||||
ca ? ca->name : c->name,
|
||||
sector, le64_to_cpu(jset->seq),
|
||||
@ -810,7 +833,7 @@ static int jset_validate_early(struct bch_fs *c,
|
||||
return JOURNAL_ENTRY_REREAD;
|
||||
|
||||
if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
|
||||
c, jset, NULL,
|
||||
c, version, jset, NULL,
|
||||
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
|
||||
ca ? ca->name : c->name,
|
||||
sector, le64_to_cpu(jset->seq), bytes))
|
||||
@ -1127,7 +1150,7 @@ int bch2_journal_read(struct bch_fs *c,
|
||||
* those entries will be blacklisted:
|
||||
*/
|
||||
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
|
||||
int write = READ;
|
||||
enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
|
||||
|
||||
i = *_i;
|
||||
|
||||
@ -1149,7 +1172,7 @@ int bch2_journal_read(struct bch_fs *c,
|
||||
}
|
||||
|
||||
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
|
||||
c, &i->j, NULL,
|
||||
c, le32_to_cpu(i->j.version), &i->j, NULL,
|
||||
"invalid journal entry: last_seq > seq (%llu > %llu)",
|
||||
le64_to_cpu(i->j.last_seq),
|
||||
le64_to_cpu(i->j.seq)))
|
||||
|
@ -50,7 +50,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
|
||||
jset_entry_for_each_key(entry, k)
|
||||
|
||||
int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
|
||||
struct jset_entry *, unsigned, int, int);
|
||||
struct jset_entry *, unsigned, int,
|
||||
enum bkey_invalid_flags);
|
||||
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
|
||||
struct jset_entry *);
|
||||
|
||||
|
@ -3,13 +3,14 @@
|
||||
#include "bcachefs.h"
|
||||
#include "btree_key_cache.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "errcode.h"
|
||||
#include "error.h"
|
||||
#include "journal.h"
|
||||
#include "journal_io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "replicas.h"
|
||||
#include "super.h"
|
||||
#include "sb-members.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/kthread.h>
|
||||
|
@@ -220,8 +220,10 @@ static int bch2_copygc(struct btree_trans *trans,

		f = move_bucket_in_flight_add(buckets_in_flight, *i);
		ret = PTR_ERR_OR_ZERO(f);
		if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */
		if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
			ret = 0;
			continue;
		}
		if (ret == -ENOMEM) { /* flush IO, continue later */
			ret = 0;
			break;
|
@ -5,6 +5,7 @@
|
||||
#include "bkey_buf.h"
|
||||
#include "alloc_background.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
@ -23,6 +24,7 @@
|
||||
#include "quota.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
#include "sb-clean.h"
|
||||
#include "subvolume.h"
|
||||
#include "super-io.h"
|
||||
|
||||
@ -57,524 +59,6 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
|
||||
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
|
||||
}
|
||||
|
||||
/* iterate over keys read from the journal: */
|
||||
|
||||
static int __journal_key_cmp(enum btree_id l_btree_id,
|
||||
unsigned l_level,
|
||||
struct bpos l_pos,
|
||||
const struct journal_key *r)
|
||||
{
|
||||
return (cmp_int(l_btree_id, r->btree_id) ?:
|
||||
cmp_int(l_level, r->level) ?:
|
||||
bpos_cmp(l_pos, r->k->k.p));
|
||||
}
|
||||
|
||||
static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
|
||||
{
|
||||
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
|
||||
}
|
||||
|
||||
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
|
||||
{
|
||||
size_t gap_size = keys->size - keys->nr;
|
||||
|
||||
if (idx >= keys->gap)
|
||||
idx += gap_size;
|
||||
return idx;
|
||||
}
|
||||
|
||||
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
|
||||
{
|
||||
return keys->d + idx_to_pos(keys, idx);
|
||||
}
|
||||
|
||||
static size_t __bch2_journal_key_search(struct journal_keys *keys,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
size_t l = 0, r = keys->nr, m;
|
||||
|
||||
while (l < r) {
|
||||
m = l + ((r - l) >> 1);
|
||||
if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
|
||||
l = m + 1;
|
||||
else
|
||||
r = m;
|
||||
}
|
||||
|
||||
BUG_ON(l < keys->nr &&
|
||||
__journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
|
||||
|
||||
BUG_ON(l &&
|
||||
__journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
static size_t bch2_journal_key_search(struct journal_keys *keys,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
|
||||
unsigned level, struct bpos pos,
|
||||
struct bpos end_pos, size_t *idx)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
unsigned iters = 0;
|
||||
struct journal_key *k;
|
||||
search:
|
||||
if (!*idx)
|
||||
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
|
||||
|
||||
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
||||
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
|
||||
return NULL;
|
||||
|
||||
if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
|
||||
!k->overwritten)
|
||||
return k->k;
|
||||
|
||||
(*idx)++;
|
||||
iters++;
|
||||
if (iters == 10) {
|
||||
*idx = 0;
|
||||
goto search;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
size_t idx = 0;
|
||||
|
||||
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
|
||||
}
|
||||
|
||||
static void journal_iters_fix(struct bch_fs *c)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
/* The key we just inserted is immediately before the gap: */
|
||||
size_t gap_end = keys->gap + (keys->size - keys->nr);
|
||||
struct btree_and_journal_iter *iter;
|
||||
|
||||
/*
|
||||
* If an iterator points one after the key we just inserted, decrement
|
||||
* the iterator so it points at the key we just inserted - if the
|
||||
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
|
||||
* handle that:
|
||||
*/
|
||||
list_for_each_entry(iter, &c->journal_iters, journal.list)
|
||||
if (iter->journal.idx == gap_end)
|
||||
iter->journal.idx = keys->gap - 1;
|
||||
}
|
||||
|
||||
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
struct journal_iter *iter;
|
||||
size_t gap_size = keys->size - keys->nr;
|
||||
|
||||
list_for_each_entry(iter, &c->journal_iters, list) {
|
||||
if (iter->idx > old_gap)
|
||||
iter->idx -= gap_size;
|
||||
if (iter->idx >= new_gap)
|
||||
iter->idx += gap_size;
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
|
||||
unsigned level, struct bkey_i *k)
|
||||
{
|
||||
struct journal_key n = {
|
||||
.btree_id = id,
|
||||
.level = level,
|
||||
.k = k,
|
||||
.allocated = true,
|
||||
/*
|
||||
* Ensure these keys are done last by journal replay, to unblock
|
||||
* journal reclaim:
|
||||
*/
|
||||
.journal_seq = U32_MAX,
|
||||
};
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
|
||||
|
||||
BUG_ON(test_bit(BCH_FS_RW, &c->flags));
|
||||
|
||||
if (idx < keys->size &&
|
||||
journal_key_cmp(&n, &keys->d[idx]) == 0) {
|
||||
if (keys->d[idx].allocated)
|
||||
kfree(keys->d[idx].k);
|
||||
keys->d[idx] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (idx > keys->gap)
|
||||
idx -= keys->size - keys->nr;
|
||||
|
||||
if (keys->nr == keys->size) {
|
||||
struct journal_keys new_keys = {
|
||||
.nr = keys->nr,
|
||||
.size = max_t(size_t, keys->size, 8) * 2,
|
||||
};
|
||||
|
||||
new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
|
||||
if (!new_keys.d) {
|
||||
bch_err(c, "%s: error allocating new key array (size %zu)",
|
||||
__func__, new_keys.size);
|
||||
return -BCH_ERR_ENOMEM_journal_key_insert;
|
||||
}
|
||||
|
||||
/* Since @keys was full, there was no gap: */
|
||||
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
|
||||
kvfree(keys->d);
|
||||
*keys = new_keys;
|
||||
|
||||
/* And now the gap is at the end: */
|
||||
keys->gap = keys->nr;
|
||||
}
|
||||
|
||||
journal_iters_move_gap(c, keys->gap, idx);
|
||||
|
||||
move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
|
||||
keys->gap = idx;
|
||||
|
||||
keys->nr++;
|
||||
keys->d[keys->gap++] = n;
|
||||
|
||||
journal_iters_fix(c);
|
||||
|
||||
return 0;
|
||||
}
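The insert path above relies on journal_keys being a gap buffer: live entries sit on either side of an unused gap, move_gap() relocates the gap to the insert position, and idx_to_pos() maps logical indices over it. A self-contained sketch of that index mapping, with illustrative names:

	/* Illustrative, mirroring idx_to_pos(): logical index -> array position,
	 * skipping the unused gap of (size - nr) slots that starts at 'gap'. */
	static size_t example_idx_to_pos(size_t idx, size_t gap, size_t nr, size_t size)
	{
		size_t gap_size = size - nr;

		if (idx >= gap)
			idx += gap_size;
		return idx;
	}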
|
||||
|
||||
/*
|
||||
* Can only be used from the recovery thread while we're still RO - can't be
|
||||
* used once we've got RW, as journal_keys is at that point used by multiple
|
||||
* threads:
|
||||
*/
|
||||
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
|
||||
unsigned level, struct bkey_i *k)
|
||||
{
|
||||
struct bkey_i *n;
|
||||
int ret;
|
||||
|
||||
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
|
||||
if (!n)
|
||||
return -BCH_ERR_ENOMEM_journal_key_insert;
|
||||
|
||||
bkey_copy(n, k);
|
||||
ret = bch2_journal_key_insert_take(c, id, level, n);
|
||||
if (ret)
|
||||
kfree(n);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
struct bkey_i whiteout;
|
||||
|
||||
bkey_init(&whiteout.k);
|
||||
whiteout.k.p = pos;
|
||||
|
||||
return bch2_journal_key_insert(c, id, level, &whiteout);
|
||||
}
|
||||
|
||||
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
|
||||
|
||||
if (idx < keys->size &&
|
||||
keys->d[idx].btree_id == btree &&
|
||||
keys->d[idx].level == level &&
|
||||
bpos_eq(keys->d[idx].k->k.p, pos))
|
||||
keys->d[idx].overwritten = true;
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_advance(struct journal_iter *iter)
|
||||
{
|
||||
if (iter->idx < iter->keys->size) {
|
||||
iter->idx++;
|
||||
if (iter->idx == iter->keys->gap)
|
||||
iter->idx += iter->keys->size - iter->keys->nr;
|
||||
}
|
||||
}
|
||||
|
||||
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
|
||||
{
|
||||
struct journal_key *k = iter->keys->d + iter->idx;
|
||||
|
||||
while (k < iter->keys->d + iter->keys->size &&
|
||||
k->btree_id == iter->btree_id &&
|
||||
k->level == iter->level) {
|
||||
if (!k->overwritten)
|
||||
return bkey_i_to_s_c(k->k);
|
||||
|
||||
bch2_journal_iter_advance(iter);
|
||||
k = iter->keys->d + iter->idx;
|
||||
}
|
||||
|
||||
return bkey_s_c_null;
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_exit(struct journal_iter *iter)
|
||||
{
|
||||
list_del(&iter->list);
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_init(struct bch_fs *c,
|
||||
struct journal_iter *iter,
|
||||
enum btree_id id, unsigned level,
|
||||
struct bpos pos)
|
||||
{
|
||||
iter->btree_id = id;
|
||||
iter->level = level;
|
||||
iter->keys = &c->journal_keys;
|
||||
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
|
||||
}
|
||||
|
||||
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
|
||||
iter->b, &iter->unpacked);
|
||||
}
|
||||
|
||||
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
if (bpos_eq(iter->pos, SPOS_MAX))
|
||||
iter->at_end = true;
|
||||
else
|
||||
iter->pos = bpos_successor(iter->pos);
|
||||
}
|
||||
|
||||
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
struct bkey_s_c btree_k, journal_k, ret;
|
||||
again:
|
||||
if (iter->at_end)
|
||||
return bkey_s_c_null;
|
||||
|
||||
while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
|
||||
bpos_lt(btree_k.k->p, iter->pos))
|
||||
bch2_journal_iter_advance_btree(iter);
|
||||
|
||||
while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
|
||||
bpos_lt(journal_k.k->p, iter->pos))
|
||||
bch2_journal_iter_advance(&iter->journal);
|
||||
|
||||
ret = journal_k.k &&
|
||||
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
|
||||
? journal_k
|
||||
: btree_k;
|
||||
|
||||
if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
|
||||
ret = bkey_s_c_null;
|
||||
|
||||
if (ret.k) {
|
||||
iter->pos = ret.k->p;
|
||||
if (bkey_deleted(ret.k)) {
|
||||
bch2_btree_and_journal_iter_advance(iter);
|
||||
goto again;
|
||||
}
|
||||
} else {
|
||||
iter->pos = SPOS_MAX;
|
||||
iter->at_end = true;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
|
||||
{
|
||||
bch2_journal_iter_exit(&iter->journal);
|
||||
}
|
||||
|
||||
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
|
||||
struct bch_fs *c,
|
||||
struct btree *b,
|
||||
struct btree_node_iter node_iter,
|
||||
struct bpos pos)
|
||||
{
|
||||
memset(iter, 0, sizeof(*iter));
|
||||
|
||||
iter->b = b;
|
||||
iter->node_iter = node_iter;
|
||||
bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
|
||||
INIT_LIST_HEAD(&iter->journal.list);
|
||||
iter->pos = b->data->min_key;
|
||||
iter->at_end = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* this version is used by btree_gc before filesystem has gone RW and
|
||||
* multithreaded, so uses the journal_iters list:
|
||||
*/
|
||||
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
|
||||
struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
struct btree_node_iter node_iter;
|
||||
|
||||
bch2_btree_node_iter_init_from_start(&node_iter, b);
|
||||
__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
|
||||
list_add(&iter->journal.list, &c->journal_iters);
|
||||
}
|
||||
|
||||
/* sort and dedup all keys in the journal: */
|
||||
|
||||
void bch2_journal_entries_free(struct bch_fs *c)
|
||||
{
|
||||
struct journal_replay **i;
|
||||
struct genradix_iter iter;
|
||||
|
||||
genradix_for_each(&c->journal_entries, iter, i)
|
||||
if (*i)
|
||||
kvpfree(*i, offsetof(struct journal_replay, j) +
|
||||
vstruct_bytes(&(*i)->j));
|
||||
genradix_free(&c->journal_entries);
|
||||
}
|
||||
|
||||
/*
|
||||
* When keys compare equal, oldest compares first:
|
||||
*/
|
||||
static int journal_sort_key_cmp(const void *_l, const void *_r)
|
||||
{
|
||||
const struct journal_key *l = _l;
|
||||
const struct journal_key *r = _r;
|
||||
|
||||
return journal_key_cmp(l, r) ?:
|
||||
cmp_int(l->journal_seq, r->journal_seq) ?:
|
||||
cmp_int(l->journal_offset, r->journal_offset);
|
||||
}
|
||||
|
||||
void bch2_journal_keys_free(struct journal_keys *keys)
|
||||
{
|
||||
struct journal_key *i;
|
||||
|
||||
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
|
||||
keys->gap = keys->nr;
|
||||
|
||||
for (i = keys->d; i < keys->d + keys->nr; i++)
|
||||
if (i->allocated)
|
||||
kfree(i->k);
|
||||
|
||||
kvfree(keys->d);
|
||||
keys->d = NULL;
|
||||
keys->nr = keys->gap = keys->size = 0;
|
||||
}
|
||||
|
||||
static void __journal_keys_sort(struct journal_keys *keys)
|
||||
{
|
||||
struct journal_key *src, *dst;
|
||||
|
||||
sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
|
||||
|
||||
src = dst = keys->d;
|
||||
while (src < keys->d + keys->nr) {
|
||||
while (src + 1 < keys->d + keys->nr &&
|
||||
src[0].btree_id == src[1].btree_id &&
|
||||
src[0].level == src[1].level &&
|
||||
bpos_eq(src[0].k->k.p, src[1].k->k.p))
|
||||
src++;
|
||||
|
||||
*dst++ = *src++;
|
||||
}
|
||||
|
||||
keys->nr = dst - keys->d;
|
||||
}
|
||||
|
||||
static int journal_keys_sort(struct bch_fs *c)
|
||||
{
|
||||
struct genradix_iter iter;
|
||||
struct journal_replay *i, **_i;
|
||||
struct jset_entry *entry;
|
||||
struct bkey_i *k;
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
size_t nr_keys = 0, nr_read = 0;
|
||||
|
||||
genradix_for_each(&c->journal_entries, iter, _i) {
|
||||
i = *_i;
|
||||
|
||||
if (!i || i->ignore)
|
||||
continue;
|
||||
|
||||
for_each_jset_key(k, entry, &i->j)
|
||||
nr_keys++;
|
||||
}
|
||||
|
||||
if (!nr_keys)
|
||||
return 0;
|
||||
|
||||
keys->size = roundup_pow_of_two(nr_keys);
|
||||
|
||||
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
|
||||
if (!keys->d) {
|
||||
bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
|
||||
nr_keys);
|
||||
|
||||
do {
|
||||
keys->size >>= 1;
|
||||
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
|
||||
} while (!keys->d && keys->size > nr_keys / 8);
|
||||
|
||||
if (!keys->d) {
|
||||
bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
|
||||
keys->size);
|
||||
return -BCH_ERR_ENOMEM_journal_keys_sort;
|
||||
}
|
||||
}
|
||||
|
||||
genradix_for_each(&c->journal_entries, iter, _i) {
|
||||
i = *_i;
|
||||
|
||||
if (!i || i->ignore)
|
||||
continue;
|
||||
|
||||
cond_resched();
|
||||
|
||||
for_each_jset_key(k, entry, &i->j) {
|
||||
if (keys->nr == keys->size) {
|
||||
__journal_keys_sort(keys);
|
||||
|
||||
if (keys->nr > keys->size * 7 / 8) {
|
||||
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
|
||||
keys->nr, keys->size, nr_read, nr_keys);
|
||||
return -BCH_ERR_ENOMEM_journal_keys_sort;
|
||||
}
|
||||
}
|
||||
|
||||
keys->d[keys->nr++] = (struct journal_key) {
|
||||
.btree_id = entry->btree_id,
|
||||
.level = entry->level,
|
||||
.k = k,
|
||||
.journal_seq = le64_to_cpu(i->j.seq),
|
||||
.journal_offset = k->_data - i->j._data,
|
||||
};
|
||||
|
||||
nr_read++;
|
||||
}
|
||||
}
|
||||
|
||||
__journal_keys_sort(keys);
|
||||
keys->gap = keys->nr;
|
||||
|
||||
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* journal replay: */
|
||||
|
||||
static void replay_now_at(struct journal *j, u64 seq)
|
||||
@ -846,134 +330,6 @@ static int journal_replay_early(struct bch_fs *c,
|
||||
|
||||
/* sb clean section: */
|
||||
|
||||
static struct bkey_i *btree_root_find(struct bch_fs *c,
|
||||
struct bch_sb_field_clean *clean,
|
||||
struct jset *j,
|
||||
enum btree_id id, unsigned *level)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
struct jset_entry *entry, *start, *end;
|
||||
|
||||
if (clean) {
|
||||
start = clean->start;
|
||||
end = vstruct_end(&clean->field);
|
||||
} else {
|
||||
start = j->start;
|
||||
end = vstruct_last(j);
|
||||
}
|
||||
|
||||
for (entry = start; entry < end; entry = vstruct_next(entry))
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_root &&
|
||||
entry->btree_id == id)
|
||||
goto found;
|
||||
|
||||
return NULL;
|
||||
found:
|
||||
if (!entry->u64s)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
k = entry->start;
|
||||
*level = entry->level;
|
||||
return k;
|
||||
}
|
||||
|
||||
static int verify_superblock_clean(struct bch_fs *c,
|
||||
struct bch_sb_field_clean **cleanp,
|
||||
struct jset *j)
|
||||
{
|
||||
unsigned i;
|
||||
struct bch_sb_field_clean *clean = *cleanp;
|
||||
struct printbuf buf1 = PRINTBUF;
|
||||
struct printbuf buf2 = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
|
||||
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
|
||||
le64_to_cpu(clean->journal_seq),
|
||||
le64_to_cpu(j->seq))) {
|
||||
kfree(clean);
|
||||
*cleanp = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
struct bkey_i *k1, *k2;
|
||||
unsigned l1 = 0, l2 = 0;
|
||||
|
||||
k1 = btree_root_find(c, clean, NULL, i, &l1);
|
||||
k2 = btree_root_find(c, NULL, j, i, &l2);
|
||||
|
||||
if (!k1 && !k2)
|
||||
continue;
|
||||
|
||||
printbuf_reset(&buf1);
|
||||
printbuf_reset(&buf2);
|
||||
|
||||
if (k1)
|
||||
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
|
||||
else
|
||||
prt_printf(&buf1, "(none)");
|
||||
|
||||
if (k2)
|
||||
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
|
||||
else
|
||||
prt_printf(&buf2, "(none)");
|
||||
|
||||
mustfix_fsck_err_on(!k1 || !k2 ||
|
||||
IS_ERR(k1) ||
|
||||
IS_ERR(k2) ||
|
||||
k1->k.u64s != k2->k.u64s ||
|
||||
memcmp(k1, k2, bkey_bytes(&k1->k)) ||
|
||||
l1 != l2, c,
|
||||
"superblock btree root %u doesn't match journal after clean shutdown\n"
|
||||
"sb: l=%u %s\n"
|
||||
"journal: l=%u %s\n", i,
|
||||
l1, buf1.buf,
|
||||
l2, buf2.buf);
|
||||
}
|
||||
fsck_err:
|
||||
printbuf_exit(&buf2);
|
||||
printbuf_exit(&buf1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_clean *clean, *sb_clean;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
|
||||
|
||||
if (fsck_err_on(!sb_clean, c,
|
||||
"superblock marked clean but clean section not present")) {
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
||||
c->sb.clean = false;
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
|
||||
GFP_KERNEL);
|
||||
if (!clean) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
|
||||
}
|
||||
|
||||
ret = bch2_sb_clean_validate_late(c, clean, READ);
|
||||
if (ret) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return clean;
|
||||
fsck_err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static bool btree_id_is_alloc(enum btree_id id)
|
||||
{
|
||||
switch (id) {
|
||||
@ -1120,6 +476,35 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
|
||||
return ret;
|
||||
}
|
||||
|
||||
const char * const bch2_recovery_passes[] = {
|
||||
#define x(_fn, _when) #_fn,
|
||||
BCH_RECOVERY_PASSES()
|
||||
#undef x
|
||||
NULL
|
||||
};
|
||||
|
||||
static int bch2_check_allocations(struct bch_fs *c)
|
||||
{
|
||||
return bch2_gc(c, true, c->opts.norecovery);
|
||||
}
|
||||
|
||||
static int bch2_set_may_go_rw(struct bch_fs *c)
|
||||
{
|
||||
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct recovery_pass_fn {
|
||||
int (*fn)(struct bch_fs *);
|
||||
unsigned when;
|
||||
};
|
||||
|
||||
static struct recovery_pass_fn recovery_pass_fns[] = {
|
||||
#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when },
|
||||
BCH_RECOVERY_PASSES()
|
||||
#undef x
|
||||
};
|
||||
|
||||
static void check_version_upgrade(struct bch_fs *c)
|
||||
{
|
||||
unsigned latest_compatible = bch2_version_compatible(c->sb.version);
|
||||
@ -1172,7 +557,12 @@ static void check_version_upgrade(struct bch_fs *c)
|
||||
|
||||
recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
|
||||
if (recovery_passes) {
|
||||
prt_str(&buf, "fsck required");
|
||||
if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
|
||||
prt_str(&buf, "fsck required");
|
||||
else {
|
||||
prt_str(&buf, "running recovery passses: ");
|
||||
prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
|
||||
}
|
||||
|
||||
c->recovery_passes_explicit |= recovery_passes;
|
||||
c->opts.fix_errors = FSCK_FIX_yes;
|
||||
@ -1188,42 +578,19 @@ static void check_version_upgrade(struct bch_fs *c)
|
||||
}
|
||||
}
|
||||
|
||||
static int bch2_check_allocations(struct bch_fs *c)
|
||||
{
|
||||
return bch2_gc(c, true, c->opts.norecovery);
|
||||
}
|
||||
|
||||
static int bch2_set_may_go_rw(struct bch_fs *c)
|
||||
{
|
||||
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct recovery_pass_fn {
|
||||
int (*fn)(struct bch_fs *);
|
||||
const char *name;
|
||||
unsigned when;
|
||||
};
|
||||
|
||||
static struct recovery_pass_fn recovery_passes[] = {
|
||||
#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when },
|
||||
BCH_RECOVERY_PASSES()
|
||||
#undef x
|
||||
};
|
||||
|
||||
u64 bch2_fsck_recovery_passes(void)
|
||||
{
|
||||
u64 ret = 0;
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++)
|
||||
if (recovery_passes[i].when & PASS_FSCK)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
|
||||
if (recovery_pass_fns[i].when & PASS_FSCK)
|
||||
ret |= BIT_ULL(i);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
|
||||
{
|
||||
struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass;
|
||||
struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
|
||||
|
||||
if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
|
||||
return false;
|
||||
@ -1245,15 +612,18 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
|
||||
c->curr_recovery_pass = pass;
|
||||
|
||||
if (should_run_recovery_pass(c, pass)) {
|
||||
struct recovery_pass_fn *p = recovery_passes + pass;
|
||||
struct recovery_pass_fn *p = recovery_pass_fns + pass;
|
||||
|
||||
if (!(p->when & PASS_SILENT))
|
||||
printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name);
|
||||
printk(KERN_INFO bch2_log_msg(c, "%s..."),
|
||||
bch2_recovery_passes[pass]);
|
||||
ret = p->fn(c);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (!(p->when & PASS_SILENT))
|
||||
printk(KERN_CONT " done\n");
|
||||
|
||||
c->recovery_passes_complete |= BIT_ULL(pass);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -1263,7 +633,7 @@ static int bch2_run_recovery_passes(struct bch_fs *c)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) {
|
||||
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
|
||||
ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
|
||||
if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
|
||||
continue;
|
||||
@ -1283,17 +653,17 @@ int bch2_fs_recovery(struct bch_fs *c)
|
||||
bool write_sb = false;
|
||||
int ret = 0;
|
||||
|
||||
if (c->sb.clean)
|
||||
clean = read_superblock_clean(c);
|
||||
ret = PTR_ERR_OR_ZERO(clean);
|
||||
if (ret)
|
||||
goto err;
|
||||
if (c->sb.clean) {
|
||||
clean = bch2_read_superblock_clean(c);
|
||||
ret = PTR_ERR_OR_ZERO(clean);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (c->sb.clean)
|
||||
bch_info(c, "recovering from clean shutdown, journal seq %llu",
|
||||
le64_to_cpu(clean->journal_seq));
|
||||
else
|
||||
} else {
|
||||
bch_info(c, "recovering from unclean shutdown");
|
||||
}
|
||||
|
||||
if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
|
||||
bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
|
||||
@ -1308,12 +678,6 @@ int bch2_fs_recovery(struct bch_fs *c)
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
|
||||
bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery))
|
||||
check_version_upgrade(c);
|
||||
|
||||
@ -1373,12 +737,12 @@ int bch2_fs_recovery(struct bch_fs *c)
|
||||
}
|
||||
}
|
||||
|
||||
ret = journal_keys_sort(c);
|
||||
ret = bch2_journal_keys_sort(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (c->sb.clean && last_journal_entry) {
|
||||
ret = verify_superblock_clean(c, &clean,
|
||||
ret = bch2_verify_superblock_clean(c, &clean,
|
||||
last_journal_entry);
|
||||
if (ret)
|
||||
goto err;
|
||||
@ -1513,7 +877,6 @@ use_clean:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
|
||||
!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
|
||||
c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
|
||||
struct bch_move_stats stats;
|
||||
|
||||
@ -1581,7 +944,7 @@ int bch2_fs_initialize(struct bch_fs *c)
|
||||
}
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
c->curr_recovery_pass = ARRAY_SIZE(recovery_passes);
|
||||
c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
|
||||
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
|
||||
set_bit(BCH_FS_FSCK_DONE, &c->flags);
|
||||
|
||||
|
@ -2,55 +2,28 @@
|
||||
#ifndef _BCACHEFS_RECOVERY_H
|
||||
#define _BCACHEFS_RECOVERY_H
|
||||
|
||||
struct journal_iter {
|
||||
struct list_head list;
|
||||
enum btree_id btree_id;
|
||||
unsigned level;
|
||||
size_t idx;
|
||||
struct journal_keys *keys;
|
||||
};
|
||||
extern const char * const bch2_recovery_passes[];
|
||||
|
||||
/*
|
||||
* Iterate over keys in the btree, with keys from the journal overlaid on top:
|
||||
* For when we need to rewind recovery passes and run a pass we skipped:
|
||||
*/
|
||||
static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
|
||||
enum bch_recovery_pass pass)
|
||||
{
|
||||
bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
|
||||
bch2_recovery_passes[pass], pass,
|
||||
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
|
||||
|
||||
struct btree_and_journal_iter {
|
||||
struct btree *b;
|
||||
struct btree_node_iter node_iter;
|
||||
struct bkey unpacked;
|
||||
c->recovery_passes_explicit |= BIT_ULL(pass);
|
||||
|
||||
struct journal_iter journal;
|
||||
struct bpos pos;
|
||||
bool at_end;
|
||||
};
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos, struct bpos, size_t *);
|
||||
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos);
|
||||
|
||||
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bkey_i *);
|
||||
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bkey_i *);
|
||||
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos);
|
||||
void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos);
|
||||
|
||||
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
|
||||
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
|
||||
|
||||
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
|
||||
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
|
||||
struct bch_fs *, struct btree *,
|
||||
struct btree_node_iter, struct bpos);
|
||||
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
|
||||
struct bch_fs *,
|
||||
struct btree *);
|
||||
|
||||
void bch2_journal_keys_free(struct journal_keys *);
|
||||
void bch2_journal_entries_free(struct bch_fs *);
|
||||
if (c->curr_recovery_pass >= pass) {
|
||||
c->curr_recovery_pass = pass;
|
||||
c->recovery_passes_complete &= (1ULL << pass) >> 1;
|
||||
return -BCH_ERR_restart_recovery;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
u64 bch2_fsck_recovery_passes(void);
|
||||
|
||||
|
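/*
 * Illustrative sketch, not from the commit: how a check that discovers it
 * needs an earlier, skipped pass would typically use
 * bch2_run_explicit_recovery_pass() above.  The caller just propagates
 * -BCH_ERR_restart_recovery so the recovery loop notices that
 * curr_recovery_pass was rewound and starts again from 'pass'.
 * example_check() and its placeholder condition are hypothetical.
 */
static int example_check(struct bch_fs *c, enum bch_recovery_pass pass)
{
	bool inconsistency_found = true;	/* placeholder condition */

	if (inconsistency_found)
		return bch2_run_explicit_recovery_pass(c, pass);

	return 0;
}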
395
libbcachefs/sb-clean.c
Normal file
@ -0,0 +1,395 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "buckets.h"
|
||||
#include "error.h"
|
||||
#include "journal_io.h"
|
||||
#include "replicas.h"
|
||||
#include "sb-clean.h"
|
||||
#include "super-io.h"
|
||||
|
||||
/*
|
||||
* BCH_SB_FIELD_clean:
|
||||
*
|
||||
* Btree roots, and a few other things, are recovered from the journal after an
|
||||
* unclean shutdown - but after a clean shutdown, to avoid having to read the
|
||||
* journal, we can store them in the superblock.
|
||||
*
|
||||
* bch_sb_field_clean simply contains a list of journal entries, stored exactly
|
||||
* as they would be in the journal:
|
||||
*/
|
||||
|
||||
int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
|
||||
int write)
|
||||
{
|
||||
struct jset_entry *entry;
|
||||
int ret;
|
||||
|
||||
for (entry = clean->start;
|
||||
entry < (struct jset_entry *) vstruct_end(&clean->field);
|
||||
entry = vstruct_next(entry)) {
|
||||
ret = bch2_journal_entry_validate(c, NULL, entry,
|
||||
le16_to_cpu(c->disk_sb.sb->version),
|
||||
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
|
||||
write);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct bkey_i *btree_root_find(struct bch_fs *c,
|
||||
struct bch_sb_field_clean *clean,
|
||||
struct jset *j,
|
||||
enum btree_id id, unsigned *level)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
struct jset_entry *entry, *start, *end;
|
||||
|
||||
if (clean) {
|
||||
start = clean->start;
|
||||
end = vstruct_end(&clean->field);
|
||||
} else {
|
||||
start = j->start;
|
||||
end = vstruct_last(j);
|
||||
}
|
||||
|
||||
for (entry = start; entry < end; entry = vstruct_next(entry))
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_root &&
|
||||
entry->btree_id == id)
|
||||
goto found;
|
||||
|
||||
return NULL;
|
||||
found:
|
||||
if (!entry->u64s)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
k = entry->start;
|
||||
*level = entry->level;
|
||||
return k;
|
||||
}
|
||||
|
||||
int bch2_verify_superblock_clean(struct bch_fs *c,
|
||||
struct bch_sb_field_clean **cleanp,
|
||||
struct jset *j)
|
||||
{
|
||||
unsigned i;
|
||||
struct bch_sb_field_clean *clean = *cleanp;
|
||||
struct printbuf buf1 = PRINTBUF;
|
||||
struct printbuf buf2 = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
|
||||
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
|
||||
le64_to_cpu(clean->journal_seq),
|
||||
le64_to_cpu(j->seq))) {
|
||||
kfree(clean);
|
||||
*cleanp = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
struct bkey_i *k1, *k2;
|
||||
unsigned l1 = 0, l2 = 0;
|
||||
|
||||
k1 = btree_root_find(c, clean, NULL, i, &l1);
|
||||
k2 = btree_root_find(c, NULL, j, i, &l2);
|
||||
|
||||
if (!k1 && !k2)
|
||||
continue;
|
||||
|
||||
printbuf_reset(&buf1);
|
||||
printbuf_reset(&buf2);
|
||||
|
||||
if (k1)
|
||||
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
|
||||
else
|
||||
prt_printf(&buf1, "(none)");
|
||||
|
||||
if (k2)
|
||||
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
|
||||
else
|
||||
prt_printf(&buf2, "(none)");
|
||||
|
||||
mustfix_fsck_err_on(!k1 || !k2 ||
|
||||
IS_ERR(k1) ||
|
||||
IS_ERR(k2) ||
|
||||
k1->k.u64s != k2->k.u64s ||
|
||||
memcmp(k1, k2, bkey_bytes(&k1->k)) ||
|
||||
l1 != l2, c,
|
||||
"superblock btree root %u doesn't match journal after clean shutdown\n"
|
||||
"sb: l=%u %s\n"
|
||||
"journal: l=%u %s\n", i,
|
||||
l1, buf1.buf,
|
||||
l2, buf2.buf);
|
||||
}
|
||||
fsck_err:
|
||||
printbuf_exit(&buf2);
|
||||
printbuf_exit(&buf1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_clean *clean, *sb_clean;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
|
||||
|
||||
if (fsck_err_on(!sb_clean, c,
|
||||
"superblock marked clean but clean section not present")) {
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
||||
c->sb.clean = false;
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
|
||||
GFP_KERNEL);
|
||||
if (!clean) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
|
||||
}
|
||||
|
||||
ret = bch2_sb_clean_validate_late(c, clean, READ);
|
||||
if (ret) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return clean;
|
||||
fsck_err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ERR_PTR(ret);
|
||||
}
|
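/*
 * Illustrative sketch, not from the commit: a caller must handle all three
 * outcomes of bch2_read_superblock_clean() - a kmemdup'd copy to be freed
 * with kfree(), NULL when the superblock was marked clean but the clean
 * section is missing, and an ERR_PTR() on allocation/validation failure.
 * example_load_clean() is hypothetical.
 */
static int example_load_clean(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean = bch2_read_superblock_clean(c);

	if (IS_ERR(clean))
		return PTR_ERR(clean);
	if (!clean)
		return 0;		/* fall back to journal replay */

	/* ... walk the journal entries at clean->start here ... */

	kfree(clean);
	return 0;
}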
||||
|
||||
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
|
||||
{
|
||||
struct jset_entry *entry = *end;
|
||||
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
|
||||
|
||||
memset(entry, 0, u64s * sizeof(u64));
|
||||
/*
|
||||
* The u64s field counts from the start of data, ignoring the shared
|
||||
* fields.
|
||||
*/
|
||||
entry->u64s = cpu_to_le16(u64s - 1);
|
||||
|
||||
*end = vstruct_next(*end);
|
||||
return entry;
|
||||
}
|
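/*
 * Illustrative sketch, not from the commit: sizing example for
 * jset_entry_init(), assuming the usual 8-byte struct jset_entry header.
 * For a struct jset_entry_usage (8-byte header + one __le64 value = 16
 * bytes):
 *
 *	u64s        = DIV_ROUND_UP(16, sizeof(u64)) = 2	(header included)
 *	entry->u64s = 2 - 1                         = 1	(payload only)
 *
 * Two u64s are zeroed, and vstruct_next() then advances by header plus one
 * payload u64, landing on the next entry.
 */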
||||
|
||||
void bch2_journal_super_entries_add_common(struct bch_fs *c,
|
||||
struct jset_entry **end,
|
||||
u64 journal_seq)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i, dev;
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
|
||||
if (!journal_seq) {
|
||||
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
||||
bch2_fs_usage_acc_to_base(c, i);
|
||||
} else {
|
||||
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
|
||||
}
|
||||
|
||||
{
|
||||
struct jset_entry_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u)),
|
||||
struct jset_entry_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_usage;
|
||||
u->entry.btree_id = BCH_FS_USAGE_inodes;
|
||||
u->v = cpu_to_le64(c->usage_base->nr_inodes);
|
||||
}
|
||||
|
||||
{
|
||||
struct jset_entry_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u)),
|
||||
struct jset_entry_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_usage;
|
||||
u->entry.btree_id = BCH_FS_USAGE_key_version;
|
||||
u->v = cpu_to_le64(atomic64_read(&c->key_version));
|
||||
}
|
||||
|
||||
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
|
||||
struct jset_entry_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u)),
|
||||
struct jset_entry_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_usage;
|
||||
u->entry.btree_id = BCH_FS_USAGE_reserved;
|
||||
u->entry.level = i;
|
||||
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
|
||||
}
|
||||
|
||||
for (i = 0; i < c->replicas.nr; i++) {
|
||||
struct bch_replicas_entry *e =
|
||||
cpu_replicas_entry(&c->replicas, i);
|
||||
struct jset_entry_data_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
|
||||
struct jset_entry_data_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_data_usage;
|
||||
u->v = cpu_to_le64(c->usage_base->replicas[i]);
|
||||
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
|
||||
"embedded variable length struct");
|
||||
}
|
||||
|
||||
for_each_member_device(ca, c, dev) {
|
||||
unsigned b = sizeof(struct jset_entry_dev_usage) +
|
||||
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
|
||||
struct jset_entry_dev_usage *u =
|
||||
container_of(jset_entry_init(end, b),
|
||||
struct jset_entry_dev_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_dev_usage;
|
||||
u->dev = cpu_to_le32(dev);
|
||||
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
|
||||
|
||||
for (i = 0; i < BCH_DATA_NR; i++) {
|
||||
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
|
||||
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
|
||||
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
|
||||
}
|
||||
}
|
||||
|
||||
percpu_up_read(&c->mark_lock);
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
struct jset_entry_clock *clock =
|
||||
container_of(jset_entry_init(end, sizeof(*clock)),
|
||||
struct jset_entry_clock, entry);
|
||||
|
||||
clock->entry.type = BCH_JSET_ENTRY_clock;
|
||||
clock->rw = i;
|
||||
clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
|
||||
}
|
||||
}
|
||||
|
||||
static int bch2_sb_clean_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f,
|
||||
struct printbuf *err)
|
||||
{
|
||||
struct bch_sb_field_clean *clean = field_to_type(f, clean);
|
||||
|
||||
if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
|
||||
prt_printf(err, "wrong size (got %zu should be %zu)",
|
||||
vstruct_bytes(&clean->field), sizeof(*clean));
|
||||
return -BCH_ERR_invalid_sb_clean;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_clean *clean = field_to_type(f, clean);
|
||||
struct jset_entry *entry;
|
||||
|
||||
prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
|
||||
prt_newline(out);
|
||||
|
||||
for (entry = clean->start;
|
||||
entry != vstruct_end(&clean->field);
|
||||
entry = vstruct_next(entry)) {
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
|
||||
!entry->u64s)
|
||||
continue;
|
||||
|
||||
bch2_journal_entry_to_text(out, NULL, entry);
|
||||
prt_newline(out);
|
||||
}
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_clean = {
|
||||
.validate = bch2_sb_clean_validate,
|
||||
.to_text = bch2_sb_clean_to_text,
|
||||
};
|
||||
|
||||
int bch2_fs_mark_dirty(struct bch_fs *c)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Unconditionally write superblock, to verify it hasn't changed before
|
||||
* we go rw:
|
||||
*/
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
||||
|
||||
bch2_sb_maybe_downgrade(c);
|
||||
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
|
||||
|
||||
ret = bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
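/*
 * Illustrative sketch, not from the commit: the intended pairing - the
 * superblock is written as dirty before the filesystem goes read-write, and
 * marked clean again only on a controlled read-only transition, so an unclean
 * shutdown is detectable at the next mount.  Both example functions are
 * hypothetical.
 */
static int example_go_rw(struct bch_fs *c)
{
	int ret = bch2_fs_mark_dirty(c);

	if (ret)
		return ret;

	/* ... start journal, allocator and write paths ... */
	return 0;
}

static void example_go_ro(struct bch_fs *c)
{
	/* ... quiesce writes and flush the journal first ... */
	bch2_fs_mark_clean(c);
}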
||||
|
||||
void bch2_fs_mark_clean(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_clean *sb_clean;
|
||||
struct jset_entry *entry;
|
||||
unsigned u64s;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (BCH_SB_CLEAN(c->disk_sb.sb))
|
||||
goto out;
|
||||
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
|
||||
|
||||
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
|
||||
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
|
||||
c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
|
||||
c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
|
||||
|
||||
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
|
||||
|
||||
sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
|
||||
if (!sb_clean) {
|
||||
bch_err(c, "error resizing superblock while setting filesystem clean");
|
||||
goto out;
|
||||
}
|
||||
|
||||
sb_clean->flags = 0;
|
||||
sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
|
||||
|
||||
/* Trying to catch outstanding bug: */
|
||||
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
|
||||
|
||||
entry = sb_clean->start;
|
||||
bch2_journal_super_entries_add_common(c, &entry, 0);
|
||||
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
|
||||
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
|
||||
|
||||
memset(entry, 0,
|
||||
vstruct_end(&sb_clean->field) - (void *) entry);
|
||||
|
||||
/*
|
||||
* this should be in the write path, and we should be validating every
|
||||
* superblock section:
|
||||
*/
|
||||
ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
|
||||
if (ret) {
|
||||
bch_err(c, "error writing marking filesystem clean: validate error");
|
||||
goto out;
|
||||
}
|
||||
|
||||
bch2_write_super(c);
|
||||
out:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
}
|
16
libbcachefs/sb-clean.h
Normal file
@ -0,0 +1,16 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_SB_CLEAN_H
|
||||
#define _BCACHEFS_SB_CLEAN_H
|
||||
|
||||
int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
|
||||
int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
|
||||
struct jset *);
|
||||
struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
|
||||
void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
|
||||
|
||||
extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
|
||||
|
||||
int bch2_fs_mark_dirty(struct bch_fs *);
|
||||
void bch2_fs_mark_clean(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_SB_CLEAN_H */
|
173
libbcachefs/sb-members.c
Normal file
@ -0,0 +1,173 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "disk_groups.h"
|
||||
#include "replicas.h"
|
||||
#include "sb-members.h"
|
||||
#include "super-io.h"
|
||||
|
||||
/* Code for bch_sb_field_members: */
|
||||
|
||||
static int bch2_sb_members_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f,
|
||||
struct printbuf *err)
|
||||
{
|
||||
struct bch_sb_field_members *mi = field_to_type(f, members);
|
||||
unsigned i;
|
||||
|
||||
if ((void *) (mi->members + sb->nr_devices) >
|
||||
vstruct_end(&mi->field)) {
|
||||
prt_printf(err, "too many devices for section size");
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
for (i = 0; i < sb->nr_devices; i++) {
|
||||
struct bch_member *m = mi->members + i;
|
||||
|
||||
if (!bch2_member_exists(m))
|
||||
continue;
|
||||
|
||||
if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
|
||||
prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
|
||||
i, le64_to_cpu(m->nbuckets), LONG_MAX);
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(m->nbuckets) -
|
||||
le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
|
||||
prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
|
||||
i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (le16_to_cpu(m->bucket_size) <
|
||||
le16_to_cpu(sb->block_size)) {
|
||||
prt_printf(err, "device %u: bucket size %u smaller than block size %u",
|
||||
i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (le16_to_cpu(m->bucket_size) <
|
||||
BCH_SB_BTREE_NODE_SIZE(sb)) {
|
||||
prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
|
||||
i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_members *mi = field_to_type(f, members);
|
||||
struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < sb->nr_devices; i++) {
|
||||
struct bch_member *m = mi->members + i;
|
||||
unsigned data_have = bch2_sb_dev_has_data(sb, i);
|
||||
u64 bucket_size = le16_to_cpu(m->bucket_size);
|
||||
u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
|
||||
|
||||
if (!bch2_member_exists(m))
|
||||
continue;
|
||||
|
||||
prt_printf(out, "Device:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%u", i);
|
||||
prt_newline(out);
|
||||
|
||||
printbuf_indent_add(out, 2);
|
||||
|
||||
prt_printf(out, "UUID:");
|
||||
prt_tab(out);
|
||||
pr_uuid(out, m->uuid.b);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Size:");
|
||||
prt_tab(out);
|
||||
prt_units_u64(out, device_size << 9);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Bucket size:");
|
||||
prt_tab(out);
|
||||
prt_units_u64(out, bucket_size << 9);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "First bucket:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Buckets:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Last mount:");
|
||||
prt_tab(out);
|
||||
if (m->last_mount)
|
||||
pr_time(out, le64_to_cpu(m->last_mount));
|
||||
else
|
||||
prt_printf(out, "(never)");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "State:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%s",
|
||||
BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
|
||||
? bch2_member_states[BCH_MEMBER_STATE(m)]
|
||||
: "unknown");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Label:");
|
||||
prt_tab(out);
|
||||
if (BCH_MEMBER_GROUP(m)) {
|
||||
unsigned idx = BCH_MEMBER_GROUP(m) - 1;
|
||||
|
||||
if (idx < disk_groups_nr(gi))
|
||||
prt_printf(out, "%s (%u)",
|
||||
gi->entries[idx].label, idx);
|
||||
else
|
||||
prt_printf(out, "(bad disk labels section)");
|
||||
} else {
|
||||
prt_printf(out, "(none)");
|
||||
}
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Data allowed:");
|
||||
prt_tab(out);
|
||||
if (BCH_MEMBER_DATA_ALLOWED(m))
|
||||
prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
|
||||
else
|
||||
prt_printf(out, "(none)");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Has data:");
|
||||
prt_tab(out);
|
||||
if (data_have)
|
||||
prt_bitflags(out, bch2_data_types, data_have);
|
||||
else
|
||||
prt_printf(out, "(none)");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Discard:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Freespace initialized:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
|
||||
prt_newline(out);
|
||||
|
||||
printbuf_indent_sub(out, 2);
|
||||
}
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_members = {
|
||||
.validate = bch2_sb_members_validate,
|
||||
.to_text = bch2_sb_members_to_text,
|
||||
};
|
176
libbcachefs/sb-members.h
Normal file
@ -0,0 +1,176 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_SB_MEMBERS_H
|
||||
#define _BCACHEFS_SB_MEMBERS_H
|
||||
|
||||
static inline bool bch2_dev_is_online(struct bch_dev *ca)
|
||||
{
|
||||
return !percpu_ref_is_zero(&ca->io_ref);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
|
||||
{
|
||||
return bch2_dev_is_online(ca) &&
|
||||
ca->mi.state != BCH_MEMBER_STATE_failed;
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
|
||||
{
|
||||
if (!percpu_ref_tryget(&ca->io_ref))
|
||||
return false;
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
||||
(ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
|
||||
return true;
|
||||
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
return false;
|
||||
}
|
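/*
 * Illustrative sketch, not from the commit: typical bch2_dev_get_ioref()
 * usage - take the io ref for the direction needed, and drop it once the I/O
 * has been issued or has completed.  example_read_from_dev() is hypothetical.
 */
static int example_read_from_dev(struct bch_dev *ca)
{
	if (!bch2_dev_get_ioref(ca, READ))
		return -EIO;	/* offline, or not readable in this state */

	/* ... submit read I/O against ca ... */

	percpu_ref_put(&ca->io_ref);
	return 0;
}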
||||
|
||||
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
|
||||
{
|
||||
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
|
||||
unsigned dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < devs.nr; i++)
|
||||
if (devs.devs[i] == dev)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
|
||||
unsigned dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < devs->nr; i++)
|
||||
if (devs->devs[i] == dev) {
|
||||
array_remove_item(devs->devs, devs->nr, i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
|
||||
unsigned dev)
|
||||
{
|
||||
if (!bch2_dev_list_has_dev(*devs, dev)) {
|
||||
BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
|
||||
devs->devs[devs->nr++] = dev;
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
|
||||
{
|
||||
return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
|
||||
}
|
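/*
 * Illustrative sketch, not from the commit: bch_devs_list is a small,
 * fixed-size, duplicate-free set of device indices; the helpers above keep it
 * that way.  example_two_devs() is hypothetical.
 */
static struct bch_devs_list example_two_devs(unsigned a, unsigned b)
{
	struct bch_devs_list devs = bch2_dev_list_single(a);

	bch2_dev_list_add_dev(&devs, b);	/* no-op if b == a */
	return devs;
}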
||||
|
||||
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
|
||||
const struct bch_devs_mask *mask)
|
||||
{
|
||||
struct bch_dev *ca = NULL;
|
||||
|
||||
while ((*iter = mask
|
||||
? find_next_bit(mask->d, c->sb.nr_devices, *iter)
|
||||
: *iter) < c->sb.nr_devices &&
|
||||
!(ca = rcu_dereference_check(c->devs[*iter],
|
||||
lockdep_is_held(&c->state_lock))))
|
||||
(*iter)++;
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
#define for_each_member_device_rcu(ca, c, iter, mask) \
|
||||
for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
|
||||
|
||||
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
rcu_read_lock();
|
||||
if ((ca = __bch2_next_dev(c, iter, NULL)))
|
||||
percpu_ref_get(&ca->ref);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
/*
|
||||
* If you break early, you must drop your ref on the current device
|
||||
*/
|
||||
#define for_each_member_device(ca, c, iter) \
|
||||
for ((iter) = 0; \
|
||||
(ca = bch2_get_next_dev(c, &(iter))); \
|
||||
percpu_ref_put(&ca->ref), (iter)++)
|
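/*
 * Illustrative sketch, not from the commit: the rule above in practice - the
 * loop increment normally drops the per-device ref, so an early break must
 * drop it by hand.  example_have_rw_member() is hypothetical.
 */
static bool example_have_rw_member(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;
	bool ret = false;

	for_each_member_device(ca, c, i)
		if (ca->mi.state == BCH_MEMBER_STATE_rw) {
			ret = true;
			percpu_ref_put(&ca->ref);	/* required before break */
			break;
		}

	return ret;
}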
||||
|
||||
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
|
||||
unsigned *iter,
|
||||
int state_mask)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
rcu_read_lock();
|
||||
while ((ca = __bch2_next_dev(c, iter, NULL)) &&
|
||||
(!((1 << ca->mi.state) & state_mask) ||
|
||||
!percpu_ref_tryget(&ca->io_ref)))
|
||||
(*iter)++;
|
||||
rcu_read_unlock();
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
#define __for_each_online_member(ca, c, iter, state_mask) \
|
||||
for ((iter) = 0; \
|
||||
(ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
|
||||
percpu_ref_put(&ca->io_ref), (iter)++)
|
||||
|
||||
#define for_each_online_member(ca, c, iter) \
|
||||
__for_each_online_member(ca, c, iter, ~0)
|
||||
|
||||
#define for_each_rw_member(ca, c, iter) \
|
||||
__for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
|
||||
|
||||
#define for_each_readable_member(ca, c, iter) \
|
||||
__for_each_online_member(ca, c, iter, \
|
||||
(1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
|
||||
|
||||
/*
|
||||
* If a key exists that references a device, the device won't be going away and
|
||||
* we can omit rcu_read_lock():
|
||||
*/
|
||||
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
|
||||
{
|
||||
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
||||
|
||||
return rcu_dereference_check(c->devs[idx], 1);
|
||||
}
|
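/*
 * Illustrative sketch, not from the commit: when a device index comes from an
 * existing key (e.g. an extent pointer), the device cannot go away, so
 * bch_dev_bkey_exists() can be used without rcu_read_lock().  The helper name
 * and the ->mi.nbuckets access below are illustrative assumptions.
 */
static u64 example_dev_nbuckets(struct bch_fs *c, unsigned dev)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, dev);

	return ca->mi.nbuckets;
}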
||||
|
||||
static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
|
||||
{
|
||||
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
||||
|
||||
return rcu_dereference_protected(c->devs[idx],
|
||||
lockdep_is_held(&c->sb_lock) ||
|
||||
lockdep_is_held(&c->state_lock));
|
||||
}
|
||||
|
||||
/* XXX kill, move to struct bch_fs */
|
||||
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
|
||||
{
|
||||
struct bch_devs_mask devs;
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
memset(&devs, 0, sizeof(devs));
|
||||
for_each_online_member(ca, c, i)
|
||||
__set_bit(ca->dev_idx, devs.d);
|
||||
return devs;
|
||||
}
|
||||
|
||||
extern const struct bch_sb_field_ops bch_sb_field_ops_members;
|
||||
|
||||
#endif /* _BCACHEFS_SB_MEMBERS_H */
|
@ -1,8 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "buckets.h"
|
||||
#include "checksum.h"
|
||||
#include "counters.h"
|
||||
#include "disk_groups.h"
|
||||
@ -10,12 +8,13 @@
|
||||
#include "error.h"
|
||||
#include "io.h"
|
||||
#include "journal.h"
|
||||
#include "journal_io.h"
|
||||
#include "journal_sb.h"
|
||||
#include "journal_seq_blacklist.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
#include "quota.h"
|
||||
#include "sb-clean.h"
|
||||
#include "sb-members.h"
|
||||
#include "super-io.h"
|
||||
#include "super.h"
|
||||
#include "trace.h"
|
||||
@ -1005,235 +1004,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
|
||||
mutex_unlock(&c->sb_lock);
|
||||
}
|
||||
|
||||
/* BCH_SB_FIELD_members: */
|
||||
|
||||
static int bch2_sb_members_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f,
|
||||
struct printbuf *err)
|
||||
{
|
||||
struct bch_sb_field_members *mi = field_to_type(f, members);
|
||||
unsigned i;
|
||||
|
||||
if ((void *) (mi->members + sb->nr_devices) >
|
||||
vstruct_end(&mi->field)) {
|
||||
prt_printf(err, "too many devices for section size");
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
for (i = 0; i < sb->nr_devices; i++) {
|
||||
struct bch_member *m = mi->members + i;
|
||||
|
||||
if (!bch2_member_exists(m))
|
||||
continue;
|
||||
|
||||
if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
|
||||
prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
|
||||
i, le64_to_cpu(m->nbuckets), LONG_MAX);
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(m->nbuckets) -
|
||||
le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
|
||||
prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
|
||||
i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (le16_to_cpu(m->bucket_size) <
|
||||
le16_to_cpu(sb->block_size)) {
|
||||
prt_printf(err, "device %u: bucket size %u smaller than block size %u",
|
||||
i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (le16_to_cpu(m->bucket_size) <
|
||||
BCH_SB_BTREE_NODE_SIZE(sb)) {
|
||||
prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
|
||||
i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_members *mi = field_to_type(f, members);
|
||||
struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < sb->nr_devices; i++) {
|
||||
struct bch_member *m = mi->members + i;
|
||||
unsigned data_have = bch2_sb_dev_has_data(sb, i);
|
||||
u64 bucket_size = le16_to_cpu(m->bucket_size);
|
||||
u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
|
||||
|
||||
if (!bch2_member_exists(m))
|
||||
continue;
|
||||
|
||||
prt_printf(out, "Device:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%u", i);
|
||||
prt_newline(out);
|
||||
|
||||
printbuf_indent_add(out, 2);
|
||||
|
||||
prt_printf(out, "UUID:");
|
||||
prt_tab(out);
|
||||
pr_uuid(out, m->uuid.b);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Size:");
|
||||
prt_tab(out);
|
||||
prt_units_u64(out, device_size << 9);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Bucket size:");
|
||||
prt_tab(out);
|
||||
prt_units_u64(out, bucket_size << 9);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "First bucket:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Buckets:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Last mount:");
|
||||
prt_tab(out);
|
||||
if (m->last_mount)
|
||||
pr_time(out, le64_to_cpu(m->last_mount));
|
||||
else
|
||||
prt_printf(out, "(never)");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "State:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%s",
|
||||
BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
|
||||
? bch2_member_states[BCH_MEMBER_STATE(m)]
|
||||
: "unknown");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Label:");
|
||||
prt_tab(out);
|
||||
if (BCH_MEMBER_GROUP(m)) {
|
||||
unsigned idx = BCH_MEMBER_GROUP(m) - 1;
|
||||
|
||||
if (idx < disk_groups_nr(gi))
|
||||
prt_printf(out, "%s (%u)",
|
||||
gi->entries[idx].label, idx);
|
||||
else
|
||||
prt_printf(out, "(bad disk labels section)");
|
||||
} else {
|
||||
prt_printf(out, "(none)");
|
||||
}
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Data allowed:");
|
||||
prt_tab(out);
|
||||
if (BCH_MEMBER_DATA_ALLOWED(m))
|
||||
prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
|
||||
else
|
||||
prt_printf(out, "(none)");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Has data:");
|
||||
prt_tab(out);
|
||||
if (data_have)
|
||||
prt_bitflags(out, bch2_data_types, data_have);
|
||||
else
|
||||
prt_printf(out, "(none)");
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Discard:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Freespace initialized:");
|
||||
prt_tab(out);
|
||||
prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
|
||||
prt_newline(out);
|
||||
|
||||
printbuf_indent_sub(out, 2);
|
||||
}
|
||||
}
|
||||
|
||||
static const struct bch_sb_field_ops bch_sb_field_ops_members = {
|
||||
.validate = bch2_sb_members_validate,
|
||||
.to_text = bch2_sb_members_to_text,
|
||||
};
|
||||
|
||||
/* BCH_SB_FIELD_crypt: */
|
||||
|
||||
static int bch2_sb_crypt_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f,
|
||||
struct printbuf *err)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
|
||||
|
||||
if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
|
||||
prt_printf(err, "wrong size (got %zu should be %zu)",
|
||||
vstruct_bytes(&crypt->field), sizeof(*crypt));
|
||||
return -BCH_ERR_invalid_sb_crypt;
|
||||
}
|
||||
|
||||
if (BCH_CRYPT_KDF_TYPE(crypt)) {
|
||||
prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
|
||||
return -BCH_ERR_invalid_sb_crypt;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
|
||||
|
||||
prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
|
||||
prt_newline(out);
|
||||
}
|
||||
|
||||
static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
|
||||
.validate = bch2_sb_crypt_validate,
|
||||
.to_text = bch2_sb_crypt_to_text,
|
||||
};
|
||||
|
||||
/* BCH_SB_FIELD_clean: */
|
||||
|
||||
int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
|
||||
{
|
||||
struct jset_entry *entry;
|
||||
int ret;
|
||||
|
||||
for (entry = clean->start;
|
||||
entry < (struct jset_entry *) vstruct_end(&clean->field);
|
||||
entry = vstruct_next(entry)) {
|
||||
ret = bch2_journal_entry_validate(c, NULL, entry,
|
||||
le16_to_cpu(c->disk_sb.sb->version),
|
||||
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
|
||||
write);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Downgrade if superblock is at a higher version than currently supported: */
|
||||
void bch2_sb_maybe_downgrade(struct bch_fs *c)
|
||||
{
|
||||
@ -1260,232 +1030,6 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
|
||||
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
|
||||
}
|
||||
|
||||
int bch2_fs_mark_dirty(struct bch_fs *c)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Unconditionally write superblock, to verify it hasn't changed before
|
||||
* we go rw:
|
||||
*/
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
||||
|
||||
bch2_sb_maybe_downgrade(c);
|
||||
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
|
||||
|
||||
ret = bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
|
||||
{
|
||||
struct jset_entry *entry = *end;
|
||||
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
|
||||
|
||||
memset(entry, 0, u64s * sizeof(u64));
|
||||
/*
|
||||
* The u64s field counts from the start of data, ignoring the shared
|
||||
* fields.
|
||||
*/
|
||||
entry->u64s = cpu_to_le16(u64s - 1);
|
||||
|
||||
*end = vstruct_next(*end);
|
||||
return entry;
|
||||
}
|
||||
|
||||
void bch2_journal_super_entries_add_common(struct bch_fs *c,
|
||||
struct jset_entry **end,
|
||||
u64 journal_seq)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i, dev;
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
|
||||
if (!journal_seq) {
|
||||
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
||||
bch2_fs_usage_acc_to_base(c, i);
|
||||
} else {
|
||||
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
|
||||
}
|
||||
|
||||
{
|
||||
struct jset_entry_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u)),
|
||||
struct jset_entry_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_usage;
|
||||
u->entry.btree_id = BCH_FS_USAGE_inodes;
|
||||
u->v = cpu_to_le64(c->usage_base->nr_inodes);
|
||||
}
|
||||
|
||||
{
|
||||
struct jset_entry_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u)),
|
||||
struct jset_entry_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_usage;
|
||||
u->entry.btree_id = BCH_FS_USAGE_key_version;
|
||||
u->v = cpu_to_le64(atomic64_read(&c->key_version));
|
||||
}
|
||||
|
||||
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
|
||||
struct jset_entry_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u)),
|
||||
struct jset_entry_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_usage;
|
||||
u->entry.btree_id = BCH_FS_USAGE_reserved;
|
||||
u->entry.level = i;
|
||||
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
|
||||
}
|
||||
|
||||
for (i = 0; i < c->replicas.nr; i++) {
|
||||
struct bch_replicas_entry *e =
|
||||
cpu_replicas_entry(&c->replicas, i);
|
||||
struct jset_entry_data_usage *u =
|
||||
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
|
||||
struct jset_entry_data_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_data_usage;
|
||||
u->v = cpu_to_le64(c->usage_base->replicas[i]);
|
||||
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
|
||||
"embedded variable length struct");
|
||||
}
|
||||
|
||||
for_each_member_device(ca, c, dev) {
|
||||
unsigned b = sizeof(struct jset_entry_dev_usage) +
|
||||
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
|
||||
struct jset_entry_dev_usage *u =
|
||||
container_of(jset_entry_init(end, b),
|
||||
struct jset_entry_dev_usage, entry);
|
||||
|
||||
u->entry.type = BCH_JSET_ENTRY_dev_usage;
|
||||
u->dev = cpu_to_le32(dev);
|
||||
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
|
||||
|
||||
for (i = 0; i < BCH_DATA_NR; i++) {
|
||||
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
|
||||
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
|
||||
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
|
||||
}
|
||||
}
|
||||
|
||||
percpu_up_read(&c->mark_lock);
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
struct jset_entry_clock *clock =
|
||||
container_of(jset_entry_init(end, sizeof(*clock)),
|
||||
struct jset_entry_clock, entry);
|
||||
|
||||
clock->entry.type = BCH_JSET_ENTRY_clock;
|
||||
clock->rw = i;
|
||||
clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_fs_mark_clean(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_clean *sb_clean;
|
||||
struct jset_entry *entry;
|
||||
unsigned u64s;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (BCH_SB_CLEAN(c->disk_sb.sb))
|
||||
goto out;
|
||||
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
|
||||
|
||||
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
|
||||
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
|
||||
c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
|
||||
c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
|
||||
|
||||
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
|
||||
|
||||
sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
|
||||
if (!sb_clean) {
|
||||
bch_err(c, "error resizing superblock while setting filesystem clean");
|
||||
goto out;
|
||||
}
|
||||
|
||||
sb_clean->flags = 0;
|
||||
sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
|
||||
|
||||
/* Trying to catch outstanding bug: */
|
||||
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
|
||||
|
||||
entry = sb_clean->start;
|
||||
bch2_journal_super_entries_add_common(c, &entry, 0);
|
||||
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
|
||||
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
|
||||
|
||||
memset(entry, 0,
|
||||
vstruct_end(&sb_clean->field) - (void *) entry);
|
||||
|
||||
/*
|
||||
* this should be in the write path, and we should be validating every
|
||||
* superblock section:
|
||||
*/
|
||||
ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
|
||||
if (ret) {
|
||||
bch_err(c, "error writing marking filesystem clean: validate error");
|
||||
goto out;
|
||||
}
|
||||
|
||||
bch2_write_super(c);
|
||||
out:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
}
|
||||
|
||||
static int bch2_sb_clean_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f,
|
||||
struct printbuf *err)
|
||||
{
|
||||
struct bch_sb_field_clean *clean = field_to_type(f, clean);
|
||||
|
||||
if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
|
||||
prt_printf(err, "wrong size (got %zu should be %zu)",
|
||||
vstruct_bytes(&clean->field), sizeof(*clean));
|
||||
return -BCH_ERR_invalid_sb_clean;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_clean *clean = field_to_type(f, clean);
|
||||
struct jset_entry *entry;
|
||||
|
||||
prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
|
||||
prt_newline(out);
|
||||
prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
|
||||
prt_newline(out);
|
||||
|
||||
for (entry = clean->start;
|
||||
entry != vstruct_end(&clean->field);
|
||||
entry = vstruct_next(entry)) {
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
|
||||
!entry->u64s)
|
||||
continue;
|
||||
|
||||
bch2_journal_entry_to_text(out, NULL, entry);
|
||||
prt_newline(out);
|
||||
}
|
||||
}
|
||||
|
||||
static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
|
||||
.validate = bch2_sb_clean_validate,
|
||||
.to_text = bch2_sb_clean_to_text,
|
||||
};
|
||||
|
||||
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
|
||||
#define x(f, nr) \
|
||||
[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
|
||||
|
@ -121,19 +121,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
|
||||
};
|
||||
}
|
||||
|
||||
/* BCH_SB_FIELD_clean: */
|
||||
|
||||
void bch2_journal_super_entries_add_common(struct bch_fs *,
|
||||
struct jset_entry **, u64);
|
||||
|
||||
int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
|
||||
|
||||
void bch2_sb_maybe_downgrade(struct bch_fs *);
|
||||
void bch2_sb_upgrade(struct bch_fs *, unsigned);
|
||||
|
||||
int bch2_fs_mark_dirty(struct bch_fs *);
|
||||
void bch2_fs_mark_clean(struct bch_fs *);
|
||||
|
||||
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
|
||||
struct bch_sb_field *);
|
||||
void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "bkey_sort.h"
|
||||
#include "btree_cache.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_journal_iter.h"
|
||||
#include "btree_key_cache.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
@ -30,6 +31,8 @@
|
||||
#include "error.h"
|
||||
#include "fs.h"
|
||||
#include "fs-io.h"
|
||||
#include "fs-io-buffered.h"
|
||||
#include "fs-io-direct.h"
|
||||
#include "fsck.h"
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
@ -44,6 +47,7 @@
|
||||
#include "rebalance.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
#include "sb-clean.h"
|
||||
#include "subvolume.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
@ -469,6 +473,8 @@ static void __bch2_fs_free(struct bch_fs *c)
|
||||
bch2_fs_counters_exit(c);
|
||||
bch2_fs_snapshots_exit(c);
|
||||
bch2_fs_quota_exit(c);
|
||||
bch2_fs_fs_io_direct_exit(c);
|
||||
bch2_fs_fs_io_buffered_exit(c);
|
||||
bch2_fs_fsio_exit(c);
|
||||
bch2_fs_ec_exit(c);
|
||||
bch2_fs_encryption_exit(c);
|
||||
@ -844,7 +850,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
bch2_fs_encryption_init(c) ?:
|
||||
bch2_fs_compress_init(c) ?:
|
||||
bch2_fs_ec_init(c) ?:
|
||||
bch2_fs_fsio_init(c);
|
||||
bch2_fs_fsio_init(c) ?:
|
||||
bch2_fs_fs_io_buffered_init(c);
|
||||
bch2_fs_fs_io_direct_init(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
@ -2000,6 +2008,7 @@ err:
|
||||
BCH_DEBUG_PARAMS()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
||||
__maybe_unused
|
||||
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
|
||||
module_param_named(version, bch2_metadata_version, uint, 0400);
|
||||
|
||||
|
@ -8,220 +8,6 @@
|
||||
|
||||
#include <linux/math64.h>
|
||||
|
||||
static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
|
||||
{
|
||||
return div_u64(s, ca->mi.bucket_size);
|
||||
}
|
||||
|
||||
static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
|
||||
{
|
||||
return ((sector_t) b) * ca->mi.bucket_size;
|
||||
}
|
||||
|
||||
static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
|
||||
{
|
||||
u32 remainder;
|
||||
|
||||
div_u64_rem(s, ca->mi.bucket_size, &remainder);
|
||||
return remainder;
|
||||
}
|
||||
|
||||
static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
|
||||
u32 *offset)
|
||||
{
|
||||
return div_u64_rem(s, ca->mi.bucket_size, offset);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_is_online(struct bch_dev *ca)
|
||||
{
|
||||
return !percpu_ref_is_zero(&ca->io_ref);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
|
||||
{
|
||||
return bch2_dev_is_online(ca) &&
|
||||
ca->mi.state != BCH_MEMBER_STATE_failed;
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
|
||||
{
|
||||
if (!percpu_ref_tryget(&ca->io_ref))
|
||||
return false;
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
||||
(ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
|
||||
return true;
|
||||
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
|
||||
{
|
||||
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
|
||||
unsigned dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < devs.nr; i++)
|
||||
if (devs.devs[i] == dev)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
|
||||
unsigned dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < devs->nr; i++)
|
||||
if (devs->devs[i] == dev) {
|
||||
array_remove_item(devs->devs, devs->nr, i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
|
||||
unsigned dev)
|
||||
{
|
||||
if (!bch2_dev_list_has_dev(*devs, dev)) {
|
||||
BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
|
||||
devs->devs[devs->nr++] = dev;
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
|
||||
{
|
||||
return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
|
||||
}
|
||||
|
||||
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
|
||||
const struct bch_devs_mask *mask)
|
||||
{
|
||||
struct bch_dev *ca = NULL;
|
||||
|
||||
while ((*iter = mask
|
||||
? find_next_bit(mask->d, c->sb.nr_devices, *iter)
|
||||
: *iter) < c->sb.nr_devices &&
|
||||
!(ca = rcu_dereference_check(c->devs[*iter],
|
||||
lockdep_is_held(&c->state_lock))))
|
||||
(*iter)++;
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
#define for_each_member_device_rcu(ca, c, iter, mask) \
|
||||
for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
|
||||
|
||||
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
rcu_read_lock();
|
||||
if ((ca = __bch2_next_dev(c, iter, NULL)))
|
||||
percpu_ref_get(&ca->ref);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
/*
|
||||
* If you break early, you must drop your ref on the current device
|
||||
*/
|
||||
#define for_each_member_device(ca, c, iter) \
|
||||
for ((iter) = 0; \
|
||||
(ca = bch2_get_next_dev(c, &(iter))); \
|
||||
percpu_ref_put(&ca->ref), (iter)++)
|
||||
|
||||
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
|
||||
unsigned *iter,
|
||||
int state_mask)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
rcu_read_lock();
|
||||
while ((ca = __bch2_next_dev(c, iter, NULL)) &&
|
||||
(!((1 << ca->mi.state) & state_mask) ||
|
||||
!percpu_ref_tryget(&ca->io_ref)))
|
||||
(*iter)++;
|
||||
rcu_read_unlock();
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
#define __for_each_online_member(ca, c, iter, state_mask) \
|
||||
for ((iter) = 0; \
|
||||
(ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
|
||||
percpu_ref_put(&ca->io_ref), (iter)++)
|
||||
|
||||
#define for_each_online_member(ca, c, iter) \
|
||||
__for_each_online_member(ca, c, iter, ~0)
|
||||
|
||||
#define for_each_rw_member(ca, c, iter) \
|
||||
__for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
|
||||
|
||||
#define for_each_readable_member(ca, c, iter) \
|
||||
__for_each_online_member(ca, c, iter, \
|
||||
(1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
|
||||
|
||||
/*
|
||||
* If a key exists that references a device, the device won't be going away and
|
||||
* we can omit rcu_read_lock():
|
||||
*/
|
||||
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
|
||||
{
|
||||
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
||||
|
||||
return rcu_dereference_check(c->devs[idx], 1);
|
||||
}
|
||||
|
||||
static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
|
||||
{
|
||||
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
||||
|
||||
return rcu_dereference_protected(c->devs[idx],
|
||||
lockdep_is_held(&c->sb_lock) ||
|
||||
lockdep_is_held(&c->state_lock));
|
||||
}
|
||||
|
||||
/* XXX kill, move to struct bch_fs */
|
||||
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
|
||||
{
|
||||
struct bch_devs_mask devs;
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
memset(&devs, 0, sizeof(devs));
|
||||
for_each_online_member(ca, c, i)
|
||||
__set_bit(ca->dev_idx, devs.d);
|
||||
return devs;
|
||||
}
|
||||
|
||||
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
|
||||
{
|
||||
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
|
||||
u64 b_offset = bucket_to_sector(ca, b);
|
||||
u64 b_end = bucket_to_sector(ca, b + 1);
|
||||
unsigned i;
|
||||
|
||||
if (!b)
|
||||
return true;
|
||||
|
||||
for (i = 0; i < layout->nr_superblocks; i++) {
|
||||
u64 offset = le64_to_cpu(layout->sb_offset[i]);
|
||||
u64 end = offset + (1 << layout->sb_max_size_bits);
|
||||
|
||||
if (!(offset >= b_end || end <= b_offset))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
struct bch_fs *bch2_dev_to_fs(dev_t);
|
||||
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
|
||||
|
||||
|