mirror of
https://github.com/koverstreet/bcachefs-tools.git
synced 2025-02-22 00:00:03 +03:00
Update bcachefs sources to 9ceb982d77 bcachefs: Store bucket gens in a btree
This commit is contained in:
parent
e57a624feb
commit
63065c0128
@ -1 +1 @@
|
||||
4231dd5cf0f04dd61b0b8bae44a357da8331c0e2
|
||||
9ceb982d7790f552e2f5c96bebeab176516cf144
|
||||
|
@ -55,12 +55,6 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
|
||||
bucket_bytes(ca));
|
||||
}
|
||||
|
||||
/* Prios/gens: */
|
||||
for (i = 0; i < prio_buckets(ca); i++)
|
||||
range_add(&data,
|
||||
bucket_bytes(ca) * ca->prio_last_buckets[i],
|
||||
bucket_bytes(ca));
|
||||
|
||||
/* Btree: */
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
const struct bch_extent_ptr *ptr;
|
||||
@ -97,6 +91,7 @@ int cmd_dump(int argc, char *argv[])
|
||||
opts.nochanges = true;
|
||||
opts.noreplay = true;
|
||||
opts.errors = BCH_ON_ERROR_CONTINUE;
|
||||
opts.degraded = true;
|
||||
|
||||
while ((opt = getopt(argc, argv, "o:fh")) != -1)
|
||||
switch (opt) {
|
||||
@ -273,6 +268,7 @@ int cmd_list(int argc, char *argv[])
|
||||
opts.nochanges = true;
|
||||
opts.norecovery = true;
|
||||
opts.errors = BCH_ON_ERROR_CONTINUE;
|
||||
opts.degraded = true;
|
||||
|
||||
while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
|
||||
switch (opt) {
|
||||
|
@ -27,6 +27,8 @@ int cmd_fsck(int argc, char *argv[])
|
||||
const char *err;
|
||||
int opt;
|
||||
|
||||
opts.degraded = true;
|
||||
|
||||
while ((opt = getopt(argc, argv, "pynfvh")) != -1)
|
||||
switch (opt) {
|
||||
case 'p':
|
||||
|
@ -333,7 +333,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
|
||||
die("error reserving space in new filesystem: %s",
|
||||
strerror(-ret));
|
||||
|
||||
bch2_check_mark_super(c, &e->k_i, false);
|
||||
bch2_check_mark_super(c, extent_i_to_s_c(e), false);
|
||||
|
||||
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
|
||||
&res, NULL, NULL, 0);
|
||||
|
@ -112,6 +112,11 @@ static inline unsigned long hweight_long(unsigned long w)
|
||||
return __builtin_popcountl(w);
|
||||
}
|
||||
|
||||
static inline unsigned long hweight8(unsigned long w)
|
||||
{
|
||||
return __builtin_popcountl(w);
|
||||
}
|
||||
|
||||
/**
|
||||
* rol64 - rotate a 64-bit value left
|
||||
* @word: value to rotate
|
||||
|
@ -176,10 +176,8 @@ struct bch_sb *bch2_format(struct format_opts opts,
|
||||
SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size);
|
||||
SET_BCH_SB_GC_RESERVE(sb, 8);
|
||||
SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
|
||||
SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas);
|
||||
SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required);
|
||||
SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
|
||||
SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas);
|
||||
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
|
||||
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
|
||||
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
|
||||
@ -339,9 +337,9 @@ void bch2_super_print(struct bch_sb *sb, int units)
|
||||
|
||||
BCH_SB_CLEAN(sb),
|
||||
|
||||
BCH_SB_META_REPLICAS_HAVE(sb),
|
||||
0LLU, //BCH_SB_META_REPLICAS_HAVE(sb),
|
||||
BCH_SB_META_REPLICAS_WANT(sb),
|
||||
BCH_SB_DATA_REPLICAS_HAVE(sb),
|
||||
0LLU, //BCH_SB_DATA_REPLICAS_HAVE(sb),
|
||||
BCH_SB_DATA_REPLICAS_WANT(sb),
|
||||
|
||||
BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR
|
||||
@ -405,8 +403,8 @@ void bch2_super_print(struct bch_sb *sb, int units)
|
||||
: "unknown",
|
||||
|
||||
BCH_MEMBER_TIER(m),
|
||||
BCH_MEMBER_HAS_METADATA(m),
|
||||
BCH_MEMBER_HAS_DATA(m),
|
||||
0LLU, //BCH_MEMBER_HAS_METADATA(m),
|
||||
0LLU, //BCH_MEMBER_HAS_DATA(m),
|
||||
|
||||
BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
|
||||
? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
|
||||
|
1004
libbcachefs/alloc.c
1004
libbcachefs/alloc.c
File diff suppressed because it is too large
Load Diff
@ -10,24 +10,14 @@ struct bch_dev;
|
||||
struct bch_fs;
|
||||
struct dev_group;
|
||||
|
||||
static inline size_t prios_per_bucket(const struct bch_dev *ca)
|
||||
{
|
||||
return (bucket_bytes(ca) - sizeof(struct prio_set)) /
|
||||
sizeof(struct bucket_disk);
|
||||
}
|
||||
|
||||
static inline size_t prio_buckets(const struct bch_dev *ca)
|
||||
{
|
||||
return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
|
||||
}
|
||||
|
||||
void bch2_dev_group_remove(struct dev_group *, struct bch_dev *);
|
||||
void bch2_dev_group_add(struct dev_group *, struct bch_dev *);
|
||||
|
||||
int bch2_prio_read(struct bch_dev *);
|
||||
int bch2_prio_write(struct bch_dev *);
|
||||
int bch2_alloc_read(struct bch_fs *, struct list_head *);
|
||||
int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *);
|
||||
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
|
||||
|
||||
size_t bch2_bucket_alloc(struct bch_dev *, enum alloc_reserve);
|
||||
long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
|
||||
|
||||
void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
|
||||
|
||||
@ -80,8 +70,15 @@ static inline struct bch_dev *dev_group_next(struct dev_group *devs,
|
||||
(_ptr)++)
|
||||
|
||||
void bch2_recalc_capacity(struct bch_fs *);
|
||||
|
||||
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
void bch2_dev_allocator_stop(struct bch_dev *);
|
||||
int bch2_dev_allocator_start(struct bch_dev *);
|
||||
|
||||
void bch2_fs_allocator_init(struct bch_fs *);
|
||||
|
||||
extern const struct bkey_ops bch2_bkey_alloc_ops;
|
||||
|
||||
#endif /* _BCACHE_ALLOC_H */
|
||||
|
@ -35,20 +35,13 @@ struct prio_clock {
|
||||
/* There is one reserve for each type of btree, one for prios and gens
|
||||
* and one for moving GC */
|
||||
enum alloc_reserve {
|
||||
RESERVE_PRIO,
|
||||
RESERVE_BTREE,
|
||||
RESERVE_METADATA_LAST = RESERVE_BTREE,
|
||||
RESERVE_MOVINGGC,
|
||||
|
||||
RESERVE_NONE,
|
||||
RESERVE_NR,
|
||||
RESERVE_ALLOC = -1,
|
||||
RESERVE_BTREE = 0,
|
||||
RESERVE_MOVINGGC = 1,
|
||||
RESERVE_NONE = 2,
|
||||
RESERVE_NR = 3,
|
||||
};
|
||||
|
||||
static inline bool allocation_is_metadata(enum alloc_reserve id)
|
||||
{
|
||||
return id <= RESERVE_METADATA_LAST;
|
||||
}
|
||||
|
||||
struct dev_group {
|
||||
spinlock_t lock;
|
||||
unsigned nr;
|
||||
|
@ -305,7 +305,7 @@ do { \
|
||||
(btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
|
||||
|
||||
/* Size of the freelist we allocate btree nodes from: */
|
||||
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
|
||||
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
|
||||
|
||||
struct btree;
|
||||
struct crypto_blkcipher;
|
||||
@ -329,13 +329,23 @@ struct bch_member_cpu {
|
||||
u16 bucket_size; /* sectors */
|
||||
u8 state;
|
||||
u8 tier;
|
||||
u8 has_metadata;
|
||||
u8 has_data;
|
||||
u8 replacement;
|
||||
u8 discard;
|
||||
u8 valid;
|
||||
};
|
||||
|
||||
struct bch_replicas_cpu_entry {
|
||||
u8 data_type;
|
||||
u8 devs[BCH_SB_MEMBERS_MAX / 8];
|
||||
};
|
||||
|
||||
struct bch_replicas_cpu {
|
||||
struct rcu_head rcu;
|
||||
unsigned nr;
|
||||
unsigned entry_size;
|
||||
struct bch_replicas_cpu_entry entries[];
|
||||
};
|
||||
|
||||
struct bch_dev {
|
||||
struct kobject kobj;
|
||||
struct percpu_ref ref;
|
||||
@ -363,21 +373,7 @@ struct bch_dev {
|
||||
|
||||
struct task_struct *alloc_thread;
|
||||
|
||||
struct prio_set *disk_buckets;
|
||||
|
||||
/*
|
||||
* When allocating new buckets, prio_write() gets first dibs - since we
|
||||
* may not be allocate at all without writing priorities and gens.
|
||||
* prio_last_buckets[] contains the last buckets we wrote priorities to
|
||||
* (so gc can mark them as metadata).
|
||||
*/
|
||||
u64 *prio_buckets;
|
||||
u64 *prio_last_buckets;
|
||||
spinlock_t prio_buckets_lock;
|
||||
struct bio *bio_prio;
|
||||
bool prio_read_done;
|
||||
bool need_prio_write;
|
||||
struct mutex prio_write_lock;
|
||||
bool need_alloc_write;
|
||||
|
||||
/*
|
||||
* free: Buckets that are ready to be used
|
||||
@ -391,6 +387,7 @@ struct bch_dev {
|
||||
DECLARE_FIFO(long, free)[RESERVE_NR];
|
||||
DECLARE_FIFO(long, free_inc);
|
||||
spinlock_t freelist_lock;
|
||||
bool alloc_thread_started;
|
||||
|
||||
size_t fifo_last_bucket;
|
||||
|
||||
@ -415,6 +412,8 @@ struct bch_dev {
|
||||
atomic_long_t saturated_count;
|
||||
size_t inc_gen_needs_gc;
|
||||
size_t inc_gen_really_needs_gc;
|
||||
u64 allocator_journal_seq_flush;
|
||||
bool allocator_invalidating_data;
|
||||
|
||||
alloc_heap alloc_heap;
|
||||
bucket_heap copygc_heap;
|
||||
@ -458,6 +457,7 @@ enum {
|
||||
BCH_FS_FSCK_FIXED_ERRORS,
|
||||
BCH_FS_FSCK_DONE,
|
||||
BCH_FS_FIXED_GENS,
|
||||
BCH_FS_REBUILD_REPLICAS,
|
||||
};
|
||||
|
||||
struct btree_debug {
|
||||
@ -507,6 +507,10 @@ struct bch_fs {
|
||||
|
||||
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
|
||||
|
||||
struct bch_replicas_cpu __rcu *replicas;
|
||||
struct bch_replicas_cpu __rcu *replicas_gc;
|
||||
struct mutex replicas_gc_lock;
|
||||
|
||||
struct bch_opts opts;
|
||||
|
||||
/* Updated by bch2_sb_update():*/
|
||||
@ -520,9 +524,6 @@ struct bch_fs {
|
||||
u8 nr_devices;
|
||||
u8 clean;
|
||||
|
||||
u8 meta_replicas_have;
|
||||
u8 data_replicas_have;
|
||||
|
||||
u8 str_hash_type;
|
||||
u8 encryption_type;
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
#define _BCACHEFS_FORMAT_H
|
||||
|
||||
/*
|
||||
* Bcache on disk data structures
|
||||
* bcachefs on disk data structures
|
||||
*/
|
||||
|
||||
#include <asm/types.h>
|
||||
@ -714,6 +714,25 @@ struct bch_xattr {
|
||||
} __attribute__((packed, aligned(8)));
|
||||
BKEY_VAL_TYPE(xattr, BCH_XATTR);
|
||||
|
||||
/* Bucket/allocation information: */
|
||||
|
||||
enum {
|
||||
BCH_ALLOC = 128,
|
||||
};
|
||||
|
||||
enum {
|
||||
BCH_ALLOC_FIELD_READ_TIME = 0,
|
||||
BCH_ALLOC_FIELD_WRITE_TIME = 1,
|
||||
};
|
||||
|
||||
struct bch_alloc {
|
||||
struct bch_val v;
|
||||
__u8 fields;
|
||||
__u8 gen;
|
||||
__u8 data[];
|
||||
} __attribute__((packed, aligned(8)));
|
||||
BKEY_VAL_TYPE(alloc, BCH_ALLOC);
|
||||
|
||||
/* Superblock */
|
||||
|
||||
/* Version 0: Cache device
|
||||
@ -752,8 +771,7 @@ struct bch_member {
|
||||
|
||||
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
|
||||
LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8)
|
||||
LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9)
|
||||
LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10)
|
||||
/* 8-10 unused, was HAS_(META)DATA */
|
||||
LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
|
||||
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15);
|
||||
|
||||
@ -800,7 +818,8 @@ enum bch_sb_field_type {
|
||||
BCH_SB_FIELD_journal = 0,
|
||||
BCH_SB_FIELD_members = 1,
|
||||
BCH_SB_FIELD_crypt = 2,
|
||||
BCH_SB_FIELD_NR = 3,
|
||||
BCH_SB_FIELD_replicas = 3,
|
||||
BCH_SB_FIELD_NR = 4,
|
||||
};
|
||||
|
||||
struct bch_sb_field_journal {
|
||||
@ -861,8 +880,24 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
|
||||
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
|
||||
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
|
||||
|
||||
struct bch_sb_field_replication {
|
||||
enum bch_data_types {
|
||||
BCH_DATA_NONE = 0,
|
||||
BCH_DATA_SB = 1,
|
||||
BCH_DATA_JOURNAL = 2,
|
||||
BCH_DATA_BTREE = 3,
|
||||
BCH_DATA_USER = 4,
|
||||
BCH_DATA_NR = 5,
|
||||
};
|
||||
|
||||
struct bch_replicas_entry {
|
||||
u8 data_type;
|
||||
u8 nr;
|
||||
u8 devs[0];
|
||||
};
|
||||
|
||||
struct bch_sb_field_replicas {
|
||||
struct bch_sb_field field;
|
||||
struct bch_replicas_entry entries[0];
|
||||
};
|
||||
|
||||
/*
|
||||
@ -937,8 +972,7 @@ LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
|
||||
LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
|
||||
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
|
||||
|
||||
LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60);
|
||||
LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64);
|
||||
/* 56-64 unused, was REPLICAS_HAVE */
|
||||
|
||||
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
|
||||
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8);
|
||||
@ -946,6 +980,7 @@ LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
|
||||
|
||||
LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
|
||||
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
|
||||
|
||||
/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
|
||||
|
||||
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
|
||||
@ -1003,77 +1038,6 @@ enum bch_compression_opts {
|
||||
BCH_COMPRESSION_NR = 3,
|
||||
};
|
||||
|
||||
/* backing device specific stuff: */
|
||||
|
||||
struct backingdev_sb {
|
||||
__le64 csum;
|
||||
__le64 offset; /* sector where this sb was written */
|
||||
__le64 version; /* of on disk format */
|
||||
|
||||
uuid_le magic; /* bcachefs superblock UUID */
|
||||
|
||||
uuid_le disk_uuid;
|
||||
|
||||
/*
|
||||
* Internal cache set UUID - xored with various magic numbers and thus
|
||||
* must never change:
|
||||
*/
|
||||
union {
|
||||
uuid_le set_uuid;
|
||||
__le64 set_magic;
|
||||
};
|
||||
__u8 label[BCH_SB_LABEL_SIZE];
|
||||
|
||||
__le64 flags;
|
||||
|
||||
/* Incremented each time superblock is written: */
|
||||
__le64 seq;
|
||||
|
||||
/*
|
||||
* User visible UUID for identifying the cache set the user is allowed
|
||||
* to change:
|
||||
*
|
||||
* XXX hooked up?
|
||||
*/
|
||||
uuid_le user_uuid;
|
||||
__le64 pad1[6];
|
||||
|
||||
__le64 data_offset;
|
||||
__le16 block_size; /* sectors */
|
||||
__le16 pad2[3];
|
||||
|
||||
__le32 last_mount; /* time_t */
|
||||
__le16 pad3;
|
||||
/* size of variable length portion - always 0 for backingdev superblock */
|
||||
__le16 u64s;
|
||||
__u64 _data[0];
|
||||
};
|
||||
|
||||
LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4);
|
||||
#define CACHE_MODE_WRITETHROUGH 0U
|
||||
#define CACHE_MODE_WRITEBACK 1U
|
||||
#define CACHE_MODE_WRITEAROUND 2U
|
||||
#define CACHE_MODE_NONE 3U
|
||||
|
||||
LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63);
|
||||
#define BDEV_STATE_NONE 0U
|
||||
#define BDEV_STATE_CLEAN 1U
|
||||
#define BDEV_STATE_DIRTY 2U
|
||||
#define BDEV_STATE_STALE 3U
|
||||
|
||||
#define BDEV_DATA_START_DEFAULT 16 /* sectors */
|
||||
|
||||
static inline _Bool __SB_IS_BDEV(__u64 version)
|
||||
{
|
||||
return version == BCACHE_SB_VERSION_BDEV
|
||||
|| version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
|
||||
}
|
||||
|
||||
static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
|
||||
{
|
||||
return __SB_IS_BDEV(sb->version);
|
||||
}
|
||||
|
||||
/*
|
||||
* Magic numbers
|
||||
*
|
||||
@ -1088,7 +1052,6 @@ static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
|
||||
#define BCACHE_STATFS_MAGIC 0xca451a4e
|
||||
|
||||
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
|
||||
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
|
||||
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
|
||||
|
||||
static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
|
||||
@ -1103,11 +1066,6 @@ static inline __u64 __jset_magic(struct bch_sb *sb)
|
||||
return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
|
||||
}
|
||||
|
||||
static inline __u64 __pset_magic(struct bch_sb *sb)
|
||||
{
|
||||
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
|
||||
}
|
||||
|
||||
static inline __u64 __bset_magic(struct bch_sb *sb)
|
||||
{
|
||||
return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
|
||||
@ -1136,9 +1094,9 @@ struct jset_entry {
|
||||
|
||||
LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8);
|
||||
enum {
|
||||
JOURNAL_ENTRY_BTREE_KEYS = 0,
|
||||
JOURNAL_ENTRY_BTREE_ROOT = 1,
|
||||
JOURNAL_ENTRY_PRIO_PTRS = 2,
|
||||
JOURNAL_ENTRY_BTREE_KEYS = 0,
|
||||
JOURNAL_ENTRY_BTREE_ROOT = 1,
|
||||
JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */
|
||||
|
||||
/*
|
||||
* Journal sequence numbers can be blacklisted: bsets record the max
|
||||
@ -1150,7 +1108,7 @@ enum {
|
||||
* and then record that we skipped it so that the next time we crash and
|
||||
* recover we don't think there was a missing journal entry.
|
||||
*/
|
||||
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
|
||||
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -1193,35 +1151,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
|
||||
|
||||
#define BCH_JOURNAL_BUCKETS_MIN 20
|
||||
|
||||
/* Bucket prios/gens */
|
||||
|
||||
struct prio_set {
|
||||
struct bch_csum csum;
|
||||
|
||||
__le64 magic;
|
||||
__le32 nonce[3];
|
||||
__le16 version;
|
||||
__le16 flags;
|
||||
|
||||
__u8 encrypted_start[0];
|
||||
|
||||
__le64 next_bucket;
|
||||
|
||||
struct bucket_disk {
|
||||
__le16 prio[2];
|
||||
__u8 gen;
|
||||
} __attribute__((packed)) data[];
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
|
||||
|
||||
/* Btree: */
|
||||
|
||||
#define DEFINE_BCH_BTREE_IDS() \
|
||||
DEF_BTREE_ID(EXTENTS, 0, "extents") \
|
||||
DEF_BTREE_ID(INODES, 1, "inodes") \
|
||||
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
|
||||
DEF_BTREE_ID(XATTRS, 3, "xattrs")
|
||||
DEF_BTREE_ID(EXTENTS, 0, "extents") \
|
||||
DEF_BTREE_ID(INODES, 1, "inodes") \
|
||||
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
|
||||
DEF_BTREE_ID(XATTRS, 3, "xattrs") \
|
||||
DEF_BTREE_ID(ALLOC, 4, "alloc")
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
|
||||
|
||||
@ -1318,4 +1255,33 @@ struct btree_node_entry {
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
/* Obsolete: */
|
||||
|
||||
struct prio_set {
|
||||
struct bch_csum csum;
|
||||
|
||||
__le64 magic;
|
||||
__le32 nonce[3];
|
||||
__le16 version;
|
||||
__le16 flags;
|
||||
|
||||
__u8 encrypted_start[0];
|
||||
|
||||
__le64 next_bucket;
|
||||
|
||||
struct bucket_disk {
|
||||
__le16 prio[2];
|
||||
__u8 gen;
|
||||
} __attribute__((packed)) data[];
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
|
||||
|
||||
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
|
||||
|
||||
static inline __u64 __pset_magic(struct bch_sb *sb)
|
||||
{
|
||||
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_FORMAT_H */
|
||||
|
@ -1,13 +1,9 @@
|
||||
#ifndef _LINUX_BCACHE_IOCTL_H
|
||||
#define _LINUX_BCACHE_IOCTL_H
|
||||
#ifndef _BCACHEFS_IOCTL_H
|
||||
#define _BCACHEFS_IOCTL_H
|
||||
|
||||
#include <linux/uuid.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
|
||||
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
|
||||
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
|
||||
@ -97,8 +93,4 @@ struct bch_ioctl_data {
|
||||
__u64 end_offset;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_BCACHE_IOCTL_H */
|
||||
#endif /* _BCACHEFS_IOCTL_H */
|
||||
|
@ -580,6 +580,8 @@ BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
|
||||
|
||||
BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
|
||||
|
||||
BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
|
||||
|
||||
/* byte order helpers */
|
||||
|
||||
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "bcachefs.h"
|
||||
#include "bkey_methods.h"
|
||||
#include "btree_types.h"
|
||||
#include "alloc.h"
|
||||
#include "dirent.h"
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
@ -13,6 +14,7 @@ const struct bkey_ops *bch2_bkey_ops[] = {
|
||||
[BKEY_TYPE_INODES] = &bch2_bkey_inode_ops,
|
||||
[BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
|
||||
[BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
|
||||
[BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
|
||||
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
|
||||
};
|
||||
|
||||
|
@ -129,6 +129,8 @@ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
|
||||
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
enum bch_data_types data_type = type == BKEY_TYPE_BTREE
|
||||
? BCH_DATA_BTREE : BCH_DATA_USER;
|
||||
int ret = 0;
|
||||
|
||||
switch (k.k->type) {
|
||||
@ -137,6 +139,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
|
||||
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
|
||||
(!c->opts.nofsck &&
|
||||
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
|
||||
"superblock not marked as containing replicas"))) {
|
||||
ret = bch2_check_mark_super(c, e, data_type);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
extent_for_each_ptr(e, ptr) {
|
||||
struct bch_dev *ca = c->devs[ptr->dev];
|
||||
struct bucket *g = PTR_BUCKET(ca, ptr);
|
||||
@ -147,7 +158,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
new.gen = ptr->gen;
|
||||
new.gen_valid = 1;
|
||||
}));
|
||||
ca->need_prio_write = true;
|
||||
ca->need_alloc_write = true;
|
||||
}
|
||||
|
||||
if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
|
||||
@ -159,7 +170,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
new.gen = ptr->gen;
|
||||
new.gen_valid = 1;
|
||||
}));
|
||||
ca->need_prio_write = true;
|
||||
ca->need_alloc_write = true;
|
||||
set_bit(BCH_FS_FIXED_GENS, &c->flags);
|
||||
}
|
||||
|
||||
@ -168,6 +179,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
atomic64_set(&c->key_version,
|
||||
max_t(u64, k.k->version.lo,
|
||||
atomic64_read(&c->key_version)));
|
||||
@ -348,17 +360,6 @@ void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
|
||||
}
|
||||
|
||||
spin_unlock(&c->journal.lock);
|
||||
|
||||
spin_lock(&ca->prio_buckets_lock);
|
||||
|
||||
for (i = 0; i < prio_buckets(ca) * 2; i++) {
|
||||
b = ca->prio_buckets[i];
|
||||
if (b)
|
||||
bch2_mark_metadata_bucket(ca, ca->buckets + b,
|
||||
BUCKET_PRIOS, true);
|
||||
}
|
||||
|
||||
spin_unlock(&ca->prio_buckets_lock);
|
||||
}
|
||||
|
||||
static void bch2_mark_metadata(struct bch_fs *c)
|
||||
@ -474,10 +475,6 @@ void bch2_gc(struct bch_fs *c)
|
||||
* move around - if references move backwards in the ordering GC
|
||||
* uses, GC could skip past them
|
||||
*/
|
||||
|
||||
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
|
||||
return;
|
||||
|
||||
trace_gc_start(c);
|
||||
|
||||
/*
|
||||
@ -487,6 +484,8 @@ void bch2_gc(struct bch_fs *c)
|
||||
bch2_recalc_sectors_available(c);
|
||||
|
||||
down_write(&c->gc_lock);
|
||||
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
|
||||
goto out;
|
||||
|
||||
bch2_gc_start(c);
|
||||
|
||||
@ -502,8 +501,7 @@ void bch2_gc(struct bch_fs *c)
|
||||
if (ret) {
|
||||
bch_err(c, "btree gc failed: %d", ret);
|
||||
set_bit(BCH_FS_GC_FAILURE, &c->flags);
|
||||
up_write(&c->gc_lock);
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
|
||||
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
|
||||
@ -518,7 +516,7 @@ void bch2_gc(struct bch_fs *c)
|
||||
/* Indicates that gc is no longer in progress: */
|
||||
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
|
||||
c->gc_count++;
|
||||
|
||||
out:
|
||||
up_write(&c->gc_lock);
|
||||
trace_gc_end(c);
|
||||
bch2_time_stats_update(&c->btree_gc_time, start_time);
|
||||
@ -529,6 +527,12 @@ void bch2_gc(struct bch_fs *c)
|
||||
*/
|
||||
for_each_member_device(ca, c, i)
|
||||
bch2_wake_allocator(ca);
|
||||
|
||||
/*
|
||||
* At startup, allocations can happen directly instead of via the
|
||||
* allocator thread - issue wakeup in case they blocked on gc_lock:
|
||||
*/
|
||||
closure_wake_up(&c->freelist_wait);
|
||||
}
|
||||
|
||||
/* Btree coalescing */
|
||||
@ -997,6 +1001,14 @@ int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
|
||||
unsigned iter = 0;
|
||||
enum btree_id id;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (!bch2_sb_get_replicas(c->disk_sb)) {
|
||||
if (BCH_SB_INITIALIZED(c->disk_sb))
|
||||
bch_info(c, "building replicas info");
|
||||
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
|
||||
}
|
||||
mutex_unlock(&c->sb_lock);
|
||||
again:
|
||||
bch2_gc_start(c);
|
||||
|
||||
@ -1006,11 +1018,9 @@ again:
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (journal) {
|
||||
ret = bch2_journal_mark(c, journal);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
ret = bch2_journal_mark(c, journal);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_mark_metadata(c);
|
||||
|
||||
|
@ -1402,7 +1402,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
|
||||
|
||||
ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
|
||||
if (ret)
|
||||
bch2_fatal_error(c);
|
||||
bch2_inconsistent_error(c);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -233,17 +233,29 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
|
||||
}
|
||||
|
||||
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
||||
bool use_reserve,
|
||||
struct disk_reservation *res,
|
||||
struct closure *cl)
|
||||
struct disk_reservation *res,
|
||||
struct closure *cl,
|
||||
unsigned flags)
|
||||
{
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct open_bucket *ob;
|
||||
struct btree *b;
|
||||
unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
|
||||
unsigned nr_reserve;
|
||||
enum alloc_reserve alloc_reserve;
|
||||
|
||||
if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
|
||||
nr_reserve = 0;
|
||||
alloc_reserve = RESERVE_ALLOC;
|
||||
} else if (flags & BTREE_INSERT_USE_RESERVE) {
|
||||
nr_reserve = BTREE_NODE_RESERVE / 2;
|
||||
alloc_reserve = RESERVE_BTREE;
|
||||
} else {
|
||||
nr_reserve = BTREE_NODE_RESERVE;
|
||||
alloc_reserve = RESERVE_NONE;
|
||||
}
|
||||
|
||||
mutex_lock(&c->btree_reserve_cache_lock);
|
||||
if (c->btree_reserve_cache_nr > reserve) {
|
||||
if (c->btree_reserve_cache_nr > nr_reserve) {
|
||||
struct btree_alloc *a =
|
||||
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
|
||||
|
||||
@ -263,8 +275,7 @@ retry:
|
||||
bkey_i_to_extent(&tmp.k),
|
||||
res->nr_replicas,
|
||||
c->opts.metadata_replicas_required,
|
||||
use_reserve ? RESERVE_BTREE : RESERVE_NONE,
|
||||
cl);
|
||||
alloc_reserve, cl);
|
||||
if (IS_ERR(ob))
|
||||
return ERR_CAST(ob);
|
||||
|
||||
@ -311,7 +322,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
|
||||
|
||||
bch2_btree_build_aux_trees(b);
|
||||
|
||||
bch2_check_mark_super(c, &b->key, true);
|
||||
bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE);
|
||||
|
||||
trace_btree_node_alloc(c, b);
|
||||
return b;
|
||||
@ -533,9 +544,6 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
|
||||
if (flags & BTREE_INSERT_NOFAIL)
|
||||
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
|
||||
|
||||
if (flags & BTREE_INSERT_NOWAIT)
|
||||
cl = NULL;
|
||||
|
||||
/*
|
||||
* This check isn't necessary for correctness - it's just to potentially
|
||||
* prevent us from doing a lot of work that'll end up being wasted:
|
||||
@ -565,8 +573,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
|
||||
reserve->nr = 0;
|
||||
|
||||
while (reserve->nr < nr_nodes) {
|
||||
b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
|
||||
&disk_res, cl);
|
||||
b = __bch2_btree_node_alloc(c, &disk_res,
|
||||
flags & BTREE_INSERT_NOWAIT
|
||||
? NULL : cl, flags);
|
||||
if (IS_ERR(b)) {
|
||||
ret = PTR_ERR(b);
|
||||
goto err_free;
|
||||
@ -793,8 +802,8 @@ void bch2_btree_journal_key(struct btree_insert *trans,
|
||||
struct btree_write *w = btree_current_write(b);
|
||||
|
||||
EBUG_ON(iter->level || b->level);
|
||||
EBUG_ON(!trans->journal_res.ref &&
|
||||
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
|
||||
EBUG_ON(trans->journal_res.ref !=
|
||||
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
|
||||
|
||||
if (!journal_pin_active(&w->journal))
|
||||
bch2_journal_pin_add(j, &trans->journal_res,
|
||||
@ -1026,6 +1035,27 @@ retry:
|
||||
*/
|
||||
six_unlock_read(&b->lock);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* Bit of funny circularity going on here we have to break:
|
||||
*
|
||||
* We have to drop our journal pin before writing the journal
|
||||
* entry that points to the new btree root: else, we could
|
||||
* deadlock if the journal currently happens to be full.
|
||||
*
|
||||
* This mean we're dropping the journal pin _before_ the new
|
||||
* nodes are technically reachable - but this is safe, because
|
||||
* after the bch2_btree_set_root_ondisk() call above they will
|
||||
* be reachable as of the very next journal write:
|
||||
*/
|
||||
bch2_journal_pin_drop(&c->journal, &as->journal);
|
||||
|
||||
/*
|
||||
* And, do a journal write to write the pointer to the new root,
|
||||
* then wait for it to complete before freeing the nodes we
|
||||
* replaced:
|
||||
*/
|
||||
bch2_journal_meta_async(&c->journal, cl);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1051,19 +1081,70 @@ static void btree_interior_update_updated_btree(struct bch_fs *c,
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* In general, when you're staging things in a journal that will later
|
||||
* be written elsewhere, and you also want to guarantee ordering: that
|
||||
* is, if you have updates a, b, c, after a crash you should never see c
|
||||
* and not a or b - there's a problem:
|
||||
*
|
||||
* If the final destination of the update(s) (i.e. btree node) can be
|
||||
* written/flushed _before_ the relevant journal entry - oops, that
|
||||
* breaks ordering, since the various leaf nodes can be written in any
|
||||
* order.
|
||||
*
|
||||
* Normally we use bset->journal_seq to deal with this - if during
|
||||
* recovery we find a btree node write that's newer than the newest
|
||||
* journal entry, we just ignore it - we don't need it, anything we're
|
||||
* supposed to have (that we reported as completed via fsync()) will
|
||||
* still be in the journal, and as far as the state of the journal is
|
||||
* concerned that btree node write never happened.
|
||||
*
|
||||
* That breaks when we're rewriting/splitting/merging nodes, since we're
|
||||
* mixing btree node writes that haven't happened yet with previously
|
||||
* written data that has been reported as completed to the journal.
|
||||
*
|
||||
* Thus, before making the new nodes reachable, we have to wait the
|
||||
* newest journal sequence number we have data for to be written (if it
|
||||
* hasn't been yet).
|
||||
*/
|
||||
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
|
||||
|
||||
continue_at(&as->cl, btree_interior_update_nodes_written,
|
||||
system_freezable_wq);
|
||||
}
|
||||
|
||||
static void btree_interior_update_reparent(struct btree_interior_update *as,
|
||||
static void interior_update_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
struct btree_interior_update *as =
|
||||
container_of(pin, struct btree_interior_update, journal);
|
||||
|
||||
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
|
||||
}
|
||||
|
||||
static void btree_interior_update_reparent(struct bch_fs *c,
|
||||
struct btree_interior_update *as,
|
||||
struct btree_interior_update *child)
|
||||
{
|
||||
child->b = NULL;
|
||||
child->mode = BTREE_INTERIOR_UPDATING_AS;
|
||||
child->parent_as = as;
|
||||
closure_get(&as->cl);
|
||||
|
||||
/*
|
||||
* When we write a new btree root, we have to drop our journal pin
|
||||
* _before_ the new nodes are technically reachable; see
|
||||
* btree_interior_update_nodes_written().
|
||||
*
|
||||
* This goes for journal pins that are recursively blocked on us - so,
|
||||
* just transfer the journal pin to the new interior update so
|
||||
* btree_interior_update_nodes_written() can drop it.
|
||||
*/
|
||||
bch2_journal_pin_add_if_older(&c->journal, &child->journal,
|
||||
&as->journal, interior_update_flush);
|
||||
bch2_journal_pin_drop(&c->journal, &child->journal);
|
||||
|
||||
as->journal_seq = max(as->journal_seq, child->journal_seq);
|
||||
}
|
||||
|
||||
static void btree_interior_update_updated_root(struct bch_fs *c,
|
||||
@ -1081,7 +1162,7 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
|
||||
* btree_interior_update operation to point to us:
|
||||
*/
|
||||
if (r->as)
|
||||
btree_interior_update_reparent(as, r->as);
|
||||
btree_interior_update_reparent(c, as, r->as);
|
||||
|
||||
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
|
||||
as->b = r->b;
|
||||
@ -1089,19 +1170,21 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* When we're rewriting nodes and updating interior nodes, there's an
|
||||
* issue with updates that haven't been written in the journal getting
|
||||
* mixed together with older data - see * btree_interior_update_updated_btree()
|
||||
* for the explanation.
|
||||
*
|
||||
* However, this doesn't affect us when we're writing a new btree root -
|
||||
* because to make that new root reachable we have to write out a new
|
||||
* journal entry, which must necessarily be newer than as->journal_seq.
|
||||
*/
|
||||
|
||||
continue_at(&as->cl, btree_interior_update_nodes_written,
|
||||
system_freezable_wq);
|
||||
}
|
||||
|
||||
static void interior_update_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
struct btree_interior_update *as =
|
||||
container_of(pin, struct btree_interior_update, journal);
|
||||
|
||||
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* @b is being split/rewritten: it may have pointers to not-yet-written btree
|
||||
* nodes and thus outstanding btree_interior_updates - redirect @b's
|
||||
@ -1150,7 +1233,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
|
||||
*/
|
||||
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
|
||||
list_del(&p->write_blocked_list);
|
||||
btree_interior_update_reparent(as, p);
|
||||
btree_interior_update_reparent(c, as, p);
|
||||
}
|
||||
|
||||
clear_btree_node_dirty(b);
|
||||
|
@ -373,16 +373,20 @@ int __bch2_btree_insert_at(struct btree_insert *);
|
||||
|
||||
/* for copygc, or when merging btree nodes */
|
||||
#define BTREE_INSERT_USE_RESERVE (1 << 2)
|
||||
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
|
||||
|
||||
/*
|
||||
* Insert is for journal replay: don't get journal reservations, or mark extents
|
||||
* (bch_mark_key)
|
||||
*/
|
||||
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
|
||||
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4)
|
||||
|
||||
/* Don't block on allocation failure (for new btree nodes: */
|
||||
#define BTREE_INSERT_NOWAIT (1 << 4)
|
||||
#define BTREE_INSERT_GC_LOCK_HELD (1 << 5)
|
||||
#define BTREE_INSERT_NOWAIT (1 << 5)
|
||||
#define BTREE_INSERT_GC_LOCK_HELD (1 << 6)
|
||||
|
||||
#define BCH_HASH_SET_MUST_CREATE (1 << 7)
|
||||
#define BCH_HASH_SET_MUST_REPLACE (1 << 8)
|
||||
|
||||
int bch2_btree_delete_at(struct btree_iter *, unsigned);
|
||||
|
||||
|
@ -306,14 +306,18 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
|
||||
_old; \
|
||||
})
|
||||
|
||||
void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
|
||||
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
struct bucket_mark *old)
|
||||
{
|
||||
struct bch_fs_usage stats = { 0 };
|
||||
struct bucket_mark old, new;
|
||||
struct bucket_mark new;
|
||||
|
||||
*old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
if (!is_available_bucket(new))
|
||||
return false;
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
new.owned_by_allocator = 1;
|
||||
new.had_metadata = 0;
|
||||
new.touched_this_mount = 1;
|
||||
new.data_type = 0;
|
||||
new.cached_sectors = 0;
|
||||
new.dirty_sectors = 0;
|
||||
@ -321,11 +325,28 @@ void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
|
||||
}));
|
||||
|
||||
/* XXX: we're not actually updating fs usage's cached sectors... */
|
||||
bch2_fs_usage_update(&stats, old, new);
|
||||
bch2_fs_usage_update(&stats, *old, new);
|
||||
|
||||
if (!old.owned_by_allocator && old.cached_sectors)
|
||||
if (!old->owned_by_allocator && old->cached_sectors)
|
||||
trace_invalidate(ca, g - ca->buckets,
|
||||
old.cached_sectors);
|
||||
old->cached_sectors);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
|
||||
{
|
||||
struct bucket_mark new, old;
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
if (new.touched_this_mount ||
|
||||
!is_available_bucket(new))
|
||||
return false;
|
||||
|
||||
new.owned_by_allocator = 1;
|
||||
new.touched_this_mount = 1;
|
||||
}));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
|
||||
@ -333,6 +354,7 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
|
||||
struct bucket_mark old, new;
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
new.touched_this_mount = 1;
|
||||
new.owned_by_allocator = 0;
|
||||
new.data_type = 0;
|
||||
new.cached_sectors = 0;
|
||||
@ -348,7 +370,8 @@ void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
struct bucket_mark new;
|
||||
|
||||
bucket_data_cmpxchg(ca, g, new, ({
|
||||
new.owned_by_allocator = owned_by_allocator;
|
||||
new.touched_this_mount = 1;
|
||||
new.owned_by_allocator = owned_by_allocator;
|
||||
}));
|
||||
}
|
||||
|
||||
@ -376,8 +399,8 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
|
||||
GC_MAX_SECTORS_USED);
|
||||
new.data_type = type;
|
||||
new.had_metadata = 1;
|
||||
new.data_type = type;
|
||||
new.touched_this_mount = 1;
|
||||
}));
|
||||
|
||||
if (old.data_type != type &&
|
||||
@ -458,8 +481,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
if (gc_will_visit) {
|
||||
if (journal_seq)
|
||||
bucket_cmpxchg(g, new, ({
|
||||
new.journal_seq_valid = 1;
|
||||
new.journal_seq = journal_seq;
|
||||
new.touched_this_mount = 1;
|
||||
new.journal_seq_valid = 1;
|
||||
new.journal_seq = journal_seq;
|
||||
}));
|
||||
|
||||
goto out;
|
||||
@ -479,11 +503,6 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
return;
|
||||
}
|
||||
|
||||
EBUG_ON(type != S_CACHED &&
|
||||
!may_make_unavailable &&
|
||||
is_available_bucket(new) &&
|
||||
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
|
||||
|
||||
if (type != S_CACHED &&
|
||||
new.dirty_sectors == GC_MAX_SECTORS_USED &&
|
||||
disk_sectors < 0)
|
||||
@ -508,7 +527,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
new.data_type = data_type;
|
||||
}
|
||||
|
||||
new.had_metadata |= is_meta_bucket(new);
|
||||
new.touched_this_mount = 1;
|
||||
}));
|
||||
|
||||
if (old.data_type != data_type &&
|
||||
|
@ -191,7 +191,9 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
|
||||
|
||||
void bch2_bucket_seq_cleanup(struct bch_fs *);
|
||||
|
||||
void bch2_invalidate_bucket(struct bch_dev *, struct bucket *);
|
||||
bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
|
||||
struct bucket_mark *);
|
||||
bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
|
||||
void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
|
||||
void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
|
||||
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include "util.h"
|
||||
|
||||
/* kill, switch to bch_data_types */
|
||||
enum bucket_data_type {
|
||||
BUCKET_DATA = 0,
|
||||
BUCKET_BTREE,
|
||||
@ -19,23 +20,12 @@ struct bucket_mark {
|
||||
|
||||
struct {
|
||||
u8 gen;
|
||||
|
||||
unsigned gen_valid:1;
|
||||
unsigned journal_seq_valid:1;
|
||||
|
||||
/*
|
||||
* If this bucket had metadata while at the current generation
|
||||
* number, the allocator must increment its gen before we reuse
|
||||
* it:
|
||||
*/
|
||||
unsigned had_metadata:1;
|
||||
|
||||
unsigned owned_by_allocator:1;
|
||||
|
||||
unsigned data_type:3;
|
||||
|
||||
unsigned nouse:1;
|
||||
|
||||
u8 data_type:3,
|
||||
gen_valid:1,
|
||||
owned_by_allocator:1,
|
||||
nouse:1,
|
||||
journal_seq_valid:1,
|
||||
touched_this_mount:1;
|
||||
u16 dirty_sectors;
|
||||
u16 cached_sectors;
|
||||
|
||||
|
@ -412,9 +412,6 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
|
||||
size_ondisk > ca->mi.bucket_size)
|
||||
return "spans multiple buckets";
|
||||
|
||||
if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
|
||||
return "device not marked as containing data";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -547,12 +544,12 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (replicas < c->sb.meta_replicas_have) {
|
||||
if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
|
||||
bch2_bkey_val_to_text(c, btree_node_type(b),
|
||||
buf, sizeof(buf), k);
|
||||
bch2_fs_bug(c,
|
||||
"btree key bad (too few replicas, %u < %u): %s",
|
||||
replicas, c->sb.meta_replicas_have, buf);
|
||||
"btree key bad (replicas not marked in superblock):\n%s",
|
||||
buf);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1755,12 +1752,12 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
|
||||
}
|
||||
|
||||
if (!bkey_extent_is_cached(e.k) &&
|
||||
replicas < c->sb.data_replicas_have) {
|
||||
bch2_bkey_val_to_text(c, btree_node_type(b), buf,
|
||||
sizeof(buf), e.s_c);
|
||||
!bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
|
||||
bch2_bkey_val_to_text(c, btree_node_type(b),
|
||||
buf, sizeof(buf), e.s_c);
|
||||
bch2_fs_bug(c,
|
||||
"extent key bad (too few replicas, %u < %u): %s",
|
||||
replicas, c->sb.data_replicas_have, buf);
|
||||
"extent key bad (replicas not marked in superblock):\n%s",
|
||||
buf);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -531,7 +531,8 @@ static int bch2_write_extent(struct bch_write_op *op,
|
||||
|
||||
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
|
||||
|
||||
bch2_check_mark_super(c, key_to_write, false);
|
||||
bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
|
||||
BCH_DATA_USER);
|
||||
|
||||
bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
|
||||
return ret;
|
||||
|
@ -53,28 +53,6 @@ static inline u64 journal_pin_seq(struct journal *j,
|
||||
return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
|
||||
}
|
||||
|
||||
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
|
||||
struct jset_entry *entry, unsigned type)
|
||||
{
|
||||
while (entry < vstruct_last(jset)) {
|
||||
if (JOURNAL_ENTRY_TYPE(entry) == type)
|
||||
return entry;
|
||||
|
||||
entry = vstruct_next(entry);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define for_each_jset_entry_type(entry, jset, type) \
|
||||
for (entry = (jset)->start; \
|
||||
(entry = __jset_entry_type_next(jset, entry, type)); \
|
||||
entry = vstruct_next(entry))
|
||||
|
||||
#define for_each_jset_key(k, _n, entry, jset) \
|
||||
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
|
||||
vstruct_for_each_safe(entry, k, _n)
|
||||
|
||||
static inline void bch2_journal_add_entry(struct journal_buf *buf,
|
||||
const void *data, size_t u64s,
|
||||
unsigned type, enum btree_id id,
|
||||
@ -123,20 +101,6 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
|
||||
JOURNAL_ENTRY_BTREE_ROOT, id, level);
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_prios(struct journal *j,
|
||||
struct journal_buf *buf)
|
||||
{
|
||||
/*
|
||||
* no prio bucket ptrs yet... XXX should change the allocator so this
|
||||
* can't happen:
|
||||
*/
|
||||
if (!buf->nr_prio_buckets)
|
||||
return;
|
||||
|
||||
bch2_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
|
||||
JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
|
||||
}
|
||||
|
||||
static void journal_seq_blacklist_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
@ -986,7 +950,6 @@ static inline bool journal_has_keys(struct list_head *list)
|
||||
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
{
|
||||
struct journal *j = &c->journal;
|
||||
struct jset_entry *prio_ptrs;
|
||||
struct journal_list jlist;
|
||||
struct journal_replay *i;
|
||||
struct journal_entry_pin_list *p;
|
||||
@ -1094,15 +1057,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
|
||||
bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
|
||||
keys, entries, (u64) atomic64_read(&j->seq));
|
||||
|
||||
i = list_last_entry(list, struct journal_replay, list);
|
||||
prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
|
||||
if (prio_ptrs) {
|
||||
memcpy_u64s(j->prio_buckets,
|
||||
prio_ptrs->_data,
|
||||
le16_to_cpu(prio_ptrs->u64s));
|
||||
j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
|
||||
}
|
||||
fsck_err:
|
||||
return ret;
|
||||
}
|
||||
@ -1189,12 +1143,7 @@ static void __bch2_journal_next_entry(struct journal *j)
|
||||
|
||||
static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
|
||||
{
|
||||
unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
|
||||
|
||||
if (buf->nr_prio_buckets)
|
||||
ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
|
||||
|
||||
return ret;
|
||||
return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
|
||||
}
|
||||
|
||||
static enum {
|
||||
@ -1395,9 +1344,7 @@ static int journal_entry_open(struct journal *j)
|
||||
buf->disk_sectors = sectors;
|
||||
|
||||
sectors = min_t(unsigned, sectors, buf->size >> 9);
|
||||
|
||||
j->cur_buf_sectors = sectors;
|
||||
buf->nr_prio_buckets = j->nr_prio_buckets;
|
||||
|
||||
u64s = (sectors << 9) / sizeof(u64);
|
||||
|
||||
@ -1510,17 +1457,27 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
|
||||
for_each_jset_key(k, _n, entry, &i->j) {
|
||||
struct disk_reservation disk_res;
|
||||
|
||||
/*
|
||||
* We might cause compressed extents to be split, so we
|
||||
* need to pass in a disk_reservation:
|
||||
*/
|
||||
BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
|
||||
if (entry->btree_id == BTREE_ID_ALLOC) {
|
||||
/*
|
||||
* allocation code handles replay for
|
||||
* BTREE_ID_ALLOC keys:
|
||||
*/
|
||||
ret = bch2_alloc_replay_key(c, k->k.p);
|
||||
} else {
|
||||
|
||||
ret = bch2_btree_insert(c, entry->btree_id, k,
|
||||
&disk_res, NULL, NULL,
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_JOURNAL_REPLAY);
|
||||
bch2_disk_reservation_put(c, &disk_res);
|
||||
/*
|
||||
* We might cause compressed extents to be
|
||||
* split, so we need to pass in a
|
||||
* disk_reservation:
|
||||
*/
|
||||
BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
|
||||
|
||||
ret = bch2_btree_insert(c, entry->btree_id, k,
|
||||
&disk_res, NULL, NULL,
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_JOURNAL_REPLAY);
|
||||
bch2_disk_reservation_put(c, &disk_res);
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
bch_err(c, "journal replay: error %d while replaying key",
|
||||
@ -1560,13 +1517,12 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Allocate more journal space at runtime - not currently making use if it, but
|
||||
* the code works:
|
||||
*/
|
||||
static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
||||
unsigned nr)
|
||||
unsigned nr)
|
||||
{
|
||||
struct journal *j = &c->journal;
|
||||
struct journal_device *ja = &ca->journal;
|
||||
@ -1614,8 +1570,8 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
||||
|
||||
while (ja->nr < nr) {
|
||||
/* must happen under journal lock, to avoid racing with gc: */
|
||||
u64 b = bch2_bucket_alloc(ca, RESERVE_NONE);
|
||||
if (!b) {
|
||||
long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
|
||||
if (b < 0) {
|
||||
if (!closure_wait(&c->freelist_wait, &cl)) {
|
||||
spin_unlock(&j->lock);
|
||||
closure_sync(&cl);
|
||||
@ -1651,7 +1607,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
||||
}
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
|
||||
BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
|
||||
|
||||
bch2_write_super(c);
|
||||
|
||||
@ -1663,16 +1619,15 @@ err:
|
||||
kfree(new_buckets);
|
||||
bch2_disk_reservation_put(c, &disk_res);
|
||||
|
||||
if (!ret)
|
||||
bch2_dev_allocator_add(c, ca);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
int bch2_dev_journal_alloc(struct bch_dev *ca)
|
||||
{
|
||||
struct journal_device *ja = &ca->journal;
|
||||
struct bch_sb_field_journal *journal_buckets;
|
||||
unsigned i, nr;
|
||||
u64 b, *p;
|
||||
unsigned nr;
|
||||
|
||||
if (dynamic_fault("bcachefs:add:journal_alloc"))
|
||||
return -ENOMEM;
|
||||
@ -1686,45 +1641,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
|
||||
min(1 << 10,
|
||||
(1 << 20) / ca->mi.bucket_size));
|
||||
|
||||
p = krealloc(ja->bucket_seq, nr * sizeof(u64),
|
||||
GFP_KERNEL|__GFP_ZERO);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
|
||||
ja->bucket_seq = p;
|
||||
|
||||
p = krealloc(ja->buckets, nr * sizeof(u64),
|
||||
GFP_KERNEL|__GFP_ZERO);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
|
||||
ja->buckets = p;
|
||||
|
||||
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
|
||||
nr + sizeof(*journal_buckets) / sizeof(u64));
|
||||
if (!journal_buckets)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0, b = ca->mi.first_bucket;
|
||||
i < nr && b < ca->mi.nbuckets; b++) {
|
||||
if (!is_available_bucket(ca->buckets[b].mark))
|
||||
continue;
|
||||
|
||||
bch2_mark_metadata_bucket(ca, &ca->buckets[b],
|
||||
BUCKET_JOURNAL, true);
|
||||
ja->buckets[i] = b;
|
||||
journal_buckets->buckets[i] = cpu_to_le64(b);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (i < nr)
|
||||
return -ENOSPC;
|
||||
|
||||
BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
|
||||
|
||||
ja->nr = nr;
|
||||
|
||||
return 0;
|
||||
return bch2_set_nr_journal_buckets(ca->fs, ca, nr);
|
||||
}
|
||||
|
||||
/* Journalling */
|
||||
@ -2274,9 +2191,6 @@ static void journal_write(struct closure *cl)
|
||||
jset = w->data;
|
||||
|
||||
j->write_start_time = local_clock();
|
||||
|
||||
bch2_journal_add_prios(j, w);
|
||||
|
||||
mutex_lock(&c->btree_root_lock);
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
struct btree_root *r = &c->btree_roots[i];
|
||||
@ -2324,7 +2238,8 @@ static void journal_write(struct closure *cl)
|
||||
closure_return_with_destructor(cl, journal_write_done);
|
||||
}
|
||||
|
||||
bch2_check_mark_super(c, &j->key, true);
|
||||
bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
|
||||
BCH_DATA_JOURNAL);
|
||||
|
||||
/*
|
||||
* XXX: we really should just disable the entire journal in nochanges
|
||||
@ -2380,7 +2295,7 @@ no_io:
|
||||
|
||||
closure_return_with_destructor(cl, journal_write_done);
|
||||
err:
|
||||
bch2_fatal_error(c);
|
||||
bch2_inconsistent_error(c);
|
||||
closure_return_with_destructor(cl, journal_write_done);
|
||||
}
|
||||
|
||||
|
@ -121,6 +121,28 @@ struct journal_replay {
|
||||
struct jset j;
|
||||
};
|
||||
|
||||
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
|
||||
struct jset_entry *entry, unsigned type)
|
||||
{
|
||||
while (entry < vstruct_last(jset)) {
|
||||
if (JOURNAL_ENTRY_TYPE(entry) == type)
|
||||
return entry;
|
||||
|
||||
entry = vstruct_next(entry);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define for_each_jset_entry_type(entry, jset, type) \
|
||||
for (entry = (jset)->start; \
|
||||
(entry = __jset_entry_type_next(jset, entry, type)); \
|
||||
entry = vstruct_next(entry))
|
||||
|
||||
#define for_each_jset_key(k, _n, entry, jset) \
|
||||
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
|
||||
vstruct_for_each_safe(entry, k, _n)
|
||||
|
||||
#define JOURNAL_PIN (32 * 1024)
|
||||
|
||||
static inline bool journal_pin_active(struct journal_entry_pin *pin)
|
||||
|
@ -20,13 +20,6 @@ struct journal_buf {
|
||||
|
||||
unsigned size;
|
||||
unsigned disk_sectors;
|
||||
|
||||
/*
|
||||
* ugh, prio_buckets are stupid - need to convert them to new
|
||||
* transaction machinery when it arrives
|
||||
*/
|
||||
unsigned nr_prio_buckets;
|
||||
|
||||
/* bloom filter: */
|
||||
unsigned long has_inode[1024 / sizeof(unsigned long)];
|
||||
};
|
||||
@ -189,14 +182,6 @@ struct journal {
|
||||
|
||||
/* protects advancing ja->last_idx: */
|
||||
struct mutex reclaim_lock;
|
||||
|
||||
/*
|
||||
* ugh: need to get prio_buckets converted over to the eventual new
|
||||
* transaction machinery
|
||||
*/
|
||||
__le64 prio_buckets[BCH_SB_MEMBERS_MAX];
|
||||
unsigned nr_prio_buckets;
|
||||
|
||||
unsigned write_delay_ms;
|
||||
unsigned reclaim_delay_ms;
|
||||
|
||||
|
@ -59,16 +59,18 @@ int bch2_move_data_off_device(struct bch_dev *ca)
|
||||
{
|
||||
struct moving_context ctxt;
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct bch_sb_field_members *mi;
|
||||
unsigned pass = 0;
|
||||
u64 seen_key_count;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
|
||||
|
||||
if (!ca->mi.has_data)
|
||||
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
|
||||
|
||||
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
ctxt.avoid = ca;
|
||||
|
||||
@ -124,7 +126,11 @@ int bch2_move_data_off_device(struct bch_dev *ca)
|
||||
BUG_ON(ret);
|
||||
|
||||
seen_key_count++;
|
||||
continue;
|
||||
next:
|
||||
if (bkey_extent_is_data(k.k))
|
||||
bch2_check_mark_super(c, bkey_s_c_to_extent(k),
|
||||
BCH_DATA_USER);
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
|
||||
@ -133,23 +139,20 @@ next:
|
||||
bch2_move_ctxt_exit(&ctxt);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
goto err;
|
||||
} while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
|
||||
|
||||
if (seen_key_count) {
|
||||
pr_err("Unable to migrate all data in %d iterations.",
|
||||
MAX_DATA_OFF_ITER);
|
||||
return -1;
|
||||
ret = -1;
|
||||
goto err;
|
||||
}
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
mi = bch2_sb_get_members(c->disk_sb);
|
||||
SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -245,21 +248,27 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
|
||||
int bch2_move_metadata_off_device(struct bch_dev *ca)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct bch_sb_field_members *mi;
|
||||
unsigned i;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
|
||||
|
||||
if (!ca->mi.has_metadata)
|
||||
if (!(bch2_dev_has_data(c, ca) &
|
||||
((1 << BCH_DATA_JOURNAL)|
|
||||
(1 << BCH_DATA_BTREE))))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c,
|
||||
(1 << BCH_DATA_JOURNAL)|
|
||||
(1 << BCH_DATA_BTREE));
|
||||
|
||||
/* 1st, Move the btree nodes off the device */
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
ret = bch2_move_btree_off(c, ca, i);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* There are no prios/gens to move -- they are already in the device. */
|
||||
@ -268,16 +277,12 @@ int bch2_move_metadata_off_device(struct bch_dev *ca)
|
||||
|
||||
ret = bch2_journal_move(ca);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto err;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
mi = bch2_sb_get_members(c->disk_sb);
|
||||
SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -326,12 +331,16 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
|
||||
*/
|
||||
int bch2_flag_data_bad(struct bch_dev *ca)
|
||||
{
|
||||
int ret = 0;
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct bkey_s_c k;
|
||||
struct bkey_s_c_extent e;
|
||||
struct btree_iter iter;
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS,
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
|
||||
POS_MIN, BTREE_ITER_PREFETCH);
|
||||
|
||||
while ((k = bch2_btree_iter_peek(&iter)).k &&
|
||||
@ -377,10 +386,16 @@ int bch2_flag_data_bad(struct bch_dev *ca)
|
||||
*/
|
||||
continue;
|
||||
advance:
|
||||
if (bkey_extent_is_data(k.k))
|
||||
bch2_check_mark_super(c, bkey_s_c_to_extent(k),
|
||||
BCH_DATA_USER);
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -59,6 +59,8 @@ enum opt_type {
	      s8, OPT_UINT(1, BCH_REPLICAS_MAX))		\
	BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\
	      s8, OPT_UINT(1, BCH_REPLICAS_MAX))		\
	BCH_OPT(degraded, 0444, NO_SB_OPT,			\
	      s8, OPT_BOOL())					\
	BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE,	\
	      s8, OPT_STR(bch2_csum_types))			\
	BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE,	\
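The new degraded option is declared through the same BCH_OPT() x-macro list as the existing options; since it is marked NO_SB_OPT it appears to exist only in struct bch_opts rather than as a persisted superblock field, which is what lets the new bch2_fs_may_start() check c->opts.degraded later in this commit. As a rough illustration of the x-macro pattern itself (simplified names, not the real expansion):

/* Simplified illustration of an x-macro option list of the kind used here;
 * the real BCH_OPT() expansion in bcachefs differs in detail. */
#include <stdbool.h>

#define EXAMPLE_OPTS()			\
	EX_OPT(degraded, bool)		\
	EX_OPT(verbose,  bool)

/* one expansion generates the options struct ... */
#define EX_OPT(_name, _type)	_type _name;
struct example_opts { EXAMPLE_OPTS() };
#undef EX_OPT

/* ... another generates an enum that indexes the same list by id */
#define EX_OPT(_name, _type)	Opt_##_name,
enum example_opt_id { EXAMPLE_OPTS() Opt_nr };
#undef EX_OPT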
@ -267,9 +267,6 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
|
||||
}
|
||||
}
|
||||
|
||||
#define BCH_HASH_SET_MUST_CREATE (1 << 4)
|
||||
#define BCH_HASH_SET_MUST_REPLACE (1 << 5)
|
||||
|
||||
static inline int bch2_hash_set(const struct bch_hash_desc desc,
|
||||
const struct bch_hash_info *info,
|
||||
struct bch_fs *c, u64 inode,
|
||||
|
@ -11,6 +11,9 @@
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sort.h>
|
||||
|
||||
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
|
||||
static const char *bch2_sb_validate_replicas(struct bch_sb *);
|
||||
|
||||
static inline void __bch2_sb_layout_size_assert(void)
|
||||
{
|
||||
BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
|
||||
@ -228,8 +231,8 @@ static int u64_cmp(const void *_l, const void *_r)
|
||||
return l < r ? -1 : l > r ? 1 : 0;
|
||||
}
|
||||
|
||||
const char *bch2_validate_journal_layout(struct bch_sb *sb,
|
||||
struct bch_member_cpu mi)
|
||||
const char *bch2_sb_validate_journal(struct bch_sb *sb,
|
||||
struct bch_member_cpu mi)
|
||||
{
|
||||
struct bch_sb_field_journal *journal;
|
||||
const char *err;
|
||||
@ -291,7 +294,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb)
|
||||
return "Invalid superblock: bad member info";
|
||||
|
||||
for (i = 0; i < sb->nr_devices; i++) {
|
||||
if (bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
|
||||
if (!bch2_dev_exists(sb, mi, i))
|
||||
continue;
|
||||
|
||||
if (le16_to_cpu(mi->members[i].bucket_size) <
|
||||
@ -302,7 +305,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
|
||||
const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
|
||||
{
|
||||
struct bch_sb *sb = disk_sb->sb;
|
||||
struct bch_sb_field *f;
|
||||
@ -347,11 +350,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
|
||||
BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
|
||||
return "Invalid number of metadata replicas";
|
||||
|
||||
if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
|
||||
BCH_SB_META_REPLICAS_HAVE(sb) >
|
||||
BCH_SB_META_REPLICAS_WANT(sb))
|
||||
return "Invalid number of metadata replicas";
|
||||
|
||||
if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
|
||||
BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
|
||||
return "Invalid number of data replicas";
|
||||
@ -360,11 +358,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
|
||||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
|
||||
return "Invalid number of metadata replicas";
|
||||
|
||||
if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
|
||||
BCH_SB_DATA_REPLICAS_HAVE(sb) >
|
||||
BCH_SB_DATA_REPLICAS_WANT(sb))
|
||||
return "Invalid number of data replicas";
|
||||
|
||||
if (!BCH_SB_BTREE_NODE_SIZE(sb))
|
||||
return "Btree node size not set";
|
||||
|
||||
@ -419,7 +412,11 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
|
||||
mi.bucket_size * mi.nbuckets)
|
||||
return "Invalid superblock: device too small";
|
||||
|
||||
err = bch2_validate_journal_layout(sb, mi);
|
||||
err = bch2_sb_validate_journal(sb, mi);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = bch2_sb_validate_replicas(sb);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -464,8 +461,6 @@ static void bch2_sb_update(struct bch_fs *c)
|
||||
c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src);
|
||||
c->sb.nr_devices = src->nr_devices;
|
||||
c->sb.clean = BCH_SB_CLEAN(src);
|
||||
c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
|
||||
c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
|
||||
c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
|
||||
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
|
||||
c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
|
||||
@ -517,6 +512,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
|
||||
unsigned journal_u64s = journal_buckets
|
||||
? le32_to_cpu(journal_buckets->field.u64s)
|
||||
: 0;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&c->sb_lock);
|
||||
|
||||
@ -524,8 +520,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
|
||||
return -ENOMEM;
|
||||
|
||||
__copy_super(c->disk_sb, src);
|
||||
bch2_sb_update(c);
|
||||
|
||||
ret = bch2_sb_replicas_to_cpu_replicas(c);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_sb_update(c);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -743,6 +743,7 @@ void bch2_write_super(struct bch_fs *c)
|
||||
struct closure *cl = &c->sb_write;
|
||||
struct bch_dev *ca;
|
||||
unsigned i, super_idx = 0;
|
||||
const char *err;
|
||||
bool wrote;
|
||||
|
||||
lockdep_assert_held(&c->sb_lock);
|
||||
@ -754,7 +755,16 @@ void bch2_write_super(struct bch_fs *c)
|
||||
for_each_online_member(ca, c, i)
|
||||
bch2_sb_from_fs(c, ca);
|
||||
|
||||
if (c->opts.nochanges)
|
||||
for_each_online_member(ca, c, i) {
|
||||
err = bch2_sb_validate(&ca->disk_sb);
|
||||
if (err) {
|
||||
bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (c->opts.nochanges ||
|
||||
test_bit(BCH_FS_ERROR, &c->flags))
|
||||
goto out;
|
||||
|
||||
do {
|
||||
@ -771,40 +781,482 @@ out:
|
||||
bch2_sb_update(c);
|
||||
}
|
||||
|
||||
void bch2_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k,
				    bool meta)
/* replica information: */

static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
	return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}

#define for_each_replicas_entry(_r, _i)					\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
	     (_i) = replicas_entry_next(_i))

static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
					unsigned *nr,
					unsigned *bytes,
					unsigned *max_dev)
{
	struct bch_replicas_entry *i;
	unsigned j;

	*nr = 0;
	*bytes = sizeof(*r);
	*max_dev = 0;

	if (!r)
		return;

	for_each_replicas_entry(r, i) {
		for (j = 0; j < i->nr; j++)
			*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
		(*nr)++;
	}

	*bytes = (void *) i - (void *) r;
}

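Each on-disk replicas entry is a variable-length record: a data-type byte, a device count, then nr device indices, with the next entry starting immediately afterwards, which is exactly the arithmetic replicas_entry_next() performs. A small stand-alone model of that layout and of the walk (the struct here is a simplified stand-in, not the real struct bch_replicas_entry):

/* Toy model of the variable-length replicas entries walked above; only the
 * layout math is the point, the real on-disk structure differs. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct toy_entry {
	uint8_t data_type;	/* 0 terminates the list */
	uint8_t nr;		/* number of device indices that follow */
	uint8_t devs[];
};

static struct toy_entry *toy_next(struct toy_entry *e)
{
	return (struct toy_entry *)((char *) e +
			offsetof(struct toy_entry, devs) + e->nr);
}

int main(void)
{
	/* two entries: {type 1, devs 0,2} and {type 3, devs 1}, then a terminator */
	uint8_t buf[] = { 1, 2, 0, 2,   3, 1, 1,   0 };
	struct toy_entry *e = (struct toy_entry *) buf;

	while (e->data_type) {
		printf("type %u, %u devices\n", e->data_type, e->nr);
		e = toy_next(e);
	}
	return 0;
}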
static struct bch_replicas_cpu *
|
||||
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
|
||||
{
|
||||
struct bch_replicas_cpu *cpu_r;
|
||||
unsigned i, nr, bytes, max_dev, entry_size;
|
||||
|
||||
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
|
||||
|
||||
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
|
||||
DIV_ROUND_UP(max_dev + 1, 8);
|
||||
|
||||
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
nr * entry_size, GFP_NOIO);
|
||||
if (!cpu_r)
|
||||
return NULL;
|
||||
|
||||
cpu_r->nr = nr;
|
||||
cpu_r->entry_size = entry_size;
|
||||
|
||||
if (nr) {
|
||||
struct bch_replicas_cpu_entry *dst =
|
||||
cpu_replicas_entry(cpu_r, 0);
|
||||
struct bch_replicas_entry *src = sb_r->entries;
|
||||
|
||||
while (dst < cpu_replicas_entry(cpu_r, nr)) {
|
||||
dst->data_type = src->data_type;
|
||||
for (i = 0; i < src->nr; i++)
|
||||
replicas_set_dev(dst, src->devs[i]);
|
||||
|
||||
src = replicas_entry_next(src);
|
||||
dst = (void *) dst + entry_size;
|
||||
}
|
||||
}
|
||||
|
||||
eytzinger0_sort(cpu_r->entries,
|
||||
cpu_r->nr,
|
||||
cpu_r->entry_size,
|
||||
memcmp, NULL);
|
||||
return cpu_r;
|
||||
}
|
||||
|
||||
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_cpu *cpu_r, *old_r;
|
||||
|
||||
lockdep_assert_held(&c->sb_lock);
|
||||
|
||||
sb_r = bch2_sb_get_replicas(c->disk_sb);
|
||||
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
|
||||
if (!cpu_r)
|
||||
return -ENOMEM;
|
||||
|
||||
old_r = c->replicas;
|
||||
rcu_assign_pointer(c->replicas, cpu_r);
|
||||
if (old_r)
|
||||
kfree_rcu(old_r, rcu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* for when gc of replica information is in progress:
|
||||
*/
|
||||
static int bch2_update_gc_replicas(struct bch_fs *c,
|
||||
struct bch_replicas_cpu *gc_r,
|
||||
struct bkey_s_c_extent e,
|
||||
enum bch_data_types data_type)
|
||||
{
|
||||
struct bch_member *mi;
|
||||
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
unsigned nr_replicas = 0;
|
||||
struct bch_replicas_cpu_entry *new_e;
|
||||
struct bch_replicas_cpu *new;
|
||||
unsigned i, nr, entry_size, max_dev = 0;
|
||||
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached)
|
||||
max_dev = max_t(unsigned, max_dev, ptr->dev);
|
||||
|
||||
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
|
||||
DIV_ROUND_UP(max_dev + 1, 8);
|
||||
entry_size = max(entry_size, gc_r->entry_size);
|
||||
nr = gc_r->nr + 1;
|
||||
|
||||
new = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
nr * entry_size, GFP_NOIO);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
new->nr = nr;
|
||||
new->entry_size = entry_size;
|
||||
|
||||
for (i = 0; i < gc_r->nr; i++)
|
||||
memcpy(cpu_replicas_entry(new, i),
|
||||
cpu_replicas_entry(gc_r, i),
|
||||
gc_r->entry_size);
|
||||
|
||||
new_e = cpu_replicas_entry(new, nr - 1);
|
||||
new_e->data_type = data_type;
|
||||
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached)
|
||||
replicas_set_dev(new_e, ptr->dev);
|
||||
|
||||
eytzinger0_sort(new->entries,
|
||||
new->nr,
|
||||
new->entry_size,
|
||||
memcmp, NULL);
|
||||
|
||||
rcu_assign_pointer(c->replicas_gc, new);
|
||||
kfree_rcu(gc_r, rcu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e,
|
||||
enum bch_data_types data_type)
|
||||
{
|
||||
struct bch_replicas_cpu *gc_r;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_entry *new_entry;
|
||||
unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
/* recheck, might have raced */
|
||||
if (bch2_check_super_marked(c, k, meta)) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return;
|
||||
gc_r = rcu_dereference_protected(c->replicas_gc,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
if (gc_r &&
|
||||
!replicas_has_extent(gc_r, e, data_type)) {
|
||||
ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
mi = bch2_sb_get_members(c->disk_sb)->members;
|
||||
/* recheck, might have raced */
|
||||
if (bch2_sb_has_replicas(c, e, data_type)) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
new_entry_bytes = sizeof(struct bch_replicas_entry) +
|
||||
bch2_extent_nr_dirty_ptrs(e.s_c);
|
||||
|
||||
sb_r = bch2_sb_get_replicas(c->disk_sb);
|
||||
|
||||
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
|
||||
|
||||
new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
|
||||
|
||||
sb_r = bch2_fs_sb_resize_replicas(c,
|
||||
DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
|
||||
sizeof(u64)));
|
||||
if (!sb_r) {
|
||||
ret = -ENOSPC;
|
||||
goto err;
|
||||
}
|
||||
|
||||
new_entry = (void *) sb_r + bytes;
|
||||
new_entry->data_type = data_type;
|
||||
new_entry->nr = 0;
|
||||
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached) {
|
||||
(meta
|
||||
? SET_BCH_MEMBER_HAS_METADATA
|
||||
: SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
|
||||
nr_replicas++;
|
||||
}
|
||||
if (!ptr->cached)
|
||||
new_entry->devs[new_entry->nr++] = ptr->dev;
|
||||
|
||||
nr_replicas = min_t(unsigned, nr_replicas,
|
||||
(meta
|
||||
? BCH_SB_META_REPLICAS_HAVE
|
||||
: BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb));
|
||||
(meta
|
||||
? SET_BCH_SB_META_REPLICAS_HAVE
|
||||
: SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas);
|
||||
ret = bch2_sb_replicas_to_cpu_replicas(c);
|
||||
if (ret) {
|
||||
memset(new_entry, 0,
|
||||
vstruct_end(&sb_r->field) - (void *) new_entry);
|
||||
goto err;
|
||||
}
|
||||
|
||||
bch2_write_super(c);
|
||||
err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
|
||||
struct bch_dev *dev_to_offline)
|
||||
{
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
struct bch_replicas_cpu *r;
|
||||
unsigned i, dev, dev_slots, nr_online, nr_offline;
|
||||
struct replicas_status ret;
|
||||
|
||||
memset(&ret, 0, sizeof(ret));
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
|
||||
ret.replicas[i].nr_online = UINT_MAX;
|
||||
|
||||
rcu_read_lock();
|
||||
r = rcu_dereference(c->replicas);
|
||||
dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
|
||||
|
||||
for (i = 0; i < r->nr; i++) {
|
||||
e = cpu_replicas_entry(r, i);
|
||||
|
||||
BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
|
||||
|
||||
nr_online = nr_offline = 0;
|
||||
|
||||
for (dev = 0; dev < dev_slots; dev++) {
|
||||
if (!replicas_test_dev(e, dev))
|
||||
continue;
|
||||
|
||||
if (bch2_dev_is_online(c->devs[dev]) &&
|
||||
c->devs[dev] != dev_to_offline)
|
||||
nr_online++;
|
||||
else
|
||||
nr_offline++;
|
||||
}
|
||||
|
||||
ret.replicas[e->data_type].nr_online =
|
||||
min(ret.replicas[e->data_type].nr_online,
|
||||
nr_online);
|
||||
|
||||
ret.replicas[e->data_type].nr_offline =
|
||||
max(ret.replicas[e->data_type].nr_offline,
|
||||
nr_offline);
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct replicas_status bch2_replicas_status(struct bch_fs *c)
|
||||
{
|
||||
return __bch2_replicas_status(c, NULL);
|
||||
}
|
||||
|
||||
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
	struct replicas_status s = bch2_replicas_status(c);

	return meta
		? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
		      s.replicas[BCH_DATA_BTREE].nr_online)
		: s.replicas[BCH_DATA_USER].nr_online;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned i, ret = 0;

	rcu_read_lock();
	r = rcu_dereference(c->replicas);

	if (ca->dev_idx >= replicas_dev_slots(r))
		goto out;

	for (i = 0; i < r->nr; i++) {
		e = cpu_replicas_entry(r, i);

		if (replicas_test_dev(e, ca->dev_idx)) {
			ret |= 1 << e->data_type;
			break;
		}
	}
out:
	rcu_read_unlock();

	return ret;
}
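bch2_dev_has_data() reduces the in-memory replicas table to a per-device bitmask of data types, which callers elsewhere in this commit test with expressions such as bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER); note that as written the loop breaks on the first entry referencing the device, so at most one type bit is set per call. A minimal sketch of how such a mask is consumed (the enum is a stand-in, not the real BCH_DATA_* values):

/* Illustrative only: consuming a data-type bitmask of the kind returned by
 * bch2_dev_has_data(). */
#include <stdio.h>

enum toy_data_type { TOY_DATA_JOURNAL, TOY_DATA_BTREE, TOY_DATA_USER };

int main(void)
{
	unsigned mask = (1 << TOY_DATA_BTREE) | (1 << TOY_DATA_USER);

	if (mask & ((1 << TOY_DATA_JOURNAL) | (1 << TOY_DATA_BTREE)))
		printf("device still holds metadata\n");
	if (mask & (1 << TOY_DATA_USER))
		printf("device still holds user data\n");
	return 0;
}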
|
||||
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
|
||||
{
|
||||
struct bch_sb_field_members *mi;
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_cpu *cpu_r = NULL;
|
||||
struct bch_replicas_entry *e;
|
||||
const char *err;
|
||||
unsigned i;
|
||||
|
||||
mi = bch2_sb_get_members(sb);
|
||||
sb_r = bch2_sb_get_replicas(sb);
|
||||
if (!sb_r)
|
||||
return NULL;
|
||||
|
||||
for_each_replicas_entry(sb_r, e) {
|
||||
err = "invalid replicas entry: invalid data type";
|
||||
if (e->data_type >= BCH_DATA_NR)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: too many devices";
|
||||
if (e->nr >= BCH_REPLICAS_MAX)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: invalid device";
|
||||
for (i = 0; i < e->nr; i++)
|
||||
if (!bch2_dev_exists(sb, mi, e->devs[i]))
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "cannot allocate memory";
|
||||
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
|
||||
if (!cpu_r)
|
||||
goto err;
|
||||
|
||||
sort_cmp_size(cpu_r->entries,
|
||||
cpu_r->nr,
|
||||
cpu_r->entry_size,
|
||||
memcmp, NULL);
|
||||
|
||||
for (i = 0; i + 1 < cpu_r->nr; i++) {
|
||||
struct bch_replicas_cpu_entry *l =
|
||||
cpu_replicas_entry(cpu_r, i);
|
||||
struct bch_replicas_cpu_entry *r =
|
||||
cpu_replicas_entry(cpu_r, i + 1);
|
||||
|
||||
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
|
||||
|
||||
err = "duplicate replicas entry";
|
||||
if (!memcmp(l, r, cpu_r->entry_size))
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = NULL;
|
||||
err:
|
||||
kfree(cpu_r);
|
||||
return err;
|
||||
}
|
||||
|
||||
int bch2_replicas_gc_end(struct bch_fs *c, int err)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_cpu *r, *old_r;
|
||||
struct bch_replicas_entry *dst_e;
|
||||
size_t i, j, bytes, dev_slots;
|
||||
int ret = 0;
|
||||
|
||||
lockdep_assert_held(&c->replicas_gc_lock);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
r = rcu_dereference_protected(c->replicas_gc,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
|
||||
if (err) {
|
||||
rcu_assign_pointer(c->replicas_gc, NULL);
|
||||
kfree_rcu(r, rcu);
|
||||
goto err;
|
||||
}
|
||||
|
||||
dev_slots = replicas_dev_slots(r);
|
||||
|
||||
bytes = sizeof(struct bch_sb_field_replicas);
|
||||
|
||||
for (i = 0; i < r->nr; i++) {
|
||||
struct bch_replicas_cpu_entry *e =
|
||||
cpu_replicas_entry(r, i);
|
||||
|
||||
bytes += sizeof(struct bch_replicas_entry);
|
||||
for (j = 0; j < r->entry_size - 1; j++)
|
||||
bytes += hweight8(e->devs[j]);
|
||||
}
|
||||
|
||||
sb_r = bch2_fs_sb_resize_replicas(c,
|
||||
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
|
||||
if (!sb_r) {
|
||||
ret = -ENOSPC;
|
||||
goto err;
|
||||
}
|
||||
|
||||
memset(&sb_r->entries, 0,
|
||||
vstruct_end(&sb_r->field) -
|
||||
(void *) &sb_r->entries);
|
||||
|
||||
dst_e = sb_r->entries;
|
||||
for (i = 0; i < r->nr; i++) {
|
||||
struct bch_replicas_cpu_entry *src_e =
|
||||
cpu_replicas_entry(r, i);
|
||||
|
||||
dst_e->data_type = src_e->data_type;
|
||||
|
||||
for (j = 0; j < dev_slots; j++)
|
||||
if (replicas_test_dev(src_e, j))
|
||||
dst_e->devs[dst_e->nr++] = j;
|
||||
|
||||
dst_e = replicas_entry_next(dst_e);
|
||||
}
|
||||
|
||||
old_r = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
rcu_assign_pointer(c->replicas, r);
|
||||
rcu_assign_pointer(c->replicas_gc, NULL);
|
||||
kfree_rcu(old_r, rcu);
|
||||
|
||||
bch2_write_super(c);
|
||||
err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
|
||||
{
|
||||
struct bch_replicas_cpu *r, *src;
|
||||
unsigned i;
|
||||
|
||||
lockdep_assert_held(&c->replicas_gc_lock);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
BUG_ON(c->replicas_gc);
|
||||
|
||||
src = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
|
||||
r = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
src->nr * src->entry_size, GFP_NOIO);
|
||||
if (!r) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
r->entry_size = src->entry_size;
|
||||
r->nr = 0;
|
||||
|
||||
for (i = 0; i < src->nr; i++) {
|
||||
struct bch_replicas_cpu_entry *dst_e =
|
||||
cpu_replicas_entry(r, r->nr);
|
||||
struct bch_replicas_cpu_entry *src_e =
|
||||
cpu_replicas_entry(src, i);
|
||||
|
||||
if (!(src_e->data_type & typemask)) {
|
||||
memcpy(dst_e, src_e, r->entry_size);
|
||||
r->nr++;
|
||||
}
|
||||
}
|
||||
|
||||
eytzinger0_sort(r->entries,
|
||||
r->nr,
|
||||
r->entry_size,
|
||||
memcmp, NULL);
|
||||
|
||||
rcu_assign_pointer(c->replicas_gc, r);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define _BCACHE_SUPER_IO_H
|
||||
|
||||
#include "extents.h"
|
||||
#include "eytzinger.h"
|
||||
#include "super_types.h"
|
||||
|
||||
#include <asm/byteorder.h>
|
||||
@ -40,6 +41,15 @@ bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \
|
||||
BCH_SB_FIELD_TYPE(journal);
|
||||
BCH_SB_FIELD_TYPE(members);
|
||||
BCH_SB_FIELD_TYPE(crypt);
|
||||
BCH_SB_FIELD_TYPE(replicas);
|
||||
|
||||
static inline bool bch2_dev_exists(struct bch_sb *sb,
|
||||
struct bch_sb_field_members *mi,
|
||||
unsigned dev)
|
||||
{
|
||||
return dev < sb->nr_devices &&
|
||||
!bch2_is_zero(mi->members[dev].uuid.b, sizeof(uuid_le));
|
||||
}
|
||||
|
||||
static inline bool bch2_sb_test_feature(struct bch_sb *sb,
|
||||
enum bch_sb_features f)
|
||||
@ -91,8 +101,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
|
||||
.bucket_size = le16_to_cpu(mi->bucket_size),
|
||||
.state = BCH_MEMBER_STATE(mi),
|
||||
.tier = BCH_MEMBER_TIER(mi),
|
||||
.has_metadata = BCH_MEMBER_HAS_METADATA(mi),
|
||||
.has_data = BCH_MEMBER_HAS_DATA(mi),
|
||||
.replacement = BCH_MEMBER_REPLACEMENT(mi),
|
||||
.discard = BCH_MEMBER_DISCARD(mi),
|
||||
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
|
||||
@ -105,55 +113,116 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_free_super(struct bcache_superblock *);
|
||||
int bch2_super_realloc(struct bcache_superblock *, unsigned);
|
||||
|
||||
const char *bch2_validate_journal_layout(struct bch_sb *,
|
||||
const char *bch2_sb_validate_journal(struct bch_sb *,
|
||||
struct bch_member_cpu);
|
||||
const char *bch2_validate_cache_super(struct bcache_superblock *);
|
||||
const char *bch2_sb_validate(struct bcache_superblock *);
|
||||
|
||||
const char *bch2_read_super(struct bcache_superblock *,
|
||||
struct bch_opts, const char *);
|
||||
void bch2_write_super(struct bch_fs *);
|
||||
|
||||
void bch2_check_mark_super_slowpath(struct bch_fs *,
				    const struct bkey_i *, bool);

static inline bool bch2_check_super_marked(struct bch_fs *c,
					   const struct bkey_i *k, bool meta)
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
				     unsigned dev)
{
	return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
				    unsigned dev)
{
	e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
	return (r->entry_size -
		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}

static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
	return (void *) r->entries + r->entry_size * i;
}

int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent,
				   enum bch_data_types);

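In the in-memory table each entry carries a packed per-device bitmap: device N lives at bit (N & 7) of byte (N >> 3), and replicas_dev_slots() converts the entry's byte count back into a number of addressable device slots. A self-contained sketch of the same bit math (the fixed-size buffer is purely for illustration; the real entries are variable-sized):

/* Stand-alone illustration of the devs[] bitmap math used by
 * replicas_test_dev()/replicas_set_dev(); sizes here are arbitrary. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define TOY_DEV_BYTES 4			/* room for 32 device slots */

static void toy_set_dev(uint8_t *devs, unsigned dev)
{
	devs[dev >> 3] |= 1 << (dev & 7);
}

static bool toy_test_dev(const uint8_t *devs, unsigned dev)
{
	return devs[dev >> 3] & (1 << (dev & 7));
}

int main(void)
{
	uint8_t devs[TOY_DEV_BYTES] = { 0 };

	toy_set_dev(devs, 0);
	toy_set_dev(devs, 9);		/* byte 1, bit 1 */

	assert(toy_test_dev(devs, 0));
	assert(toy_test_dev(devs, 9));
	assert(!toy_test_dev(devs, 10));
	return 0;
}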
||||
static inline bool replicas_has_extent(struct bch_replicas_cpu *r,
|
||||
struct bkey_s_c_extent e,
|
||||
enum bch_data_types data_type)
|
||||
{
|
||||
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
unsigned nr_replicas = 0;
|
||||
bool ret = true;
|
||||
struct bch_replicas_cpu_entry search = {
|
||||
.data_type = data_type,
|
||||
};
|
||||
unsigned max_dev = 0;
|
||||
|
||||
extent_for_each_ptr(e, ptr) {
|
||||
struct bch_dev *ca = c->devs[ptr->dev];
|
||||
BUG_ON(!data_type ||
|
||||
data_type == BCH_DATA_SB ||
|
||||
data_type >= BCH_DATA_NR);
|
||||
|
||||
if (ptr->cached)
|
||||
continue;
|
||||
|
||||
if (!(meta
|
||||
? ca->mi.has_metadata
|
||||
: ca->mi.has_data)) {
|
||||
ret = false;
|
||||
break;
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached) {
|
||||
max_dev = max_t(unsigned, max_dev, ptr->dev);
|
||||
replicas_set_dev(&search, ptr->dev);
|
||||
}
|
||||
|
||||
nr_replicas++;
|
||||
}
|
||||
return max_dev < replicas_dev_slots(r) &&
|
||||
eytzinger0_find(r->entries, r->nr,
|
||||
r->entry_size,
|
||||
memcmp, &search) < r->nr;
|
||||
}
|
||||
|
||||
if (nr_replicas <
|
||||
(meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
|
||||
ret = false;
|
||||
static inline bool bch2_sb_has_replicas(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
enum bch_data_types data_type)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
rcu_read_lock();
|
||||
ret = replicas_has_extent(rcu_dereference(c->replicas),
|
||||
e, data_type);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void bch2_check_mark_super(struct bch_fs *c,
|
||||
const struct bkey_i *k, bool meta)
|
||||
static inline int bch2_check_mark_super(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
enum bch_data_types data_type)
|
||||
{
|
||||
if (bch2_check_super_marked(c, k, meta))
|
||||
return;
|
||||
struct bch_replicas_cpu *gc_r;
|
||||
bool marked;
|
||||
|
||||
bch2_check_mark_super_slowpath(c, k, meta);
|
||||
rcu_read_lock();
|
||||
marked = replicas_has_extent(rcu_dereference(c->replicas),
|
||||
e, data_type) &&
|
||||
(!(gc_r = rcu_dereference(c->replicas_gc)) ||
|
||||
replicas_has_extent(gc_r, e, data_type));
|
||||
rcu_read_unlock();
|
||||
|
||||
if (marked)
|
||||
return 0;
|
||||
|
||||
return bch2_check_mark_super_slowpath(c, e, data_type);
|
||||
}
|
||||
|
||||
struct replicas_status {
|
||||
struct {
|
||||
unsigned nr_online;
|
||||
unsigned nr_offline;
|
||||
} replicas[BCH_DATA_NR];
|
||||
};
|
||||
|
||||
struct replicas_status __bch2_replicas_status(struct bch_fs *,
|
||||
struct bch_dev *);
|
||||
struct replicas_status bch2_replicas_status(struct bch_fs *);
|
||||
|
||||
unsigned bch2_replicas_online(struct bch_fs *, bool);
|
||||
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
int bch2_replicas_gc_end(struct bch_fs *, int);
|
||||
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
|
||||
|
||||
#endif /* _BCACHE_SUPER_IO_H */
|
||||
|
@ -224,6 +224,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
|
||||
bch2_dev_allocator_stop(ca);
|
||||
|
||||
bch2_fs_journal_stop(&c->journal);
|
||||
|
||||
for_each_member_device(ca, c, i)
|
||||
bch2_dev_allocator_remove(c, ca);
|
||||
}
|
||||
|
||||
static void bch2_writes_disabled(struct percpu_ref *writes)
|
||||
@ -330,6 +333,10 @@ const char *bch2_fs_read_write(struct bch_fs *c)
|
||||
c->state != BCH_FS_RO)
|
||||
goto out;
|
||||
|
||||
for_each_rw_member(ca, c, i)
|
||||
bch2_dev_allocator_add(c, ca);
|
||||
bch2_recalc_capacity(c);
|
||||
|
||||
err = "error starting allocator thread";
|
||||
for_each_rw_member(ca, c, i)
|
||||
if (bch2_dev_allocator_start(ca)) {
|
||||
@ -484,6 +491,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
|
||||
mutex_init(&c->state_lock);
|
||||
mutex_init(&c->sb_lock);
|
||||
mutex_init(&c->replicas_gc_lock);
|
||||
mutex_init(&c->btree_cache_lock);
|
||||
mutex_init(&c->bucket_lock);
|
||||
mutex_init(&c->btree_root_lock);
|
||||
@ -603,7 +611,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
|
||||
mi = bch2_sb_get_members(c->disk_sb);
|
||||
for (i = 0; i < c->sb.nr_devices; i++)
|
||||
if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
|
||||
if (bch2_dev_exists(c->disk_sb, mi, i) &&
|
||||
bch2_dev_alloc(c, i))
|
||||
goto err;
|
||||
|
||||
@ -681,12 +689,16 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
const char *err = "cannot allocate memory";
|
||||
struct bch_sb_field_members *mi;
|
||||
struct bch_dev *ca;
|
||||
unsigned i, id;
|
||||
time64_t now;
|
||||
LIST_HEAD(journal);
|
||||
struct jset *j;
|
||||
struct closure cl;
|
||||
u64 journal_seq = 0;
|
||||
time64_t now;
|
||||
unsigned i;
|
||||
int ret = -EINVAL;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
BUG_ON(c->state != BCH_FS_STARTING);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
@ -694,6 +706,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
bch2_sb_from_fs(c, ca);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
for_each_rw_member(ca, c, i)
|
||||
bch2_dev_allocator_add(c, ca);
|
||||
bch2_recalc_capacity(c);
|
||||
|
||||
if (BCH_SB_INITIALIZED(c->disk_sb)) {
|
||||
ret = bch2_journal_read(c, &journal);
|
||||
if (ret)
|
||||
@ -704,44 +720,45 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
|
||||
c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
|
||||
|
||||
err = "error reading priorities";
|
||||
for_each_readable_member(ca, c, i) {
|
||||
ret = bch2_prio_read(ca);
|
||||
if (ret) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
unsigned level;
|
||||
struct bkey_i *k;
|
||||
|
||||
err = "bad btree root";
|
||||
k = bch2_journal_find_btree_root(c, j, id, &level);
|
||||
if (!k && id == BTREE_ID_EXTENTS)
|
||||
err = "missing btree root";
|
||||
k = bch2_journal_find_btree_root(c, j, i, &level);
|
||||
if (!k && i < BTREE_ID_ALLOC)
|
||||
goto err;
|
||||
if (!k) {
|
||||
pr_debug("missing btree root: %d", id);
|
||||
|
||||
if (!k)
|
||||
continue;
|
||||
}
|
||||
|
||||
err = "error reading btree root";
|
||||
if (bch2_btree_root_read(c, id, k, level))
|
||||
if (bch2_btree_root_read(c, i, k, level))
|
||||
goto err;
|
||||
}
|
||||
|
||||
bch_verbose(c, "starting mark and sweep:");
|
||||
err = "error reading allocation information";
|
||||
ret = bch2_alloc_read(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch_verbose(c, "starting mark and sweep:");
|
||||
err = "error in recovery";
|
||||
ret = bch2_initial_gc(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "mark and sweep done");
|
||||
|
||||
if (c->opts.noreplay)
|
||||
goto recovery_done;
|
||||
|
||||
bch_verbose(c, "mark and sweep done");
|
||||
err = "cannot allocate new btree root";
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (!c->btree_roots[i].b &&
|
||||
bch2_btree_root_alloc(c, i, &cl))
|
||||
goto err;
|
||||
|
||||
closure_sync(&cl);
|
||||
|
||||
/*
|
||||
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
|
||||
@ -758,12 +775,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
}
|
||||
|
||||
bch_verbose(c, "starting journal replay:");
|
||||
|
||||
err = "journal replay failed";
|
||||
ret = bch2_journal_replay(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch_verbose(c, "journal replay done");
|
||||
|
||||
if (c->opts.norecovery)
|
||||
@ -774,23 +789,21 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
ret = bch2_fsck(c, !c->opts.nofsck);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "fsck done");
|
||||
|
||||
for_each_rw_member(ca, c, i)
|
||||
if (ca->need_prio_write) {
|
||||
ret = bch2_prio_write(ca);
|
||||
if (ca->need_alloc_write) {
|
||||
ret = bch2_alloc_write(c, ca, &journal_seq);
|
||||
if (ret) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
bch_verbose(c, "fsck done");
|
||||
bch2_journal_flush_seq(&c->journal, journal_seq);
|
||||
} else {
|
||||
struct bch_inode_unpacked inode;
|
||||
struct bkey_inode_buf packed_inode;
|
||||
struct closure cl;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
bch_notice(c, "initializing new filesystem");
|
||||
|
||||
@ -805,6 +818,11 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "cannot allocate new btree root";
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (bch2_btree_root_alloc(c, i, &cl))
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* journal_res_get() will crash if called before this has
|
||||
* set up the journal.pin FIFO and journal.cur pointer:
|
||||
@ -819,13 +837,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "cannot allocate new btree root";
|
||||
for (id = 0; id < BTREE_ID_NR; id++)
|
||||
if (bch2_btree_root_alloc(c, id, &cl)) {
|
||||
closure_sync(&cl);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* Wait for new btree roots to be written: */
|
||||
closure_sync(&cl);
|
||||
|
||||
@ -877,6 +888,8 @@ out:
|
||||
bch2_journal_entries_free(&journal);
|
||||
return err;
|
||||
err:
|
||||
closure_sync(&cl);
|
||||
|
||||
switch (ret) {
|
||||
case BCH_FSCK_ERRORS_NOT_FIXED:
|
||||
bch_err(c, "filesystem contains errors: please report this to the developers");
|
||||
@ -940,10 +953,7 @@ static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
|
||||
if (uuid_le_cmp(fs->uuid, sb->uuid))
|
||||
return "device not a member of filesystem";
|
||||
|
||||
if (sb->dev_idx >= newest->nr_devices)
|
||||
return "device has invalid dev_idx";
|
||||
|
||||
if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
|
||||
if (!bch2_dev_exists(newest, mi, sb->dev_idx))
|
||||
return "device has been removed";
|
||||
|
||||
if (fs->block_size != sb->block_size)
|
||||
@ -981,9 +991,6 @@ static void bch2_dev_free(struct bch_dev *ca)
|
||||
free_percpu(ca->sectors_written);
|
||||
bioset_exit(&ca->replica_set);
|
||||
free_percpu(ca->usage_percpu);
|
||||
kvpfree(ca->disk_buckets, bucket_bytes(ca));
|
||||
kfree(ca->prio_buckets);
|
||||
kfree(ca->bio_prio);
|
||||
kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
|
||||
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
|
||||
free_heap(&ca->copygc_heap);
|
||||
@ -1011,7 +1018,7 @@ static void __bch2_dev_offline(struct bch_dev *ca)
|
||||
|
||||
lockdep_assert_held(&c->state_lock);
|
||||
|
||||
__bch2_dev_read_only(ca->fs, ca);
|
||||
__bch2_dev_read_only(c, ca);
|
||||
|
||||
reinit_completion(&ca->offline_complete);
|
||||
percpu_ref_kill(&ca->io_ref);
|
||||
@ -1061,7 +1068,7 @@ static int bch2_dev_sysfs_online(struct bch_dev *ca)
|
||||
return 0;
|
||||
|
||||
if (!ca->kobj.state_in_sysfs) {
|
||||
ret = kobject_add(&ca->kobj, &ca->fs->kobj,
|
||||
ret = kobject_add(&ca->kobj, &c->kobj,
|
||||
"dev-%u", ca->dev_idx);
|
||||
if (ret)
|
||||
return ret;
|
||||
@ -1087,7 +1094,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
|
||||
struct bch_member *member;
|
||||
size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
|
||||
size_t heap_size;
|
||||
unsigned i;
|
||||
unsigned i, btree_node_reserve_buckets;
|
||||
struct bch_dev *ca;
|
||||
|
||||
if (bch2_fs_init_fault("dev_alloc"))
|
||||
@ -1107,8 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
|
||||
ca->dev_idx = dev_idx;
|
||||
|
||||
spin_lock_init(&ca->freelist_lock);
|
||||
spin_lock_init(&ca->prio_buckets_lock);
|
||||
mutex_init(&ca->prio_write_lock);
|
||||
bch2_dev_moving_gc_init(ca);
|
||||
|
||||
INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);
|
||||
@ -1134,12 +1139,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
|
||||
free_inc_reserve = movinggc_reserve / 2;
|
||||
heap_size = movinggc_reserve * 8;
|
||||
|
||||
btree_node_reserve_buckets =
|
||||
DIV_ROUND_UP(BTREE_NODE_RESERVE,
|
||||
ca->mi.bucket_size / c->sb.btree_node_size);
|
||||
|
||||
if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
|
||||
0, GFP_KERNEL) ||
|
||||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
|
||||
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
|
||||
!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
|
||||
!init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
|
||||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
|
||||
GFP_KERNEL) ||
|
||||
!init_fifo(&ca->free[RESERVE_MOVINGGC],
|
||||
movinggc_reserve, GFP_KERNEL) ||
|
||||
!init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
|
||||
@ -1152,18 +1161,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
|
||||
!(ca->buckets = kvpmalloc(ca->mi.nbuckets *
|
||||
sizeof(struct bucket),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
|
||||
2, GFP_KERNEL)) ||
|
||||
!(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
|
||||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
|
||||
!(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
|
||||
bioset_init(&ca->replica_set, 4,
|
||||
offsetof(struct bch_write_bio, bio)) ||
|
||||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
|
||||
goto err;
|
||||
|
||||
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
|
||||
|
||||
total_reserve = ca->free_inc.size;
|
||||
for (i = 0; i < RESERVE_NR; i++)
|
||||
total_reserve += ca->free[i].size;
|
||||
@ -1232,53 +1235,48 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
|
||||
|
||||
lg_local_lock(&c->usage_lock);
|
||||
if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
|
||||
bch2_mark_dev_metadata(ca->fs, ca);
|
||||
bch2_mark_dev_metadata(c, ca);
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
|
||||
struct bch_sb_field_journal *journal_buckets =
|
||||
bch2_sb_get_journal(ca->disk_sb.sb);
|
||||
bool has_journal =
|
||||
bch2_nr_journal_buckets(journal_buckets) >=
|
||||
BCH_JOURNAL_BUCKETS_MIN;
|
||||
|
||||
bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
|
||||
bch2_dev_group_add(&c->all_devs, ca);
|
||||
|
||||
if (has_journal)
|
||||
bch2_dev_group_add(&c->journal.devs, ca);
|
||||
}
|
||||
|
||||
percpu_ref_reinit(&ca->io_ref);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Device management: */
|
||||
|
||||
bool bch2_fs_may_start(struct bch_fs *c, int flags)
|
||||
static bool have_enough_devs(struct bch_fs *c,
|
||||
struct replicas_status s,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bch_sb_field_members *mi;
|
||||
unsigned meta_missing = 0;
|
||||
unsigned data_missing = 0;
|
||||
bool degraded = false;
|
||||
unsigned i;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
mi = bch2_sb_get_members(c->disk_sb);
|
||||
|
||||
for (i = 0; i < c->disk_sb->nr_devices; i++)
|
||||
if (!c->devs[i] &&
|
||||
!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
|
||||
degraded = true;
|
||||
if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
|
||||
meta_missing++;
|
||||
if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
|
||||
data_missing++;
|
||||
}
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
if (degraded &&
|
||||
!(flags & BCH_FORCE_IF_DEGRADED))
|
||||
return false;
|
||||
|
||||
if (meta_missing &&
|
||||
if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
|
||||
s.replicas[BCH_DATA_BTREE].nr_offline) &&
|
||||
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
|
||||
return false;
|
||||
|
||||
if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
|
||||
if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
|
||||
!s.replicas[BCH_DATA_BTREE].nr_online) &&
|
||||
!(flags & BCH_FORCE_IF_METADATA_LOST))
|
||||
return false;
|
||||
|
||||
if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
|
||||
if (s.replicas[BCH_DATA_USER].nr_offline &&
|
||||
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
|
||||
return false;
|
||||
|
||||
if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
|
||||
if (!s.replicas[BCH_DATA_USER].nr_online &&
|
||||
!(flags & BCH_FORCE_IF_DATA_LOST))
|
||||
return false;
|
||||
|
||||
@ -1297,40 +1295,80 @@ bool bch2_fs_may_start(struct bch_fs *c, int flags)
|
||||
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
|
||||
enum bch_member_state new_state, int flags)
|
||||
{
|
||||
struct replicas_status s;
|
||||
struct bch_dev *ca2;
|
||||
int i, nr_rw = 0, required;
|
||||
|
||||
lockdep_assert_held(&c->state_lock);
|
||||
|
||||
if (new_state == BCH_MEMBER_STATE_RW)
|
||||
switch (new_state) {
|
||||
case BCH_MEMBER_STATE_RW:
|
||||
return true;
|
||||
case BCH_MEMBER_STATE_RO:
|
||||
if (ca->mi.state != BCH_MEMBER_STATE_RW)
|
||||
return true;
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
|
||||
return true;
|
||||
/* do we have enough devices to write to? */
|
||||
for_each_member_device(ca2, c, i)
|
||||
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
|
||||
|
||||
/*
|
||||
* If the device is already offline - whatever is going on with it can't
|
||||
* possible make the FS need to go RO:
|
||||
*/
|
||||
if (!bch2_dev_is_online(ca))
|
||||
return true;
|
||||
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
|
||||
? c->opts.metadata_replicas
|
||||
: c->opts.metadata_replicas_required,
|
||||
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
|
||||
? c->opts.data_replicas
|
||||
: c->opts.data_replicas_required);
|
||||
|
||||
if (ca->mi.has_data &&
|
||||
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
|
||||
return false;
|
||||
return nr_rw - 1 <= required;
|
||||
case BCH_MEMBER_STATE_FAILED:
|
||||
case BCH_MEMBER_STATE_SPARE:
|
||||
if (ca->mi.state != BCH_MEMBER_STATE_RW &&
|
||||
ca->mi.state != BCH_MEMBER_STATE_RO)
|
||||
return true;
|
||||
|
||||
if (ca->mi.has_data &&
|
||||
c->sb.data_replicas_have <= 1 &&
|
||||
!(flags & BCH_FORCE_IF_DATA_LOST))
|
||||
return false;
|
||||
/* do we have enough devices to read from? */
|
||||
s = __bch2_replicas_status(c, ca);
|
||||
|
||||
if (ca->mi.has_metadata &&
|
||||
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
|
||||
return false;
|
||||
pr_info("replicas: j %u %u b %u %u d %u %u",
|
||||
s.replicas[BCH_DATA_JOURNAL].nr_online,
|
||||
s.replicas[BCH_DATA_JOURNAL].nr_offline,
|
||||
|
||||
if (ca->mi.has_metadata &&
|
||||
c->sb.meta_replicas_have <= 1 &&
|
||||
!(flags & BCH_FORCE_IF_METADATA_LOST))
|
||||
return false;
|
||||
s.replicas[BCH_DATA_BTREE].nr_online,
|
||||
s.replicas[BCH_DATA_BTREE].nr_offline,
|
||||
|
||||
return true;
|
||||
s.replicas[BCH_DATA_USER].nr_online,
|
||||
s.replicas[BCH_DATA_USER].nr_offline);
|
||||
|
||||
return have_enough_devs(c, s, flags);
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static bool bch2_fs_may_start(struct bch_fs *c, int flags)
|
||||
{
|
||||
struct replicas_status s;
|
||||
struct bch_sb_field_members *mi;
|
||||
unsigned i;
|
||||
|
||||
if (!c->opts.degraded) {
|
||||
mutex_lock(&c->sb_lock);
|
||||
mi = bch2_sb_get_members(c->disk_sb);
|
||||
|
||||
for (i = 0; i < c->disk_sb->nr_devices; i++)
|
||||
if (bch2_dev_exists(c->disk_sb, mi, i) &&
|
||||
!bch2_dev_is_online(c->devs[i]) &&
|
||||
(c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
|
||||
c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return false;
|
||||
}
|
||||
mutex_unlock(&c->sb_lock);
|
||||
}
|
||||
|
||||
s = bch2_replicas_status(c);
|
||||
|
||||
return have_enough_devs(c, s, flags);
|
||||
}
|
||||
|
||||
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
|
||||
@ -1343,8 +1381,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
|
||||
* complete.
|
||||
*/
|
||||
bch2_dev_allocator_stop(ca);
|
||||
|
||||
bch2_dev_group_remove(&c->journal.devs, ca);
|
||||
bch2_dev_allocator_remove(c, ca);
|
||||
}
|
||||
|
||||
static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
|
||||
@ -1353,6 +1390,9 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
|
||||
|
||||
BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
|
||||
|
||||
bch2_dev_allocator_add(c, ca);
|
||||
bch2_recalc_capacity(c);
|
||||
|
||||
if (bch2_dev_allocator_start(ca))
|
||||
return "error starting allocator thread";
|
||||
|
||||
@ -1411,7 +1451,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
|
||||
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
||||
{
|
||||
struct bch_sb_field_members *mi;
|
||||
unsigned dev_idx = ca->dev_idx;
|
||||
unsigned dev_idx = ca->dev_idx, data;
|
||||
int ret = -EINVAL;
|
||||
|
||||
mutex_lock(&c->state_lock);
|
||||
@ -1439,19 +1479,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (ca->mi.has_data || ca->mi.has_metadata) {
|
||||
bch_err(ca, "Remove failed, still has data");
|
||||
data = bch2_dev_has_data(c, ca);
|
||||
if (data) {
|
||||
bch_err(ca, "Remove failed, still has data (%x)", data);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ok, really doing the remove:
|
||||
* Drop device's prio pointer before removing it from superblock:
|
||||
*/
|
||||
spin_lock(&c->journal.lock);
|
||||
c->journal.prio_buckets[dev_idx] = 0;
|
||||
spin_unlock(&c->journal.lock);
|
||||
|
||||
bch2_journal_meta(&c->journal);
|
||||
|
||||
__bch2_dev_offline(ca);
|
||||
@ -1476,6 +1509,7 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Add new device to running filesystem: */
|
||||
int bch2_dev_add(struct bch_fs *c, const char *path)
|
||||
{
|
||||
struct bcache_superblock sb;
|
||||
@ -1490,7 +1524,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
|
||||
if (err)
|
||||
return -EINVAL;
|
||||
|
||||
err = bch2_validate_cache_super(&sb);
|
||||
err = bch2_sb_validate(&sb);
|
||||
if (err)
|
||||
return -EINVAL;
|
||||
|
||||
@ -1514,9 +1548,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
|
||||
|
||||
mi = bch2_sb_get_members(c->disk_sb);
|
||||
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
|
||||
if (dev_idx >= c->sb.nr_devices ||
|
||||
bch2_is_zero(mi->members[dev_idx].uuid.b,
|
||||
sizeof(uuid_le)))
|
||||
if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
|
||||
goto have_slot;
|
||||
no_slot:
|
||||
err = "no slots available in superblock";
|
||||
@ -1587,13 +1619,13 @@ err:
|
||||
return ret ?: -EINVAL;
|
||||
}
|
||||
|
||||
/* Hot add existing device to running filesystem: */
|
||||
int bch2_dev_online(struct bch_fs *c, const char *path)
|
||||
{
|
||||
struct bcache_superblock sb = { 0 };
|
||||
struct bch_dev *ca;
|
||||
unsigned dev_idx;
|
||||
const char *err;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->state_lock);
|
||||
|
||||
@ -1616,12 +1648,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
ca = c->devs[dev_idx];
|
||||
ret = bch2_prio_read(ca);
|
||||
if (ret) {
|
||||
err = "error reading priorities";
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
|
||||
err = __bch2_dev_read_write(c, ca);
|
||||
if (err)
|
||||
@ -1656,6 +1682,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
|
||||
|
||||
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
unsigned data;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->state_lock);
|
||||
@ -1680,8 +1707,9 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ca->mi.has_data || ca->mi.has_metadata) {
|
||||
bch_err(ca, "Migrate error: data still present");
|
||||
data = bch2_dev_has_data(c, ca);
|
||||
if (data) {
|
||||
bch_err(ca, "Migrate error: data still present (%x)", data);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -1714,11 +1742,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
err = "attempting to register backing device";
|
||||
if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
|
||||
goto err;
|
||||
|
||||
err = bch2_validate_cache_super(&sb[i]);
|
||||
err = bch2_sb_validate(&sb[i]);
|
||||
if (err)
|
||||
goto err;
|
||||
}
|
||||
@ -1790,7 +1814,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
|
||||
struct bch_fs *c;
|
||||
bool allocated_fs = false;
|
||||
|
||||
err = bch2_validate_cache_super(sb);
|
||||
err = bch2_sb_validate(sb);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@ -1855,11 +1879,7 @@ const char *bch2_fs_open_incremental(const char *path)
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
|
||||
err = __bch2_fs_open_incremental(&sb, opts);
|
||||
else
|
||||
err = "not a bcachefs superblock";
|
||||
|
||||
err = __bch2_fs_open_incremental(&sb, opts);
|
||||
bch2_free_super(&sb);
|
||||
|
||||
return err;
|
||||
|
@ -337,8 +337,8 @@ SHOW(bch2_fs)
|
||||
|
||||
sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
|
||||
|
||||
sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
|
||||
sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
|
||||
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
|
||||
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
|
||||
|
||||
/* Debugging: */
|
||||
|
||||
@ -693,7 +693,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
|
||||
|
||||
return scnprintf(buf, PAGE_SIZE,
|
||||
"free_inc: %zu/%zu\n"
|
||||
"free[RESERVE_PRIO]: %zu/%zu\n"
|
||||
"free[RESERVE_BTREE]: %zu/%zu\n"
|
||||
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
|
||||
"free[RESERVE_NONE]: %zu/%zu\n"
|
||||
@ -705,7 +704,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
|
||||
"open buckets: %u/%u (reserved %u)\n"
|
||||
"open_buckets_wait: %s\n",
|
||||
fifo_used(&ca->free_inc), ca->free_inc.size,
|
||||
fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
|
||||
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
|
||||
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
|
||||
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
|
||||
@ -759,8 +757,11 @@ SHOW(bch2_dev)
|
||||
sysfs_print(alloc_buckets, stats.buckets_alloc);
|
||||
sysfs_print(available_buckets, dev_buckets_available(ca));
|
||||
sysfs_print(free_buckets, dev_buckets_free(ca));
|
||||
sysfs_print(has_data, ca->mi.has_data);
|
||||
sysfs_print(has_metadata, ca->mi.has_metadata);
|
||||
sysfs_print(has_data, bch2_dev_has_data(c, ca) &
|
||||
(1 << BCH_DATA_USER));
|
||||
sysfs_print(has_metadata, bch2_dev_has_data(c, ca) &
|
||||
((1 << BCH_DATA_JOURNAL)|
|
||||
(1 << BCH_DATA_BTREE)));
|
||||
|
||||
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
|
||||
|
||||
|
@ -533,3 +533,47 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
		}
	}
}

void sort_cmp_size(void *base, size_t num, size_t size,
	  int (*cmp_func)(const void *, const void *, size_t),
	  void (*swap_func)(void *, void *, size_t size))
{
	/* pre-scale counters for performance */
	int i = (num/2 - 1) * size, n = num * size, c, r;

	if (!swap_func) {
		if (size == 4 && alignment_ok(base, 4))
			swap_func = u32_swap;
		else if (size == 8 && alignment_ok(base, 8))
			swap_func = u64_swap;
		else
			swap_func = generic_swap;
	}

	/* heapify */
	for ( ; i >= 0; i -= size) {
		for (r = i; r * 2 + size < n; r = c) {
			c = r * 2 + size;
			if (c < n - size &&
			    cmp_func(base + c, base + c + size, size) < 0)
				c += size;
			if (cmp_func(base + r, base + c, size) >= 0)
				break;
			swap_func(base + r, base + c, size);
		}
	}

	/* sort */
	for (i = n - size; i > 0; i -= size) {
		swap_func(base, base + i, size);
		for (r = 0; r * 2 + size < i; r = c) {
			c = r * 2 + size;
			if (c < i - size &&
			    cmp_func(base + c, base + c + size, size) < 0)
				c += size;
			if (cmp_func(base + r, base + c, size) >= 0)
				break;
			swap_func(base + r, base + c, size);
		}
	}
}

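sort_cmp_size() is a heapsort whose comparator also receives the element size, which is what lets the replicas code sort raw fixed-size entries with plain memcmp. A quick userspace sketch of using a size-aware comparator on fixed-size records (the 16-byte record size and the insertion-sort stand-in are assumptions for the example, not part of the tree):

/* Example of a size-aware comparator over fixed-size records; with the
 * in-tree helper the loop below would simply be
 *   sort_cmp_size(recs, 3, REC_SIZE, cmp_rec, NULL);
 */
#include <stdio.h>
#include <string.h>

#define REC_SIZE 16

static int cmp_rec(const void *l, const void *r, size_t size)
{
	/* byte-wise ordering, the same way the replicas entries are compared */
	return memcmp(l, r, size);
}

int main(void)
{
	char recs[3][REC_SIZE] = { "zebra", "apple", "mango" };

	/* portable stand-in for the heapsort: a simple insertion sort */
	for (int i = 1; i < 3; i++)
		for (int j = i; j > 0 && cmp_rec(recs[j - 1], recs[j], REC_SIZE) > 0; j--) {
			char tmp[REC_SIZE];
			memcpy(tmp, recs[j - 1], REC_SIZE);
			memcpy(recs[j - 1], recs[j], REC_SIZE);
			memcpy(recs[j], tmp, REC_SIZE);
		}

	for (int i = 0; i < 3; i++)
		printf("%s\n", recs[i]);
	return 0;
}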
@ -763,4 +763,8 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio,

size_t bch_scnmemcpy(char *, size_t, const char *, size_t);

void sort_cmp_size(void *base, size_t num, size_t size,
		   int (*cmp_func)(const void *, const void *, size_t),
		   void (*swap_func)(void *, void *, size_t));

#endif /* _BCACHE_UTIL_H */