Update bcachefs sources to 9ceb982d77 bcachefs: Store bucket gens in a btree

Kent Overstreet 2017-05-08 02:28:15 -08:00
parent e57a624feb
commit 63065c0128
35 changed files with 1788 additions and 1164 deletions


@ -1 +1 @@
4231dd5cf0f04dd61b0b8bae44a357da8331c0e2
9ceb982d7790f552e2f5c96bebeab176516cf144


@ -55,12 +55,6 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
bucket_bytes(ca));
}
/* Prios/gens: */
for (i = 0; i < prio_buckets(ca); i++)
range_add(&data,
bucket_bytes(ca) * ca->prio_last_buckets[i],
bucket_bytes(ca));
/* Btree: */
for (i = 0; i < BTREE_ID_NR; i++) {
const struct bch_extent_ptr *ptr;
@ -97,6 +91,7 @@ int cmd_dump(int argc, char *argv[])
opts.nochanges = true;
opts.noreplay = true;
opts.errors = BCH_ON_ERROR_CONTINUE;
opts.degraded = true;
while ((opt = getopt(argc, argv, "o:fh")) != -1)
switch (opt) {
@ -273,6 +268,7 @@ int cmd_list(int argc, char *argv[])
opts.nochanges = true;
opts.norecovery = true;
opts.errors = BCH_ON_ERROR_CONTINUE;
opts.degraded = true;
while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
switch (opt) {


@ -27,6 +27,8 @@ int cmd_fsck(int argc, char *argv[])
const char *err;
int opt;
opts.degraded = true;
while ((opt = getopt(argc, argv, "pynfvh")) != -1)
switch (opt) {
case 'p':


@ -333,7 +333,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
die("error reserving space in new filesystem: %s",
strerror(-ret));
bch2_check_mark_super(c, &e->k_i, false);
bch2_check_mark_super(c, extent_i_to_s_c(e), false);
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
&res, NULL, NULL, 0);


@ -112,6 +112,11 @@ static inline unsigned long hweight_long(unsigned long w)
return __builtin_popcountl(w);
}
static inline unsigned long hweight8(unsigned long w)
{
return __builtin_popcountl(w);
}
/**
* rol64 - rotate a 64-bit value left
* @word: value to rotate


@ -176,10 +176,8 @@ struct bch_sb *bch2_format(struct format_opts opts,
SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
@ -339,9 +337,9 @@ void bch2_super_print(struct bch_sb *sb, int units)
BCH_SB_CLEAN(sb),
BCH_SB_META_REPLICAS_HAVE(sb),
0LLU, //BCH_SB_META_REPLICAS_HAVE(sb),
BCH_SB_META_REPLICAS_WANT(sb),
BCH_SB_DATA_REPLICAS_HAVE(sb),
0LLU, //BCH_SB_DATA_REPLICAS_HAVE(sb),
BCH_SB_DATA_REPLICAS_WANT(sb),
BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR
@ -405,8 +403,8 @@ void bch2_super_print(struct bch_sb *sb, int units)
: "unknown",
BCH_MEMBER_TIER(m),
BCH_MEMBER_HAS_METADATA(m),
BCH_MEMBER_HAS_DATA(m),
0LLU, //BCH_MEMBER_HAS_METADATA(m),
0LLU, //BCH_MEMBER_HAS_DATA(m),
BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]

File diff suppressed because it is too large


@ -10,24 +10,14 @@ struct bch_dev;
struct bch_fs;
struct dev_group;
static inline size_t prios_per_bucket(const struct bch_dev *ca)
{
return (bucket_bytes(ca) - sizeof(struct prio_set)) /
sizeof(struct bucket_disk);
}
static inline size_t prio_buckets(const struct bch_dev *ca)
{
return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
}
void bch2_dev_group_remove(struct dev_group *, struct bch_dev *);
void bch2_dev_group_add(struct dev_group *, struct bch_dev *);
int bch2_prio_read(struct bch_dev *);
int bch2_prio_write(struct bch_dev *);
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
size_t bch2_bucket_alloc(struct bch_dev *, enum alloc_reserve);
long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
@ -80,8 +70,15 @@ static inline struct bch_dev *dev_group_next(struct dev_group *devs,
(_ptr)++)
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
#endif /* _BCACHE_ALLOC_H */
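The allocator entry point changes shape here: bch2_bucket_alloc() now takes the filesystem as well as the device, and returns a long with negative values signalling failure (the journal code later in this diff checks b < 0). A minimal usage sketch, not taken from this commit:

	long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
	if (b < 0) {
		/* no bucket available from this reserve: wait on
		 * c->freelist_wait for the allocator thread, or give up */
	} else {
		/* b is a bucket index on ca */
	}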


@ -35,20 +35,13 @@ struct prio_clock {
/* There is one reserve for each type of btree, one for prios and gens
* and one for moving GC */
enum alloc_reserve {
RESERVE_PRIO,
RESERVE_BTREE,
RESERVE_METADATA_LAST = RESERVE_BTREE,
RESERVE_MOVINGGC,
RESERVE_NONE,
RESERVE_NR,
RESERVE_ALLOC = -1,
RESERVE_BTREE = 0,
RESERVE_MOVINGGC = 1,
RESERVE_NONE = 2,
RESERVE_NR = 3,
};
static inline bool allocation_is_metadata(enum alloc_reserve id)
{
return id <= RESERVE_METADATA_LAST;
}
struct dev_group {
spinlock_t lock;
unsigned nr;


@ -305,7 +305,7 @@ do { \
(btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
struct btree;
struct crypto_blkcipher;
@ -329,13 +329,23 @@ struct bch_member_cpu {
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
u8 has_metadata;
u8 has_data;
u8 replacement;
u8 discard;
u8 valid;
};
struct bch_replicas_cpu_entry {
u8 data_type;
u8 devs[BCH_SB_MEMBERS_MAX / 8];
};
struct bch_replicas_cpu {
struct rcu_head rcu;
unsigned nr;
unsigned entry_size;
struct bch_replicas_cpu_entry entries[];
};
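The devs[] array in each cpu-side entry is a bitmap of device indices; entry_size reserves DIV_ROUND_UP(max_dev + 1, 8) bytes of bitmap, as the superblock code later in this diff shows. A plausible sketch of the replicas_test_dev()/replicas_set_dev() helpers that code calls (the real definitions are not shown on this page):

	static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
					     unsigned dev)
	{
		return e->devs[dev >> 3] & (1 << (dev & 7));
	}

	static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
					    unsigned dev)
	{
		e->devs[dev >> 3] |= 1 << (dev & 7);
	}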
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
@ -363,21 +373,7 @@ struct bch_dev {
struct task_struct *alloc_thread;
struct prio_set *disk_buckets;
/*
* When allocating new buckets, prio_write() gets first dibs - since we
* may not be able to allocate at all without writing priorities and gens.
* prio_last_buckets[] contains the last buckets we wrote priorities to
* (so gc can mark them as metadata).
*/
u64 *prio_buckets;
u64 *prio_last_buckets;
spinlock_t prio_buckets_lock;
struct bio *bio_prio;
bool prio_read_done;
bool need_prio_write;
struct mutex prio_write_lock;
bool need_alloc_write;
/*
* free: Buckets that are ready to be used
@ -391,6 +387,7 @@ struct bch_dev {
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
spinlock_t freelist_lock;
bool alloc_thread_started;
size_t fifo_last_bucket;
@ -415,6 +412,8 @@ struct bch_dev {
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
u64 allocator_journal_seq_flush;
bool allocator_invalidating_data;
alloc_heap alloc_heap;
bucket_heap copygc_heap;
@ -458,6 +457,7 @@ enum {
BCH_FS_FSCK_FIXED_ERRORS,
BCH_FS_FSCK_DONE,
BCH_FS_FIXED_GENS,
BCH_FS_REBUILD_REPLICAS,
};
struct btree_debug {
@ -507,6 +507,10 @@ struct bch_fs {
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
struct bch_replicas_cpu __rcu *replicas;
struct bch_replicas_cpu __rcu *replicas_gc;
struct mutex replicas_gc_lock;
struct bch_opts opts;
/* Updated by bch2_sb_update():*/
@ -520,9 +524,6 @@ struct bch_fs {
u8 nr_devices;
u8 clean;
u8 meta_replicas_have;
u8 data_replicas_have;
u8 str_hash_type;
u8 encryption_type;


@ -2,7 +2,7 @@
#define _BCACHEFS_FORMAT_H
/*
* Bcache on disk data structures
* bcachefs on disk data structures
*/
#include <asm/types.h>
@ -714,6 +714,25 @@ struct bch_xattr {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(xattr, BCH_XATTR);
/* Bucket/allocation information: */
enum {
BCH_ALLOC = 128,
};
enum {
BCH_ALLOC_FIELD_READ_TIME = 0,
BCH_ALLOC_FIELD_WRITE_TIME = 1,
};
struct bch_alloc {
struct bch_val v;
__u8 fields;
__u8 gen;
__u8 data[];
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(alloc, BCH_ALLOC);
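bch_alloc is a variable-length value: fields is a bitmask of which optional fields (currently read time and write time) follow gen in data[]. A rough decode sketch; the per-field width is an assumption here, since the field definitions live in alloc.c, which this page does not show:

	/* hypothetical helper, assuming each present field occupies 2 bytes */
	static const __u8 *bch2_alloc_field(const struct bch_alloc *a, unsigned field)
	{
		const __u8 *p = a->data;
		unsigned i;

		for (i = 0; i < field; i++)
			if (a->fields & (1 << i))
				p += 2;

		return (a->fields & (1 << field)) ? p : NULL;
	}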
/* Superblock */
/* Version 0: Cache device
@ -752,8 +771,7 @@ struct bch_member {
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8)
LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9)
LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10)
/* 8-10 unused, was HAS_(META)DATA */
LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15);
@ -800,7 +818,8 @@ enum bch_sb_field_type {
BCH_SB_FIELD_journal = 0,
BCH_SB_FIELD_members = 1,
BCH_SB_FIELD_crypt = 2,
BCH_SB_FIELD_NR = 3,
BCH_SB_FIELD_replicas = 3,
BCH_SB_FIELD_NR = 4,
};
struct bch_sb_field_journal {
@ -861,8 +880,24 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
struct bch_sb_field_replication {
enum bch_data_types {
BCH_DATA_NONE = 0,
BCH_DATA_SB = 1,
BCH_DATA_JOURNAL = 2,
BCH_DATA_BTREE = 3,
BCH_DATA_USER = 4,
BCH_DATA_NR = 5,
};
struct bch_replicas_entry {
u8 data_type;
u8 nr;
u8 devs[0];
};
struct bch_sb_field_replicas {
struct bch_sb_field field;
struct bch_replicas_entry entries[0];
};
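Each on-disk replicas entry is just a data type, a count, and that many device indices packed back to back. For example, an entry recording user data replicated on devices 0 and 2 occupies four bytes:

	/* data_type = BCH_DATA_USER (4), nr = 2, devs[] = { 0, 2 }
	 * raw bytes: 0x04 0x02 0x00 0x02 - the next entry starts
	 * immediately afterwards (see replicas_entry_next() later in
	 * this diff) */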
/*
@ -937,8 +972,7 @@ LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64);
/* 56-64 unused, was REPLICAS_HAVE */
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8);
@ -946,6 +980,7 @@ LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
/* 14-20 unused, was JOURNAL_ENTRY_SIZE */
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
@ -1003,77 +1038,6 @@ enum bch_compression_opts {
BCH_COMPRESSION_NR = 3,
};
/* backing device specific stuff: */
struct backingdev_sb {
__le64 csum;
__le64 offset; /* sector where this sb was written */
__le64 version; /* of on disk format */
uuid_le magic; /* bcachefs superblock UUID */
uuid_le disk_uuid;
/*
* Internal cache set UUID - xored with various magic numbers and thus
* must never change:
*/
union {
uuid_le set_uuid;
__le64 set_magic;
};
__u8 label[BCH_SB_LABEL_SIZE];
__le64 flags;
/* Incremented each time superblock is written: */
__le64 seq;
/*
* User visible UUID for identifying the cache set the user is allowed
* to change:
*
* XXX hooked up?
*/
uuid_le user_uuid;
__le64 pad1[6];
__le64 data_offset;
__le16 block_size; /* sectors */
__le16 pad2[3];
__le32 last_mount; /* time_t */
__le16 pad3;
/* size of variable length portion - always 0 for backingdev superblock */
__le16 u64s;
__u64 _data[0];
};
LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH 0U
#define CACHE_MODE_WRITEBACK 1U
#define CACHE_MODE_WRITEAROUND 2U
#define CACHE_MODE_NONE 3U
LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63);
#define BDEV_STATE_NONE 0U
#define BDEV_STATE_CLEAN 1U
#define BDEV_STATE_DIRTY 2U
#define BDEV_STATE_STALE 3U
#define BDEV_DATA_START_DEFAULT 16 /* sectors */
static inline _Bool __SB_IS_BDEV(__u64 version)
{
return version == BCACHE_SB_VERSION_BDEV
|| version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
}
static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
{
return __SB_IS_BDEV(sb->version);
}
/*
* Magic numbers
*
@ -1088,7 +1052,6 @@ static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
#define BCACHE_STATFS_MAGIC 0xca451a4e
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
@ -1103,11 +1066,6 @@ static inline __u64 __jset_magic(struct bch_sb *sb)
return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
}
static inline __u64 __pset_magic(struct bch_sb *sb)
{
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
}
static inline __u64 __bset_magic(struct bch_sb *sb)
{
return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
@ -1136,9 +1094,9 @@ struct jset_entry {
LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8);
enum {
JOURNAL_ENTRY_BTREE_KEYS = 0,
JOURNAL_ENTRY_BTREE_ROOT = 1,
JOURNAL_ENTRY_PRIO_PTRS = 2,
JOURNAL_ENTRY_BTREE_KEYS = 0,
JOURNAL_ENTRY_BTREE_ROOT = 1,
JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */
/*
* Journal sequence numbers can be blacklisted: bsets record the max
@ -1150,7 +1108,7 @@ enum {
* and then record that we skipped it so that the next time we crash and
* recover we don't think there was a missing journal entry.
*/
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
};
/*
@ -1193,35 +1151,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
#define BCH_JOURNAL_BUCKETS_MIN 20
/* Bucket prios/gens */
struct prio_set {
struct bch_csum csum;
__le64 magic;
__le32 nonce[3];
__le16 version;
__le16 flags;
__u8 encrypted_start[0];
__le64 next_bucket;
struct bucket_disk {
__le16 prio[2];
__u8 gen;
} __attribute__((packed)) data[];
} __attribute__((packed, aligned(8)));
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
/* Btree: */
#define DEFINE_BCH_BTREE_IDS() \
DEF_BTREE_ID(EXTENTS, 0, "extents") \
DEF_BTREE_ID(INODES, 1, "inodes") \
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
DEF_BTREE_ID(XATTRS, 3, "xattrs")
DEF_BTREE_ID(EXTENTS, 0, "extents") \
DEF_BTREE_ID(INODES, 1, "inodes") \
DEF_BTREE_ID(DIRENTS, 2, "dirents") \
DEF_BTREE_ID(XATTRS, 3, "xattrs") \
DEF_BTREE_ID(ALLOC, 4, "alloc")
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
@ -1318,4 +1255,33 @@ struct btree_node_entry {
};
} __attribute__((packed, aligned(8)));
/* Obsolete: */
struct prio_set {
struct bch_csum csum;
__le64 magic;
__le32 nonce[3];
__le16 version;
__le16 flags;
__u8 encrypted_start[0];
__le64 next_bucket;
struct bucket_disk {
__le16 prio[2];
__u8 gen;
} __attribute__((packed)) data[];
} __attribute__((packed, aligned(8)));
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
static inline __u64 __pset_magic(struct bch_sb *sb)
{
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
}
#endif /* _BCACHEFS_FORMAT_H */


@ -1,13 +1,9 @@
#ifndef _LINUX_BCACHE_IOCTL_H
#define _LINUX_BCACHE_IOCTL_H
#ifndef _BCACHEFS_IOCTL_H
#define _BCACHEFS_IOCTL_H
#include <linux/uuid.h>
#include "bcachefs_format.h"
#ifdef __cplusplus
extern "C" {
#endif
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
@ -97,8 +93,4 @@ struct bch_ioctl_data {
__u64 end_offset;
};
#ifdef __cplusplus
}
#endif
#endif /* _LINUX_BCACHE_IOCTL_H */
#endif /* _BCACHEFS_IOCTL_H */


@ -580,6 +580,8 @@ BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
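With these accessors and the new BTREE_ID_ALLOC tree, walking the persisted bucket gens follows the same iterator pattern used elsewhere in this diff. A sketch, assuming alloc keys are positioned at (device index, bucket number) - an assumption, since the key layout is established in alloc.c (error handling omitted):

	struct btree_iter iter;
	struct bkey_s_c k;

	bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
			     BTREE_ITER_PREFETCH);

	while ((k = bch2_btree_iter_peek(&iter)).k) {
		if (k.k->type == BCH_ALLOC) {
			struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

			/* under the assumption above, a.v->gen is the gen
			 * of bucket k.k->p.offset on device k.k->p.inode */
		}
		bch2_btree_iter_advance_pos(&iter);
	}
	bch2_btree_iter_unlock(&iter);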
/* byte order helpers */
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)


@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
@ -13,6 +14,7 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_INODES] = &bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};


@ -129,6 +129,8 @@ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
enum bch_data_types data_type = type == BKEY_TYPE_BTREE
? BCH_DATA_BTREE : BCH_DATA_USER;
int ret = 0;
switch (k.k->type) {
@ -137,6 +139,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
"superblock not marked as containing replicas"))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
}
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bucket *g = PTR_BUCKET(ca, ptr);
@ -147,7 +158,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
new.gen = ptr->gen;
new.gen_valid = 1;
}));
ca->need_prio_write = true;
ca->need_alloc_write = true;
}
if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
@ -159,7 +170,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
new.gen = ptr->gen;
new.gen_valid = 1;
}));
ca->need_prio_write = true;
ca->need_alloc_write = true;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
@ -168,6 +179,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
}
}
atomic64_set(&c->key_version,
max_t(u64, k.k->version.lo,
atomic64_read(&c->key_version)));
@ -348,17 +360,6 @@ void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
}
spin_unlock(&c->journal.lock);
spin_lock(&ca->prio_buckets_lock);
for (i = 0; i < prio_buckets(ca) * 2; i++) {
b = ca->prio_buckets[i];
if (b)
bch2_mark_metadata_bucket(ca, ca->buckets + b,
BUCKET_PRIOS, true);
}
spin_unlock(&ca->prio_buckets_lock);
}
static void bch2_mark_metadata(struct bch_fs *c)
@ -474,10 +475,6 @@ void bch2_gc(struct bch_fs *c)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
trace_gc_start(c);
/*
@ -487,6 +484,8 @@ void bch2_gc(struct bch_fs *c)
bch2_recalc_sectors_available(c);
down_write(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
goto out;
bch2_gc_start(c);
@ -502,8 +501,7 @@ void bch2_gc(struct bch_fs *c)
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
up_write(&c->gc_lock);
return;
goto out;
}
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
@ -518,7 +516,7 @@ void bch2_gc(struct bch_fs *c)
/* Indicates that gc is no longer in progress: */
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
c->gc_count++;
out:
up_write(&c->gc_lock);
trace_gc_end(c);
bch2_time_stats_update(&c->btree_gc_time, start_time);
@ -529,6 +527,12 @@ void bch2_gc(struct bch_fs *c)
*/
for_each_member_device(ca, c, i)
bch2_wake_allocator(ca);
/*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
}
/* Btree coalescing */
@ -997,6 +1001,14 @@ int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
unsigned iter = 0;
enum btree_id id;
int ret;
mutex_lock(&c->sb_lock);
if (!bch2_sb_get_replicas(c->disk_sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb))
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
mutex_unlock(&c->sb_lock);
again:
bch2_gc_start(c);
@ -1006,11 +1018,9 @@ again:
return ret;
}
if (journal) {
ret = bch2_journal_mark(c, journal);
if (ret)
return ret;
}
ret = bch2_journal_mark(c, journal);
if (ret)
return ret;
bch2_mark_metadata(c);


@ -1402,7 +1402,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
if (ret)
bch2_fatal_error(c);
bch2_inconsistent_error(c);
return ret;
}


@ -233,17 +233,29 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
bool use_reserve,
struct disk_reservation *res,
struct closure *cl)
struct disk_reservation *res,
struct closure *cl,
unsigned flags)
{
BKEY_PADDED(k) tmp;
struct open_bucket *ob;
struct btree *b;
unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
nr_reserve = 0;
alloc_reserve = RESERVE_ALLOC;
} else if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = BTREE_NODE_RESERVE / 2;
alloc_reserve = RESERVE_BTREE;
} else {
nr_reserve = BTREE_NODE_RESERVE;
alloc_reserve = RESERVE_NONE;
}
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > reserve) {
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
@ -263,8 +275,7 @@ retry:
bkey_i_to_extent(&tmp.k),
res->nr_replicas,
c->opts.metadata_replicas_required,
use_reserve ? RESERVE_BTREE : RESERVE_NONE,
cl);
alloc_reserve, cl);
if (IS_ERR(ob))
return ERR_CAST(ob);
@ -311,7 +322,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
bch2_btree_build_aux_trees(b);
bch2_check_mark_super(c, &b->key, true);
bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE);
trace_btree_node_alloc(c, b);
return b;
@ -533,9 +544,6 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
if (flags & BTREE_INSERT_NOFAIL)
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
if (flags & BTREE_INSERT_NOWAIT)
cl = NULL;
/*
* This check isn't necessary for correctness - it's just to potentially
* prevent us from doing a lot of work that'll end up being wasted:
@ -565,8 +573,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
reserve->nr = 0;
while (reserve->nr < nr_nodes) {
b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
&disk_res, cl);
b = __bch2_btree_node_alloc(c, &disk_res,
flags & BTREE_INSERT_NOWAIT
? NULL : cl, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err_free;
@ -793,8 +802,8 @@ void bch2_btree_journal_key(struct btree_insert *trans,
struct btree_write *w = btree_current_write(b);
EBUG_ON(iter->level || b->level);
EBUG_ON(!trans->journal_res.ref &&
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (!journal_pin_active(&w->journal))
bch2_journal_pin_add(j, &trans->journal_res,
@ -1026,6 +1035,27 @@ retry:
*/
six_unlock_read(&b->lock);
mutex_unlock(&c->btree_interior_update_lock);
/*
* Bit of funny circularity going on here we have to break:
*
* We have to drop our journal pin before writing the journal
* entry that points to the new btree root: else, we could
* deadlock if the journal currently happens to be full.
*
* This means we're dropping the journal pin _before_ the new
* nodes are technically reachable - but this is safe, because
* after the bch2_btree_set_root_ondisk() call above they will
* be reachable as of the very next journal write:
*/
bch2_journal_pin_drop(&c->journal, &as->journal);
/*
* And, do a journal write to write the pointer to the new root,
* then wait for it to complete before freeing the nodes we
* replaced:
*/
bch2_journal_meta_async(&c->journal, cl);
break;
}
@ -1051,19 +1081,70 @@ static void btree_interior_update_updated_btree(struct bch_fs *c,
mutex_unlock(&c->btree_interior_update_lock);
/*
* In general, when you're staging things in a journal that will later
* be written elsewhere, and you also want to guarantee ordering: that
* is, if you have updates a, b, c, after a crash you should never see c
* and not a or b - there's a problem:
*
* If the final destination of the update(s) (i.e. btree node) can be
* written/flushed _before_ the relevant journal entry - oops, that
* breaks ordering, since the various leaf nodes can be written in any
* order.
*
* Normally we use bset->journal_seq to deal with this - if during
* recovery we find a btree node write that's newer than the newest
* journal entry, we just ignore it - we don't need it, anything we're
* supposed to have (that we reported as completed via fsync()) will
* still be in the journal, and as far as the state of the journal is
* concerned that btree node write never happened.
*
* That breaks when we're rewriting/splitting/merging nodes, since we're
* mixing btree node writes that haven't happened yet with previously
* written data that has been reported as completed to the journal.
*
* Thus, before making the new nodes reachable, we have to wait for the
* newest journal sequence number we have data for to be written (if it
* hasn't been yet).
*/
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
continue_at(&as->cl, btree_interior_update_nodes_written,
system_freezable_wq);
}
static void btree_interior_update_reparent(struct btree_interior_update *as,
static void interior_update_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct btree_interior_update *as =
container_of(pin, struct btree_interior_update, journal);
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
}
static void btree_interior_update_reparent(struct bch_fs *c,
struct btree_interior_update *as,
struct btree_interior_update *child)
{
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
child->parent_as = as;
closure_get(&as->cl);
/*
* When we write a new btree root, we have to drop our journal pin
* _before_ the new nodes are technically reachable; see
* btree_interior_update_nodes_written().
*
* This goes for journal pins that are recursively blocked on us - so,
* just transfer the journal pin to the new interior update so
* btree_interior_update_nodes_written() can drop it.
*/
bch2_journal_pin_add_if_older(&c->journal, &child->journal,
&as->journal, interior_update_flush);
bch2_journal_pin_drop(&c->journal, &child->journal);
as->journal_seq = max(as->journal_seq, child->journal_seq);
}
static void btree_interior_update_updated_root(struct bch_fs *c,
@ -1081,7 +1162,7 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
* btree_interior_update operation to point to us:
*/
if (r->as)
btree_interior_update_reparent(as, r->as);
btree_interior_update_reparent(c, as, r->as);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->b = r->b;
@ -1089,19 +1170,21 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
mutex_unlock(&c->btree_interior_update_lock);
/*
* When we're rewriting nodes and updating interior nodes, there's an
* issue with updates that haven't been written in the journal getting
* mixed together with older data - see btree_interior_update_updated_btree()
* for the explanation.
*
* However, this doesn't affect us when we're writing a new btree root -
* because to make that new root reachable we have to write out a new
* journal entry, which must necessarily be newer than as->journal_seq.
*/
continue_at(&as->cl, btree_interior_update_nodes_written,
system_freezable_wq);
}
static void interior_update_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct btree_interior_update *as =
container_of(pin, struct btree_interior_update, journal);
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
}
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_interior_updates - redirect @b's
@ -1150,7 +1233,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
list_del(&p->write_blocked_list);
btree_interior_update_reparent(as, p);
btree_interior_update_reparent(c, as, p);
}
clear_btree_node_dirty(b);


@ -373,16 +373,20 @@ int __bch2_btree_insert_at(struct btree_insert *);
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << 2)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
/*
* Insert is for journal replay: don't get journal reservations, or mark extents
* (bch_mark_key)
*/
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4)
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT (1 << 4)
#define BTREE_INSERT_GC_LOCK_HELD (1 << 5)
#define BTREE_INSERT_NOWAIT (1 << 5)
#define BTREE_INSERT_GC_LOCK_HELD (1 << 6)
#define BCH_HASH_SET_MUST_CREATE (1 << 7)
#define BCH_HASH_SET_MUST_REPLACE (1 << 8)
int bch2_btree_delete_at(struct btree_iter *, unsigned);


@ -306,14 +306,18 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
_old; \
})
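The bucket_cmpxchg()/bucket_data_cmpxchg() macros, whose tail is visible above, evaluate a caller-supplied expression that edits new and retry a compare-and-swap until it lands, returning the old mark. A rough sketch of the shape, assuming struct bucket_mark is unioned with a u64 counter stored in the bucket as _mark (both field names are assumptions; neither the union nor the macro body is shown on this page):

	#define bucket_cmpxchg(g, new, expr)				\
	({								\
		u64 _v = READ_ONCE((g)->_mark.counter);			\
		struct bucket_mark _old;				\
									\
		do {							\
			(new).counter = _old.counter = _v;		\
			expr;						\
		} while ((_v = cmpxchg(&(g)->_mark.counter,		\
				       _old.counter,			\
				       (new).counter)) != _old.counter);\
		_old;							\
	})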
void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
struct bucket_mark *old)
{
struct bch_fs_usage stats = { 0 };
struct bucket_mark old, new;
struct bucket_mark new;
*old = bucket_data_cmpxchg(ca, g, new, ({
if (!is_available_bucket(new))
return false;
old = bucket_data_cmpxchg(ca, g, new, ({
new.owned_by_allocator = 1;
new.had_metadata = 0;
new.touched_this_mount = 1;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
@ -321,11 +325,28 @@ void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
}));
/* XXX: we're not actually updating fs usage's cached sectors... */
bch2_fs_usage_update(&stats, old, new);
bch2_fs_usage_update(&stats, *old, new);
if (!old.owned_by_allocator && old.cached_sectors)
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, g - ca->buckets,
old.cached_sectors);
old->cached_sectors);
return true;
}
bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
{
struct bucket_mark new, old;
old = bucket_data_cmpxchg(ca, g, new, ({
if (new.touched_this_mount ||
!is_available_bucket(new))
return false;
new.owned_by_allocator = 1;
new.touched_this_mount = 1;
}));
return true;
}
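bch2_mark_alloc_bucket_startup() claims a bucket only if nothing has touched it since mount, which is what lets early allocations bypass the allocator thread (see the comment added to bch2_gc() earlier in this diff). A hedged sketch of how a startup-time caller in alloc.c might use it; the actual scan loop is not shown on this page:

	size_t b;

	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
		if (bch2_mark_alloc_bucket_startup(ca, &ca->buckets[b]))
			return (long) b;	/* claimed this bucket */

	return -1;	/* fall back to the allocator thread */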
void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
@ -333,6 +354,7 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
struct bucket_mark old, new;
old = bucket_data_cmpxchg(ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
@ -348,7 +370,8 @@ void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
struct bucket_mark new;
bucket_data_cmpxchg(ca, g, new, ({
new.owned_by_allocator = owned_by_allocator;
new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator;
}));
}
@ -376,8 +399,8 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
old = bucket_data_cmpxchg(ca, g, new, ({
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
GC_MAX_SECTORS_USED);
new.data_type = type;
new.had_metadata = 1;
new.data_type = type;
new.touched_this_mount = 1;
}));
if (old.data_type != type &&
@ -458,8 +481,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
if (gc_will_visit) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
new.touched_this_mount = 1;
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}));
goto out;
@ -479,11 +503,6 @@ static void bch2_mark_pointer(struct bch_fs *c,
return;
}
EBUG_ON(type != S_CACHED &&
!may_make_unavailable &&
is_available_bucket(new) &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
if (type != S_CACHED &&
new.dirty_sectors == GC_MAX_SECTORS_USED &&
disk_sectors < 0)
@ -508,7 +527,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
new.data_type = data_type;
}
new.had_metadata |= is_meta_bucket(new);
new.touched_this_mount = 1;
}));
if (old.data_type != data_type &&


@ -191,7 +191,9 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_invalidate_bucket(struct bch_dev *, struct bucket *);
bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
struct bucket_mark *);
bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,


@ -3,6 +3,7 @@
#include "util.h"
/* kill, switch to bch_data_types */
enum bucket_data_type {
BUCKET_DATA = 0,
BUCKET_BTREE,
@ -19,23 +20,12 @@ struct bucket_mark {
struct {
u8 gen;
unsigned gen_valid:1;
unsigned journal_seq_valid:1;
/*
* If this bucket had metadata while at the current generation
* number, the allocator must increment its gen before we reuse
* it:
*/
unsigned had_metadata:1;
unsigned owned_by_allocator:1;
unsigned data_type:3;
unsigned nouse:1;
u8 data_type:3,
gen_valid:1,
owned_by_allocator:1,
nouse:1,
journal_seq_valid:1,
touched_this_mount:1;
u16 dirty_sectors;
u16 cached_sectors;


@ -412,9 +412,6 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
size_ondisk > ca->mi.bucket_size)
return "spans multiple buckets";
if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
return "device not marked as containing data";
return NULL;
}
@ -547,12 +544,12 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
goto err;
}
if (replicas < c->sb.meta_replicas_have) {
if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
bch2_fs_bug(c,
"btree key bad (too few replicas, %u < %u): %s",
replicas, c->sb.meta_replicas_have, buf);
"btree key bad (replicas not marked in superblock):\n%s",
buf);
return;
}
@ -1755,12 +1752,12 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
}
if (!bkey_extent_is_cached(e.k) &&
replicas < c->sb.data_replicas_have) {
bch2_bkey_val_to_text(c, btree_node_type(b), buf,
sizeof(buf), e.s_c);
!bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), e.s_c);
bch2_fs_bug(c,
"extent key bad (too few replicas, %u < %u): %s",
replicas, c->sb.data_replicas_have, buf);
"extent key bad (replicas not marked in superblock):\n%s",
buf);
return;
}


@ -531,7 +531,8 @@ static int bch2_write_extent(struct bch_write_op *op,
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
bch2_check_mark_super(c, key_to_write, false);
bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
BCH_DATA_USER);
bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
return ret;


@ -53,28 +53,6 @@ static inline u64 journal_pin_seq(struct journal *j,
return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
}
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (JOURNAL_ENTRY_TYPE(entry) == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
vstruct_for_each_safe(entry, k, _n)
static inline void bch2_journal_add_entry(struct journal_buf *buf,
const void *data, size_t u64s,
unsigned type, enum btree_id id,
@ -123,20 +101,6 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
JOURNAL_ENTRY_BTREE_ROOT, id, level);
}
static inline void bch2_journal_add_prios(struct journal *j,
struct journal_buf *buf)
{
/*
* no prio bucket ptrs yet... XXX should change the allocator so this
* can't happen:
*/
if (!buf->nr_prio_buckets)
return;
bch2_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
}
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
@ -986,7 +950,6 @@ static inline bool journal_has_keys(struct list_head *list)
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
struct jset_entry *prio_ptrs;
struct journal_list jlist;
struct journal_replay *i;
struct journal_entry_pin_list *p;
@ -1094,15 +1057,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
keys, entries, (u64) atomic64_read(&j->seq));
i = list_last_entry(list, struct journal_replay, list);
prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0);
if (prio_ptrs) {
memcpy_u64s(j->prio_buckets,
prio_ptrs->_data,
le16_to_cpu(prio_ptrs->u64s));
j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
}
fsck_err:
return ret;
}
@ -1189,12 +1143,7 @@ static void __bch2_journal_next_entry(struct journal *j)
static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
{
unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
if (buf->nr_prio_buckets)
ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
return ret;
return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
}
static enum {
@ -1395,9 +1344,7 @@ static int journal_entry_open(struct journal *j)
buf->disk_sectors = sectors;
sectors = min_t(unsigned, sectors, buf->size >> 9);
j->cur_buf_sectors = sectors;
buf->nr_prio_buckets = j->nr_prio_buckets;
u64s = (sectors << 9) / sizeof(u64);
@ -1510,17 +1457,27 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
for_each_jset_key(k, _n, entry, &i->j) {
struct disk_reservation disk_res;
/*
* We might cause compressed extents to be split, so we
* need to pass in a disk_reservation:
*/
BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
if (entry->btree_id == BTREE_ID_ALLOC) {
/*
* allocation code handles replay for
* BTREE_ID_ALLOC keys:
*/
ret = bch2_alloc_replay_key(c, k->k.p);
} else {
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
bch2_disk_reservation_put(c, &disk_res);
/*
* We might cause compressed extents to be
* split, so we need to pass in a
* disk_reservation:
*/
BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_REPLAY);
bch2_disk_reservation_put(c, &disk_res);
}
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
@ -1560,13 +1517,12 @@ err:
return ret;
}
#if 0
/*
* Allocate more journal space at runtime - not currently making use of it, but
* the code works:
*/
static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
unsigned nr)
{
struct journal *j = &c->journal;
struct journal_device *ja = &ca->journal;
@ -1614,8 +1570,8 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
while (ja->nr < nr) {
/* must happen under journal lock, to avoid racing with gc: */
u64 b = bch2_bucket_alloc(ca, RESERVE_NONE);
if (!b) {
long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
if (b < 0) {
if (!closure_wait(&c->freelist_wait, &cl)) {
spin_unlock(&j->lock);
closure_sync(&cl);
@ -1651,7 +1607,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
}
spin_unlock(&j->lock);
BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
bch2_write_super(c);
@ -1663,16 +1619,15 @@ err:
kfree(new_buckets);
bch2_disk_reservation_put(c, &disk_res);
if (!ret)
bch2_dev_allocator_add(c, ca);
return ret;
}
#endif
int bch2_dev_journal_alloc(struct bch_dev *ca)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets;
unsigned i, nr;
u64 b, *p;
unsigned nr;
if (dynamic_fault("bcachefs:add:journal_alloc"))
return -ENOMEM;
@ -1686,45 +1641,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
min(1 << 10,
(1 << 20) / ca->mi.bucket_size));
p = krealloc(ja->bucket_seq, nr * sizeof(u64),
GFP_KERNEL|__GFP_ZERO);
if (!p)
return -ENOMEM;
ja->bucket_seq = p;
p = krealloc(ja->buckets, nr * sizeof(u64),
GFP_KERNEL|__GFP_ZERO);
if (!p)
return -ENOMEM;
ja->buckets = p;
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
nr + sizeof(*journal_buckets) / sizeof(u64));
if (!journal_buckets)
return -ENOMEM;
for (i = 0, b = ca->mi.first_bucket;
i < nr && b < ca->mi.nbuckets; b++) {
if (!is_available_bucket(ca->buckets[b].mark))
continue;
bch2_mark_metadata_bucket(ca, &ca->buckets[b],
BUCKET_JOURNAL, true);
ja->buckets[i] = b;
journal_buckets->buckets[i] = cpu_to_le64(b);
i++;
}
if (i < nr)
return -ENOSPC;
BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi));
ja->nr = nr;
return 0;
return bch2_set_nr_journal_buckets(ca->fs, ca, nr);
}
/* Journalling */
@ -2274,9 +2191,6 @@ static void journal_write(struct closure *cl)
jset = w->data;
j->write_start_time = local_clock();
bch2_journal_add_prios(j, w);
mutex_lock(&c->btree_root_lock);
for (i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = &c->btree_roots[i];
@ -2324,7 +2238,8 @@ static void journal_write(struct closure *cl)
closure_return_with_destructor(cl, journal_write_done);
}
bch2_check_mark_super(c, &j->key, true);
bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
BCH_DATA_JOURNAL);
/*
* XXX: we really should just disable the entire journal in nochanges
@ -2380,7 +2295,7 @@ no_io:
closure_return_with_destructor(cl, journal_write_done);
err:
bch2_fatal_error(c);
bch2_inconsistent_error(c);
closure_return_with_destructor(cl, journal_write_done);
}


@ -121,6 +121,28 @@ struct journal_replay {
struct jset j;
};
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (JOURNAL_ENTRY_TYPE(entry) == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
vstruct_for_each_safe(entry, k, _n)
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)


@ -20,13 +20,6 @@ struct journal_buf {
unsigned size;
unsigned disk_sectors;
/*
* ugh, prio_buckets are stupid - need to convert them to new
* transaction machinery when it arrives
*/
unsigned nr_prio_buckets;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
@ -189,14 +182,6 @@ struct journal {
/* protects advancing ja->last_idx: */
struct mutex reclaim_lock;
/*
* ugh: need to get prio_buckets converted over to the eventual new
* transaction machinery
*/
__le64 prio_buckets[BCH_SB_MEMBERS_MAX];
unsigned nr_prio_buckets;
unsigned write_delay_ms;
unsigned reclaim_delay_ms;


@ -59,16 +59,18 @@ int bch2_move_data_off_device(struct bch_dev *ca)
{
struct moving_context ctxt;
struct bch_fs *c = ca->fs;
struct bch_sb_field_members *mi;
unsigned pass = 0;
u64 seen_key_count;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
if (!ca->mi.has_data)
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
return 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
ctxt.avoid = ca;
@ -124,7 +126,11 @@ int bch2_move_data_off_device(struct bch_dev *ca)
BUG_ON(ret);
seen_key_count++;
continue;
next:
if (bkey_extent_is_data(k.k))
bch2_check_mark_super(c, bkey_s_c_to_extent(k),
BCH_DATA_USER);
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
@ -133,23 +139,20 @@ next:
bch2_move_ctxt_exit(&ctxt);
if (ret)
return ret;
goto err;
} while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
if (seen_key_count) {
pr_err("Unable to migrate all data in %d iterations.",
MAX_DATA_OFF_ITER);
return -1;
ret = -1;
goto err;
}
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
err:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
/*
@ -245,21 +248,27 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
int bch2_move_metadata_off_device(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bch_sb_field_members *mi;
unsigned i;
int ret;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
if (!ca->mi.has_metadata)
if (!(bch2_dev_has_data(c, ca) &
((1 << BCH_DATA_JOURNAL)|
(1 << BCH_DATA_BTREE))))
return 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c,
(1 << BCH_DATA_JOURNAL)|
(1 << BCH_DATA_BTREE));
/* 1st, Move the btree nodes off the device */
for (i = 0; i < BTREE_ID_NR; i++) {
ret = bch2_move_btree_off(c, ca, i);
if (ret)
return ret;
goto err;
}
/* There are no prios/gens to move -- they are already in the device. */
@ -268,16 +277,12 @@ int bch2_move_metadata_off_device(struct bch_dev *ca)
ret = bch2_journal_move(ca);
if (ret)
return ret;
goto err;
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
err:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
/*
@ -326,12 +331,16 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
*/
int bch2_flag_data_bad(struct bch_dev *ca)
{
int ret = 0;
struct bch_fs *c = ca->fs;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct btree_iter iter;
int ret = 0;
bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS,
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
POS_MIN, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
@ -377,10 +386,16 @@ int bch2_flag_data_bad(struct bch_dev *ca)
*/
continue;
advance:
if (bkey_extent_is_data(k.k))
bch2_check_mark_super(c, bkey_s_c_to_extent(k),
BCH_DATA_USER);
bch2_btree_iter_advance_pos(&iter);
}
bch2_btree_iter_unlock(&iter);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}


@ -59,6 +59,8 @@ enum opt_type {
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\
s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
BCH_OPT(degraded, 0444, NO_SB_OPT, \
s8, OPT_BOOL()) \
BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \
s8, OPT_STR(bch2_csum_types)) \
BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \


@ -267,9 +267,6 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
}
}
#define BCH_HASH_SET_MUST_CREATE (1 << 4)
#define BCH_HASH_SET_MUST_REPLACE (1 << 5)
static inline int bch2_hash_set(const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct bch_fs *c, u64 inode,


@ -11,6 +11,9 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
{
BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
@ -228,8 +231,8 @@ static int u64_cmp(const void *_l, const void *_r)
return l < r ? -1 : l > r ? 1 : 0;
}
const char *bch2_validate_journal_layout(struct bch_sb *sb,
struct bch_member_cpu mi)
const char *bch2_sb_validate_journal(struct bch_sb *sb,
struct bch_member_cpu mi)
{
struct bch_sb_field_journal *journal;
const char *err;
@ -291,7 +294,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb)
return "Invalid superblock: bad member info";
for (i = 0; i < sb->nr_devices; i++) {
if (bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
if (!bch2_dev_exists(sb, mi, i))
continue;
if (le16_to_cpu(mi->members[i].bucket_size) <
@ -302,7 +305,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb)
return NULL;
}
const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
@ -347,11 +350,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
BCH_SB_META_REPLICAS_HAVE(sb) >
BCH_SB_META_REPLICAS_WANT(sb))
return "Invalid number of metadata replicas";
if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
@ -360,11 +358,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
BCH_SB_DATA_REPLICAS_HAVE(sb) >
BCH_SB_DATA_REPLICAS_WANT(sb))
return "Invalid number of data replicas";
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
@ -419,7 +412,11 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb)
mi.bucket_size * mi.nbuckets)
return "Invalid superblock: device too small";
err = bch2_validate_journal_layout(sb, mi);
err = bch2_sb_validate_journal(sb, mi);
if (err)
return err;
err = bch2_sb_validate_replicas(sb);
if (err)
return err;
@ -464,8 +461,6 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src);
c->sb.nr_devices = src->nr_devices;
c->sb.clean = BCH_SB_CLEAN(src);
c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
@ -517,6 +512,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
unsigned journal_u64s = journal_buckets
? le32_to_cpu(journal_buckets->field.u64s)
: 0;
int ret;
lockdep_assert_held(&c->sb_lock);
@ -524,8 +520,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
return -ENOMEM;
__copy_super(c->disk_sb, src);
bch2_sb_update(c);
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret)
return ret;
bch2_sb_update(c);
return 0;
}
@ -743,6 +743,7 @@ void bch2_write_super(struct bch_fs *c)
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
unsigned i, super_idx = 0;
const char *err;
bool wrote;
lockdep_assert_held(&c->sb_lock);
@ -754,7 +755,16 @@ void bch2_write_super(struct bch_fs *c)
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
if (c->opts.nochanges)
for_each_online_member(ca, c, i) {
err = bch2_sb_validate(&ca->disk_sb);
if (err) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
goto out;
}
}
if (c->opts.nochanges ||
test_bit(BCH_FS_ERROR, &c->flags))
goto out;
do {
@ -771,40 +781,482 @@ out:
bch2_sb_update(c);
}
void bch2_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k,
bool meta)
/* replica information: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
unsigned *max_dev)
{
struct bch_replicas_entry *i;
unsigned j;
*nr = 0;
*bytes = sizeof(*r);
*max_dev = 0;
if (!r)
return;
for_each_replicas_entry(r, i) {
for (j = 0; j < i->nr; j++)
*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
(*nr)++;
}
*bytes = (void *) i - (void *) r;
}
static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
struct bch_replicas_cpu *cpu_r;
unsigned i, nr, bytes, max_dev, entry_size;
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!cpu_r)
return NULL;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
if (nr) {
struct bch_replicas_cpu_entry *dst =
cpu_replicas_entry(cpu_r, 0);
struct bch_replicas_entry *src = sb_r->entries;
while (dst < cpu_replicas_entry(cpu_r, nr)) {
dst->data_type = src->data_type;
for (i = 0; i < src->nr; i++)
replicas_set_dev(dst, src->devs[i]);
src = replicas_entry_next(src);
dst = (void *) dst + entry_size;
}
}
eytzinger0_sort(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
return cpu_r;
}
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
lockdep_assert_held(&c->sb_lock);
sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
old_r = c->replicas;
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
return 0;
}
/*
* for when gc of replica information is in progress:
*/
static int bch2_update_gc_replicas(struct bch_fs *c,
struct bch_replicas_cpu *gc_r,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
{
struct bch_member *mi;
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
unsigned nr_replicas = 0;
struct bch_replicas_cpu_entry *new_e;
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size, max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
max_dev = max_t(unsigned, max_dev, ptr->dev);
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, gc_r->entry_size);
nr = gc_r->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return -ENOMEM;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < gc_r->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(gc_r, i),
gc_r->entry_size);
new_e = cpu_replicas_entry(new, nr - 1);
new_e->data_type = data_type;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
replicas_set_dev(new_e, ptr->dev);
eytzinger0_sort(new->entries,
new->nr,
new->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, new);
kfree_rcu(gc_r, rcu);
return 0;
}
int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_types data_type)
{
struct bch_replicas_cpu *gc_r;
const struct bch_extent_ptr *ptr;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *new_entry;
unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
int ret = 0;
mutex_lock(&c->sb_lock);
/* recheck, might have raced */
if (bch2_check_super_marked(c, k, meta)) {
mutex_unlock(&c->sb_lock);
return;
gc_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (gc_r &&
!replicas_has_extent(gc_r, e, data_type)) {
ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
if (ret)
goto err;
}
mi = bch2_sb_get_members(c->disk_sb)->members;
/* recheck, might have raced */
if (bch2_sb_has_replicas(c, e, data_type)) {
mutex_unlock(&c->sb_lock);
return 0;
}
new_entry_bytes = sizeof(struct bch_replicas_entry) +
bch2_extent_nr_dirty_ptrs(e.s_c);
sb_r = bch2_sb_get_replicas(c->disk_sb);
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
new_entry = (void *) sb_r + bytes;
new_entry->data_type = data_type;
new_entry->nr = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
(meta
? SET_BCH_MEMBER_HAS_METADATA
: SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
nr_replicas++;
}
if (!ptr->cached)
new_entry->devs[new_entry->nr++] = ptr->dev;
nr_replicas = min_t(unsigned, nr_replicas,
(meta
? BCH_SB_META_REPLICAS_HAVE
: BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb));
(meta
? SET_BCH_SB_META_REPLICAS_HAVE
: SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas);
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret) {
memset(new_entry, 0,
vstruct_end(&sb_r->field) - (void *) new_entry);
goto err;
}
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_dev *dev_to_offline)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
struct replicas_status ret;
memset(&ret, 0, sizeof(ret));
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
rcu_read_lock();
r = rcu_dereference(c->replicas);
dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
for (i = 0; i < r->nr; i++) {
e = cpu_replicas_entry(r, i);
BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
nr_online = nr_offline = 0;
for (dev = 0; dev < dev_slots; dev++) {
if (!replicas_test_dev(e, dev))
continue;
if (bch2_dev_is_online(c->devs[dev]) &&
c->devs[dev] != dev_to_offline)
nr_online++;
else
nr_offline++;
}
ret.replicas[e->data_type].nr_online =
min(ret.replicas[e->data_type].nr_online,
nr_online);
ret.replicas[e->data_type].nr_offline =
max(ret.replicas[e->data_type].nr_offline,
nr_offline);
}
rcu_read_unlock();
return ret;
}
struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
return __bch2_replicas_status(c, NULL);
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
struct replicas_status s = bch2_replicas_status(c);
return meta
? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_BTREE].nr_online)
: s.replicas[BCH_DATA_USER].nr_online;
}
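/* Returns a mask of BCH_DATA_* types with replicas entries referencing @ca: */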
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
for (i = 0; i < r->nr; i++) {
e = cpu_replicas_entry(r, i);
		if (replicas_test_dev(e, ca->dev_idx))
			ret |= 1 << e->data_type;
}
out:
rcu_read_unlock();
return ret;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
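	/* sort so that any duplicate entries end up adjacent: */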
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *r, *old_r;
struct bch_replicas_entry *dst_e;
size_t i, j, bytes, dev_slots;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(r, rcu);
goto err;
}
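	/* write the surviving entries back out in the variable-length superblock format: */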
dev_slots = replicas_dev_slots(r);
bytes = sizeof(struct bch_sb_field_replicas);
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *e =
cpu_replicas_entry(r, i);
bytes += sizeof(struct bch_replicas_entry);
for (j = 0; j < r->entry_size - 1; j++)
bytes += hweight8(e->devs[j]);
}
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
dst_e = sb_r->entries;
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(r, i);
dst_e->data_type = src_e->data_type;
for (j = 0; j < dev_slots; j++)
if (replicas_test_dev(src_e, j))
dst_e->devs[dst_e->nr++] = j;
dst_e = replicas_entry_next(dst_e);
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *r, *src;
unsigned i;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
r = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!r) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
r->entry_size = src->entry_size;
r->nr = 0;
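	/* keep only the entries whose data type isn't covered by typemask: */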
for (i = 0; i < src->nr; i++) {
struct bch_replicas_cpu_entry *dst_e =
cpu_replicas_entry(r, r->nr);
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(src, i);
if (!(src_e->data_type & typemask)) {
memcpy(dst_e, src_e, r->entry_size);
r->nr++;
}
}
eytzinger0_sort(r->entries,
r->nr,
r->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, r);
mutex_unlock(&c->sb_lock);
return 0;
}

View File

@ -2,6 +2,7 @@
#define _BCACHE_SUPER_IO_H
#include "extents.h"
#include "eytzinger.h"
#include "super_types.h"
#include <asm/byteorder.h>
@ -40,6 +41,15 @@ bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \
BCH_SB_FIELD_TYPE(journal);
BCH_SB_FIELD_TYPE(members);
BCH_SB_FIELD_TYPE(crypt);
BCH_SB_FIELD_TYPE(replicas);
static inline bool bch2_dev_exists(struct bch_sb *sb,
struct bch_sb_field_members *mi,
unsigned dev)
{
return dev < sb->nr_devices &&
!bch2_is_zero(mi->members[dev].uuid.b, sizeof(uuid_le));
}
static inline bool bch2_sb_test_feature(struct bch_sb *sb,
enum bch_sb_features f)
@ -91,8 +101,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.bucket_size = le16_to_cpu(mi->bucket_size),
.state = BCH_MEMBER_STATE(mi),
.tier = BCH_MEMBER_TIER(mi),
.has_metadata = BCH_MEMBER_HAS_METADATA(mi),
.has_data = BCH_MEMBER_HAS_DATA(mi),
.replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
@ -105,55 +113,116 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bcache_superblock *);
int bch2_super_realloc(struct bcache_superblock *, unsigned);
const char *bch2_validate_journal_layout(struct bch_sb *,
const char *bch2_sb_validate_journal(struct bch_sb *,
struct bch_member_cpu);
const char *bch2_validate_cache_super(struct bcache_superblock *);
const char *bch2_sb_validate(struct bcache_superblock *);
const char *bch2_read_super(struct bcache_superblock *,
struct bch_opts, const char *);
void bch2_write_super(struct bch_fs *);
void bch2_check_mark_super_slowpath(struct bch_fs *,
const struct bkey_i *, bool);
static inline bool bch2_check_super_marked(struct bch_fs *c,
const struct bkey_i *k, bool meta)
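/* each entry's devs[] is a bitmap of device indices, one bit per device: */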
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}
static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
e->devs[dev >> 3] |= 1 << (dev & 7);
}
static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
return (r->entry_size -
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_types);
static inline bool replicas_has_extent(struct bch_replicas_cpu *r,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
unsigned nr_replicas = 0;
bool ret = true;
struct bch_replicas_cpu_entry search = {
.data_type = data_type,
};
unsigned max_dev = 0;
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
if (ptr->cached)
continue;
if (!(meta
? ca->mi.has_metadata
: ca->mi.has_data)) {
ret = false;
break;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
max_dev = max_t(unsigned, max_dev, ptr->dev);
replicas_set_dev(&search, ptr->dev);
}
nr_replicas++;
}
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
if (nr_replicas <
(meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
ret = false;
static inline bool bch2_sb_has_replicas(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
{
bool ret;
rcu_read_lock();
ret = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type);
rcu_read_unlock();
return ret;
}
static inline void bch2_check_mark_super(struct bch_fs *c,
const struct bkey_i *k, bool meta)
static inline int bch2_check_mark_super(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
{
if (bch2_check_super_marked(c, k, meta))
return;
struct bch_replicas_cpu *gc_r;
bool marked;
bch2_check_mark_super_slowpath(c, k, meta);
rcu_read_lock();
marked = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type) &&
(!(gc_r = rcu_dereference(c->replicas_gc)) ||
replicas_has_extent(gc_r, e, data_type));
rcu_read_unlock();
if (marked)
return 0;
return bch2_check_mark_super_slowpath(c, e, data_type);
}
struct replicas_status {
struct {
unsigned nr_online;
unsigned nr_offline;
} replicas[BCH_DATA_NR];
};
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_dev *);
struct replicas_status bch2_replicas_status(struct bch_fs *);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
#endif /* _BCACHE_SUPER_IO_H */

View File

@ -224,6 +224,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_dev_allocator_stop(ca);
bch2_fs_journal_stop(&c->journal);
for_each_member_device(ca, c, i)
bch2_dev_allocator_remove(c, ca);
}
static void bch2_writes_disabled(struct percpu_ref *writes)
@ -330,6 +333,10 @@ const char *bch2_fs_read_write(struct bch_fs *c)
c->state != BCH_FS_RO)
goto out;
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
err = "error starting allocator thread";
for_each_rw_member(ca, c, i)
if (bch2_dev_allocator_start(ca)) {
@ -484,6 +491,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_cache_lock);
mutex_init(&c->bucket_lock);
mutex_init(&c->btree_root_lock);
@ -603,7 +611,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
if (bch2_dev_exists(c->disk_sb, mi, i) &&
bch2_dev_alloc(c, i))
goto err;
@ -681,12 +689,16 @@ static const char *__bch2_fs_start(struct bch_fs *c)
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned i, id;
time64_t now;
LIST_HEAD(journal);
struct jset *j;
struct closure cl;
u64 journal_seq = 0;
time64_t now;
unsigned i;
int ret = -EINVAL;
closure_init_stack(&cl);
BUG_ON(c->state != BCH_FS_STARTING);
mutex_lock(&c->sb_lock);
@ -694,6 +706,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch2_sb_from_fs(c, ca);
mutex_unlock(&c->sb_lock);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (BCH_SB_INITIALIZED(c->disk_sb)) {
ret = bch2_journal_read(c, &journal);
if (ret)
@ -704,44 +720,45 @@ static const char *__bch2_fs_start(struct bch_fs *c)
c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
err = "error reading priorities";
for_each_readable_member(ca, c, i) {
ret = bch2_prio_read(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
}
for (id = 0; id < BTREE_ID_NR; id++) {
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
struct bkey_i *k;
err = "bad btree root";
k = bch2_journal_find_btree_root(c, j, id, &level);
if (!k && id == BTREE_ID_EXTENTS)
err = "missing btree root";
k = bch2_journal_find_btree_root(c, j, i, &level);
if (!k && i < BTREE_ID_ALLOC)
goto err;
if (!k) {
pr_debug("missing btree root: %d", id);
if (!k)
continue;
}
err = "error reading btree root";
if (bch2_btree_root_read(c, id, k, level))
if (bch2_btree_root_read(c, i, k, level))
goto err;
}
bch_verbose(c, "starting mark and sweep:");
err = "error reading allocation information";
ret = bch2_alloc_read(c, &journal);
if (ret)
goto err;
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
if (c->opts.noreplay)
goto recovery_done;
bch_verbose(c, "mark and sweep done");
err = "cannot allocate new btree root";
for (i = 0; i < BTREE_ID_NR; i++)
if (!c->btree_roots[i].b &&
bch2_btree_root_alloc(c, i, &cl))
goto err;
closure_sync(&cl);
/*
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
@ -758,12 +775,10 @@ static const char *__bch2_fs_start(struct bch_fs *c)
}
bch_verbose(c, "starting journal replay:");
err = "journal replay failed";
ret = bch2_journal_replay(c, &journal);
if (ret)
goto err;
bch_verbose(c, "journal replay done");
if (c->opts.norecovery)
@ -774,23 +789,21 @@ static const char *__bch2_fs_start(struct bch_fs *c)
ret = bch2_fsck(c, !c->opts.nofsck);
if (ret)
goto err;
bch_verbose(c, "fsck done");
for_each_rw_member(ca, c, i)
if (ca->need_prio_write) {
ret = bch2_prio_write(ca);
if (ca->need_alloc_write) {
ret = bch2_alloc_write(c, ca, &journal_seq);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
}
bch_verbose(c, "fsck done");
bch2_journal_flush_seq(&c->journal, journal_seq);
} else {
struct bch_inode_unpacked inode;
struct bkey_inode_buf packed_inode;
struct closure cl;
closure_init_stack(&cl);
bch_notice(c, "initializing new filesystem");
@ -805,6 +818,11 @@ static const char *__bch2_fs_start(struct bch_fs *c)
goto err;
}
err = "cannot allocate new btree root";
for (i = 0; i < BTREE_ID_NR; i++)
if (bch2_btree_root_alloc(c, i, &cl))
goto err;
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
@ -819,13 +837,6 @@ static const char *__bch2_fs_start(struct bch_fs *c)
goto err;
}
err = "cannot allocate new btree root";
for (id = 0; id < BTREE_ID_NR; id++)
if (bch2_btree_root_alloc(c, id, &cl)) {
closure_sync(&cl);
goto err;
}
/* Wait for new btree roots to be written: */
closure_sync(&cl);
@ -877,6 +888,8 @@ out:
bch2_journal_entries_free(&journal);
return err;
err:
closure_sync(&cl);
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
@ -940,10 +953,7 @@ static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
if (uuid_le_cmp(fs->uuid, sb->uuid))
return "device not a member of filesystem";
if (sb->dev_idx >= newest->nr_devices)
return "device has invalid dev_idx";
if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
if (!bch2_dev_exists(newest, mi, sb->dev_idx))
return "device has been removed";
if (fs->block_size != sb->block_size)
@ -981,9 +991,6 @@ static void bch2_dev_free(struct bch_dev *ca)
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->usage_percpu);
kvpfree(ca->disk_buckets, bucket_bytes(ca));
kfree(ca->prio_buckets);
kfree(ca->bio_prio);
kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
free_heap(&ca->copygc_heap);
@ -1011,7 +1018,7 @@ static void __bch2_dev_offline(struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
__bch2_dev_read_only(ca->fs, ca);
__bch2_dev_read_only(c, ca);
reinit_completion(&ca->offline_complete);
percpu_ref_kill(&ca->io_ref);
@ -1061,7 +1068,7 @@ static int bch2_dev_sysfs_online(struct bch_dev *ca)
return 0;
if (!ca->kobj.state_in_sysfs) {
ret = kobject_add(&ca->kobj, &ca->fs->kobj,
ret = kobject_add(&ca->kobj, &c->kobj,
"dev-%u", ca->dev_idx);
if (ret)
return ret;
@ -1087,7 +1094,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
struct bch_member *member;
size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
size_t heap_size;
unsigned i;
unsigned i, btree_node_reserve_buckets;
struct bch_dev *ca;
if (bch2_fs_init_fault("dev_alloc"))
@ -1107,8 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->dev_idx = dev_idx;
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
mutex_init(&ca->prio_write_lock);
bch2_dev_moving_gc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);
@ -1134,12 +1139,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
free_inc_reserve = movinggc_reserve / 2;
heap_size = movinggc_reserve * 8;
btree_node_reserve_buckets =
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->sb.btree_node_size);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC],
movinggc_reserve, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@ -1152,18 +1161,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
!(ca->buckets = kvpmalloc(ca->mi.nbuckets *
sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
!(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
!(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
goto err;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
total_reserve = ca->free_inc.size;
for (i = 0; i < RESERVE_NR; i++)
total_reserve += ca->free[i].size;
@ -1232,53 +1235,48 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
lg_local_lock(&c->usage_lock);
if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
bch2_mark_dev_metadata(ca->fs, ca);
bch2_mark_dev_metadata(c, ca);
lg_local_unlock(&c->usage_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(ca->disk_sb.sb);
bool has_journal =
bch2_nr_journal_buckets(journal_buckets) >=
BCH_JOURNAL_BUCKETS_MIN;
bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
bch2_dev_group_add(&c->all_devs, ca);
if (has_journal)
bch2_dev_group_add(&c->journal.devs, ca);
}
percpu_ref_reinit(&ca->io_ref);
return 0;
}
/* Device management: */
bool bch2_fs_may_start(struct bch_fs *c, int flags)
static bool have_enough_devs(struct bch_fs *c,
struct replicas_status s,
unsigned flags)
{
struct bch_sb_field_members *mi;
unsigned meta_missing = 0;
unsigned data_missing = 0;
bool degraded = false;
unsigned i;
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->disk_sb->nr_devices; i++)
if (!c->devs[i] &&
!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
degraded = true;
if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
meta_missing++;
if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
data_missing++;
}
mutex_unlock(&c->sb_lock);
if (degraded &&
!(flags & BCH_FORCE_IF_DEGRADED))
return false;
if (meta_missing &&
if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
s.replicas[BCH_DATA_BTREE].nr_offline) &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
!s.replicas[BCH_DATA_BTREE].nr_online) &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
if (s.replicas[BCH_DATA_USER].nr_offline &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
if (!s.replicas[BCH_DATA_USER].nr_online &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
@ -1297,40 +1295,80 @@ bool bch2_fs_may_start(struct bch_fs *c, int flags)
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct replicas_status s;
struct bch_dev *ca2;
int i, nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
if (new_state == BCH_MEMBER_STATE_RW)
switch (new_state) {
case BCH_MEMBER_STATE_RW:
return true;
case BCH_MEMBER_STATE_RO:
if (ca->mi.state != BCH_MEMBER_STATE_RW)
return true;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
return true;
/* do we have enough devices to write to? */
for_each_member_device(ca2, c, i)
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
/*
* If the device is already offline - whatever is going on with it can't
* possible make the FS need to go RO:
*/
if (!bch2_dev_is_online(ca))
return true;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
: c->opts.metadata_replicas_required,
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
: c->opts.data_replicas_required);
if (ca->mi.has_data &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
return nr_rw - 1 <= required;
case BCH_MEMBER_STATE_FAILED:
case BCH_MEMBER_STATE_SPARE:
if (ca->mi.state != BCH_MEMBER_STATE_RW &&
ca->mi.state != BCH_MEMBER_STATE_RO)
return true;
if (ca->mi.has_data &&
c->sb.data_replicas_have <= 1 &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
/* do we have enough devices to read from? */
s = __bch2_replicas_status(c, ca);
if (ca->mi.has_metadata &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
pr_info("replicas: j %u %u b %u %u d %u %u",
s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_JOURNAL].nr_offline,
if (ca->mi.has_metadata &&
c->sb.meta_replicas_have <= 1 &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
s.replicas[BCH_DATA_BTREE].nr_online,
s.replicas[BCH_DATA_BTREE].nr_offline,
return true;
s.replicas[BCH_DATA_USER].nr_online,
s.replicas[BCH_DATA_USER].nr_offline);
return have_enough_devs(c, s, flags);
default:
BUG();
}
}
static bool bch2_fs_may_start(struct bch_fs *c, int flags)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
unsigned i;
if (!c->opts.degraded) {
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->disk_sb->nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
!bch2_dev_is_online(c->devs[i]) &&
(c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
mutex_unlock(&c->sb_lock);
}
s = bch2_replicas_status(c);
return have_enough_devs(c, s, flags);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@ -1343,8 +1381,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* complete.
*/
bch2_dev_allocator_stop(ca);
bch2_dev_group_remove(&c->journal.devs, ca);
bch2_dev_allocator_remove(c, ca);
}
static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@ -1353,6 +1390,9 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (bch2_dev_allocator_start(ca))
return "error starting allocator thread";
@ -1411,7 +1451,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_sb_field_members *mi;
unsigned dev_idx = ca->dev_idx;
unsigned dev_idx = ca->dev_idx, data;
int ret = -EINVAL;
mutex_lock(&c->state_lock);
@ -1439,19 +1479,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
if (ca->mi.has_data || ca->mi.has_metadata) {
bch_err(ca, "Remove failed, still has data");
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Remove failed, still has data (%x)", data);
goto err;
}
/*
* Ok, really doing the remove:
* Drop device's prio pointer before removing it from superblock:
*/
spin_lock(&c->journal.lock);
c->journal.prio_buckets[dev_idx] = 0;
spin_unlock(&c->journal.lock);
bch2_journal_meta(&c->journal);
__bch2_dev_offline(ca);
@ -1476,6 +1509,7 @@ err:
return ret;
}
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
struct bcache_superblock sb;
@ -1490,7 +1524,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (err)
return -EINVAL;
err = bch2_validate_cache_super(&sb);
err = bch2_sb_validate(&sb);
if (err)
return -EINVAL;
@ -1514,9 +1548,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
mi = bch2_sb_get_members(c->disk_sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
if (dev_idx >= c->sb.nr_devices ||
bch2_is_zero(mi->members[dev_idx].uuid.b,
sizeof(uuid_le)))
if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
goto have_slot;
no_slot:
err = "no slots available in superblock";
@ -1587,13 +1619,13 @@ err:
return ret ?: -EINVAL;
}
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bcache_superblock sb = { 0 };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
int ret;
mutex_lock(&c->state_lock);
@ -1616,12 +1648,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx];
ret = bch2_prio_read(ca);
if (ret) {
err = "error reading priorities";
goto err;
}
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@ -1656,6 +1682,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
int ret;
mutex_lock(&c->state_lock);
@ -1680,8 +1707,9 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
if (ca->mi.has_data || ca->mi.has_metadata) {
bch_err(ca, "Migrate error: data still present");
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
return -EINVAL;
}
@ -1714,11 +1742,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
if (err)
goto err;
err = "attempting to register backing device";
if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
goto err;
err = bch2_validate_cache_super(&sb[i]);
err = bch2_sb_validate(&sb[i]);
if (err)
goto err;
}
@ -1790,7 +1814,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
struct bch_fs *c;
bool allocated_fs = false;
err = bch2_validate_cache_super(sb);
err = bch2_sb_validate(sb);
if (err)
return err;
@ -1855,11 +1879,7 @@ const char *bch2_fs_open_incremental(const char *path)
if (err)
return err;
if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
err = __bch2_fs_open_incremental(&sb, opts);
else
err = "not a bcachefs superblock";
err = __bch2_fs_open_incremental(&sb, opts);
bch2_free_super(&sb);
return err;

View File

@ -337,8 +337,8 @@ SHOW(bch2_fs)
sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
/* Debugging: */
@ -693,7 +693,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
"free[RESERVE_PRIO]: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
@ -705,7 +704,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
@ -759,8 +757,11 @@ SHOW(bch2_dev)
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));
sysfs_print(has_data, ca->mi.has_data);
sysfs_print(has_metadata, ca->mi.has_metadata);
sysfs_print(has_data, bch2_dev_has_data(c, ca) &
(1 << BCH_DATA_USER));
sysfs_print(has_metadata, bch2_dev_has_data(c, ca) &
((1 << BCH_DATA_JOURNAL)|
(1 << BCH_DATA_BTREE)));
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);

View File

@ -533,3 +533,47 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
}
}
}
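/*
 * Heapsort that passes the element size through to the comparison function,
 * so memcmp() can be used directly on variable-sized entries:
 */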
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t size))
{
/* pre-scale counters for performance */
int i = (num/2 - 1) * size, n = num * size, c, r;
if (!swap_func) {
if (size == 4 && alignment_ok(base, 4))
swap_func = u32_swap;
else if (size == 8 && alignment_ok(base, 8))
swap_func = u64_swap;
else
swap_func = generic_swap;
}
/* heapify */
for ( ; i >= 0; i -= size) {
for (r = i; r * 2 + size < n; r = c) {
c = r * 2 + size;
if (c < n - size &&
cmp_func(base + c, base + c + size, size) < 0)
c += size;
if (cmp_func(base + r, base + c, size) >= 0)
break;
swap_func(base + r, base + c, size);
}
}
/* sort */
for (i = n - size; i > 0; i -= size) {
swap_func(base, base + i, size);
for (r = 0; r * 2 + size < i; r = c) {
c = r * 2 + size;
if (c < i - size &&
cmp_func(base + c, base + c + size, size) < 0)
c += size;
if (cmp_func(base + r, base + c, size) >= 0)
break;
swap_func(base + r, base + c, size);
}
}
}

View File

@ -763,4 +763,8 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio,
size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
#endif /* _BCACHE_UTIL_H */