Update bcachefs sources to edf5f38218 bcachefs: Refactor superblock code

Kent Overstreet 2018-04-10 19:19:09 -04:00
parent ff5e165532
commit c598d91dcb
59 changed files with 2532 additions and 2221 deletions
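The whole diff hinges on one change: the bare struct bch_sb * that callers used to hold (c->disk_sb, the sb built in bch2_format(), and so on) becomes a struct bch_sb_handle that owns the buffer, with callers dereferencing .sb when they need the on-disk structure. A minimal sketch of the shape of the change — the handle's fields beyond ->sb are assumed here, not taken from this diff:

    struct bch_sb;                          /* on-disk superblock layout */

    struct bch_sb_handle {
            struct bch_sb   *sb;            /* in-memory copy of the superblock */
            /* buffer size / IO state owned by the superblock code (elided) */
    };

    /*
     * Call sites change mechanically, as the hunks below show:
     *
     *   bch2_sb_get_crypt(c->disk_sb)             ->  bch2_sb_get_crypt(c->disk_sb.sb)
     *   SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1) ->  SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1)
     *   bch2_fs_sb_resize_crypt(c, u64s)          ->  bch2_sb_resize_crypt(&c->disk_sb, u64s)
     *
     * so the superblock code can reallocate the buffer behind the handle
     * instead of handing every caller a pointer that may go stale.
     */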

View File

@ -1 +1 @@
9fc6ccd8659598d4ca885220a795889071b619f4
edf5f38218f699e53913a549465f35d36c4418f7

View File

@ -86,7 +86,7 @@ int cmd_set_passphrase(int argc, char *argv[])
if (IS_ERR(c))
die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
die("Filesystem does not have encryption enabled");
@ -100,7 +100,7 @@ int cmd_set_passphrase(int argc, char *argv[])
char *new_passphrase = read_passphrase_twice("Enter new passphrase: ");
struct bch_key passphrase_key = derive_passphrase(crypt, new_passphrase);
if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb),
if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb.sb),
&new_key, sizeof(new_key)))
die("error encrypting key");
crypt->key = new_key;
@ -123,7 +123,7 @@ int cmd_remove_passphrase(int argc, char *argv[])
if (IS_ERR(c))
die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
die("Filesystem does not have encryption enabled");

View File

@ -31,6 +31,7 @@
#include "libbcachefs/fs.h"
#include "libbcachefs/inode.h"
#include "libbcachefs/io.h"
#include "libbcachefs/replicas.h"
#include "libbcachefs/str_hash.h"
#include "libbcachefs/super.h"
#include "libbcachefs/xattr.h"

View File

@ -15,7 +15,7 @@
#define BUG_ON(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, ...) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define __WARN() assert(0)
#define __WARN_printf(arg...) assert(0)

View File

@ -319,7 +319,7 @@ TRACE_EVENT(btree_gc_coalesce_fail,
TP_fast_assign(
__entry->reason = reason;
memcpy(__entry->uuid, c->disk_sb->user_uuid.b, 16);
memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
),
TP_printk("%pU: %u", __entry->uuid, __entry->reason)

View File

@ -14,12 +14,14 @@
#include <uuid/uuid.h>
#include "libbcachefs/bcachefs_format.h"
#include "libbcachefs/checksum.h"
#include "crypto.h"
#include "libbcachefs.h"
#include "crypto.h"
#include "libbcachefs/bcachefs_format.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/checksum.h"
#include "libbcachefs/disk_groups.h"
#include "libbcachefs/opts.h"
#include "libbcachefs/replicas.h"
#include "libbcachefs/super-io.h"
#define NSEC_PER_SEC 1000000000L
@ -124,8 +126,8 @@ void bch2_pick_bucket_size(struct format_opts opts, struct dev_opts *dev)
}
static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
struct bch_sb_field_disk_groups *gi,
static unsigned parse_target(struct bch_sb_handle *sb,
struct dev_opts *devs, size_t nr_devs,
const char *s)
{
struct dev_opts *i;
@ -138,7 +140,7 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
if (!strcmp(s, i->path))
return dev_to_target(i - devs);
idx = __bch2_disk_group_find(gi, s);
idx = bch2_disk_path_find(sb, s);
if (idx >= 0)
return group_to_target(idx);
@ -149,11 +151,9 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
struct bch_sb *bch2_format(struct format_opts opts,
struct dev_opts *devs, size_t nr_devs)
{
struct bch_sb *sb;
struct bch_sb_handle sb = { NULL };
struct dev_opts *i;
struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *gi = NULL;
unsigned u64s;
/* calculate block size: */
if (!opts.block_size)
@ -184,58 +184,51 @@ struct bch_sb *bch2_format(struct format_opts opts,
if (uuid_is_null(opts.uuid.b))
uuid_generate(opts.uuid.b);
sb = calloc(1, sizeof(*sb) +
sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devs +
sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * nr_devs +
sizeof(struct bch_sb_field_crypt));
if (bch2_sb_realloc(&sb, 0))
die("insufficient memory");
sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
sb->magic = BCACHE_MAGIC;
sb->block_size = cpu_to_le16(opts.block_size);
sb->user_uuid = opts.uuid;
sb->nr_devices = nr_devs;
sb.sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
sb.sb->magic = BCACHE_MAGIC;
sb.sb->block_size = cpu_to_le16(opts.block_size);
sb.sb->user_uuid = opts.uuid;
sb.sb->nr_devices = nr_devs;
uuid_generate(sb->uuid.b);
uuid_generate(sb.sb->uuid.b);
if (opts.label)
strncpy((char *) sb->label, opts.label, sizeof(sb->label));
strncpy((char *) sb.sb->label, opts.label, sizeof(sb.sb->label));
SET_BCH_SB_CSUM_TYPE(sb, opts.meta_csum_type);
SET_BCH_SB_META_CSUM_TYPE(sb, opts.meta_csum_type);
SET_BCH_SB_DATA_CSUM_TYPE(sb, opts.data_csum_type);
SET_BCH_SB_COMPRESSION_TYPE(sb, opts.compression_type);
SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb, opts.background_compression_type);
SET_BCH_SB_CSUM_TYPE(sb.sb, opts.meta_csum_type);
SET_BCH_SB_META_CSUM_TYPE(sb.sb, opts.meta_csum_type);
SET_BCH_SB_DATA_CSUM_TYPE(sb.sb, opts.data_csum_type);
SET_BCH_SB_COMPRESSION_TYPE(sb.sb, opts.compression_type);
SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb.sb,
opts.background_compression_type);
SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, ilog2(opts.encoded_extent_max));
SET_BCH_SB_BTREE_NODE_SIZE(sb.sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb.sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb.sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb.sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb.sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb.sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb.sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb.sb, BCH_STR_HASH_SIPHASH);
SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,ilog2(opts.encoded_extent_max));
SET_BCH_SB_POSIX_ACL(sb, 1);
SET_BCH_SB_POSIX_ACL(sb.sb, 1);
struct timespec now;
if (clock_gettime(CLOCK_REALTIME, &now))
die("error getting current time: %m");
sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
sb->time_precision = cpu_to_le32(1);
mi = vstruct_end(sb);
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devs) / sizeof(u64);
le32_add_cpu(&sb->u64s, u64s);
le32_add_cpu(&mi->field.u64s, u64s);
mi->field.type = BCH_SB_FIELD_members;
sb.sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
sb.sb->time_precision = cpu_to_le32(1);
/* Member info: */
mi = bch2_sb_resize_members(&sb,
(sizeof(*mi) + sizeof(struct bch_member) *
nr_devs) / sizeof(u64));
for (i = devs; i < devs + nr_devs; i++) {
struct bch_member *m = mi->members + (i - devs);
@ -253,63 +246,38 @@ struct bch_sb *bch2_format(struct format_opts opts,
/* Disk groups */
for (i = devs; i < devs + nr_devs; i++) {
struct bch_member *m = mi->members + (i - devs);
struct bch_disk_group *g;
size_t len;
int idx;
if (!i->group)
continue;
len = min_t(size_t, strlen(i->group) + 1, BCH_SB_LABEL_SIZE);
idx = bch2_disk_path_find_or_create(&sb, i->group);
if (idx < 0)
die("error creating disk path: %s", idx);
if (!gi) {
gi = vstruct_end(sb);
u64s = sizeof(*gi) / sizeof(u64);
le32_add_cpu(&sb->u64s, u64s);
le32_add_cpu(&gi->field.u64s, u64s);
gi->field.type = BCH_SB_FIELD_disk_groups;
}
idx = __bch2_disk_group_find(gi, i->group);
if (idx >= 0) {
g = gi->entries + idx;
} else {
u64s = sizeof(*g) / sizeof(u64);
g = vstruct_end(&gi->field);
le32_add_cpu(&sb->u64s, u64s);
le32_add_cpu(&gi->field.u64s, u64s);
memcpy(g->label, i->group, len);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
}
SET_BCH_MEMBER_GROUP(m, (g - gi->entries) + 1);
SET_BCH_MEMBER_GROUP(m, idx + 1);
}
SET_BCH_SB_FOREGROUND_TARGET(sb,
parse_target(devs, nr_devs, gi, opts.foreground_target));
SET_BCH_SB_BACKGROUND_TARGET(sb,
parse_target(devs, nr_devs, gi, opts.background_target));
SET_BCH_SB_PROMOTE_TARGET(sb,
parse_target(devs, nr_devs, gi, opts.promote_target));
SET_BCH_SB_FOREGROUND_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, opts.foreground_target));
SET_BCH_SB_BACKGROUND_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, opts.background_target));
SET_BCH_SB_PROMOTE_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, opts.promote_target));
/* Crypt: */
if (opts.encrypted) {
struct bch_sb_field_crypt *crypt = vstruct_end(sb);
struct bch_sb_field_crypt *crypt =
bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64));
u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
le32_add_cpu(&sb->u64s, u64s);
crypt->field.u64s = cpu_to_le32(u64s);
crypt->field.type = BCH_SB_FIELD_crypt;
bch_sb_crypt_init(sb, crypt, opts.passphrase);
SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
}
for (i = devs; i < devs + nr_devs; i++) {
sb->dev_idx = i - devs;
sb.sb->dev_idx = i - devs;
init_layout(&sb->layout, opts.block_size,
init_layout(&sb.sb->layout, opts.block_size,
i->sb_offset, i->sb_end);
if (i->sb_offset == BCH_SB_SECTOR) {
@ -319,11 +287,11 @@ struct bch_sb *bch2_format(struct format_opts opts,
xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
}
bch2_super_write(i->fd, sb);
bch2_super_write(i->fd, sb.sb);
close(i->fd);
}
return sb;
return sb.sb;
}
void bch2_super_write(int fd, struct bch_sb *sb)
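Put side by side, the bch2_format() hunks above swap the hand-rolled field appends for the new resize helpers; both halves below are taken from those hunks, only reindented:

    /* before: append at vstruct_end() and account u64s by hand */
    mi = vstruct_end(sb);
    u64s = (sizeof(struct bch_sb_field_members) +
            sizeof(struct bch_member) * nr_devs) / sizeof(u64);
    le32_add_cpu(&sb->u64s, u64s);
    le32_add_cpu(&mi->field.u64s, u64s);
    mi->field.type = BCH_SB_FIELD_members;

    /* after: one call that grows the buffer and sizes/types the field */
    mi = bch2_sb_resize_members(&sb,
            (sizeof(*mi) + sizeof(struct bch_member) * nr_devs) / sizeof(u64));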
@ -553,11 +521,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
struct bch_sb_field_ops {
struct bch_sb_field_toolops {
sb_field_print_fn print;
};
static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
#define x(f, nr) \
[BCH_SB_FIELD_##f] = { \
.print = bch2_sb_print_##f, \

View File

@ -58,11 +58,13 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@ -79,7 +81,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int);
static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
@ -130,8 +132,7 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
static const char *bch2_alloc_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
@ -152,8 +153,8 @@ static const char *bch2_alloc_invalid(const struct bch_fs *c,
return NULL;
}
static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_alloc_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
buf[0] = '\0';
@ -163,11 +164,6 @@ static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_alloc_ops = {
.key_invalid = bch2_alloc_invalid,
.val_to_text = bch2_alloc_to_text,
};
static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
{
unsigned v;
@ -236,9 +232,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
d = a.v->data;
if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
g->prio[READ] = get_alloc_field(&d, 2);
g->io_time[READ] = get_alloc_field(&d, 2);
if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
g->prio[WRITE] = get_alloc_field(&d, 2);
g->io_time[WRITE] = get_alloc_field(&d, 2);
lg_local_unlock(&c->usage_lock);
}
@ -270,21 +266,21 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
mutex_lock(&c->prio_clock[READ].lock);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->prio_clock[READ].lock);
mutex_unlock(&c->bucket_clock[READ].lock);
mutex_lock(&c->prio_clock[WRITE].lock);
mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_min_prio(c, ca, WRITE);
bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->prio_clock[WRITE].lock);
mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0;
}
@ -320,9 +316,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
put_alloc_field(&d, 2, g->prio[READ]);
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->prio[WRITE]);
put_alloc_field(&d, 2, g->io_time[WRITE]);
lg_local_unlock(&c->usage_lock);
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
@ -395,38 +391,34 @@ int bch2_alloc_write(struct bch_fs *c)
/* Bucket IO clocks: */
static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
u16 max_delta = 1;
u16 max_last_io = 0;
unsigned i;
lockdep_assert_held(&c->prio_clock[rw].lock);
lockdep_assert_held(&c->bucket_clock[rw].lock);
/* Determine min prio for this particular device */
/* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
ca->min_prio[rw] = clock->hand - max_delta;
ca->max_last_bucket_io[rw] = max_last_io;
/*
* This may possibly increase the min prio for the whole device, check
* that as well.
*/
max_delta = 1;
/* Recalculate global max_last_io: */
max_last_io = 0;
for_each_member_device(ca, c, i)
max_delta = max(max_delta,
(u16) (clock->hand - ca->min_prio[rw]));
max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
clock->min_prio = clock->hand - max_delta;
clock->max_last_io = max_last_io;
}
static void bch2_rescale_prios(struct bch_fs *c, int rw)
static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
@ -439,10 +431,10 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
g->prio[rw] = clock->hand -
(clock->hand - g->prio[rw]) / 2;
g->io_time[rw] = clock->hand -
bucket_last_io(c, g, rw) / 2;
bch2_recalc_min_prio(c, ca, rw);
bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
@ -450,19 +442,26 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
static void bch2_inc_clock_hand(struct io_timer *timer)
{
struct prio_clock *clock = container_of(timer,
struct prio_clock, rescale);
struct bucket_clock *clock = container_of(timer,
struct bucket_clock, rescale);
struct bch_fs *c = container_of(clock,
struct bch_fs, prio_clock[clock->rw]);
struct bch_fs, bucket_clock[clock->rw]);
struct bch_dev *ca;
u64 capacity;
unsigned i;
mutex_lock(&clock->lock);
clock->hand++;
/* if clock cannot be advanced more, rescale prio */
if (clock->hand == (u16) (clock->min_prio - 1))
bch2_rescale_prios(c, clock->rw);
if (clock->max_last_io >= U16_MAX - 2)
bch2_rescale_bucket_io_times(c, clock->rw);
BUG_ON(clock->max_last_io >= U16_MAX - 2);
for_each_member_device(ca, c, i)
ca->max_last_bucket_io[clock->rw]++;
clock->max_last_io++;
clock->hand++;
mutex_unlock(&clock->lock);
@ -484,9 +483,9 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
static void bch2_prio_timer_init(struct bch_fs *c, int rw)
static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket_clock *clock = &c->bucket_clock[rw];
clock->hand = 1;
clock->rw = rw;
@ -536,7 +535,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
ret = -1;
ret = 1;
break;
}
@ -635,13 +634,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark m)
{
unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
unsigned max_last_io = ca->max_last_bucket_io[READ];
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
unsigned long hotness =
(bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 /
(c->prio_clock[READ].hand - ca->min_prio[READ]);
unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
@ -659,23 +659,25 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
struct alloc_heap_entry l,
struct alloc_heap_entry r)
{
return (l.key > r.key) - (l.key < r.key);
return (l.key > r.key) - (l.key < r.key) ?:
(l.nr < r.nr) - (l.nr > r.nr) ?:
(l.bucket > r.bucket) - (l.bucket < r.bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e;
struct alloc_heap_entry e = { 0 };
size_t b;
ca->alloc_heap.used = 0;
mutex_lock(&c->prio_clock[READ].lock);
mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_oldest_io(c, ca, READ);
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@ -684,30 +686,45 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
unsigned long key = bucket_sort_key(c, ca, b, m);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
e = (struct alloc_heap_entry) {
.bucket = b,
.key = bucket_sort_key(c, ca, b, m)
};
if (e.nr && e.bucket + e.nr == b && e.key == key) {
e.nr++;
} else {
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
e = (struct alloc_heap_entry) {
.bucket = b,
.nr = 1,
.key = key,
};
}
cond_resched();
}
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
up_read(&ca->bucket_lock);
mutex_unlock(&c->prio_clock[READ].lock);
mutex_unlock(&c->bucket_clock[READ].lock);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
/*
* If we run out of buckets to invalidate, bch2_allocator_thread() will
* kick stuff and retry us
*/
while (!fifo_full(&ca->free_inc) &&
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
bch2_invalidate_one_bucket(c, ca, e.bucket);
while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
for (b = e.bucket;
b < e.bucket + e.nr;
b++) {
if (fifo_full(&ca->free_inc))
return;
bch2_invalidate_one_bucket(c, ca, b);
}
}
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@ -729,6 +746,8 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
cond_resched();
}
}
@ -749,6 +768,8 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
cond_resched();
}
}
@ -850,7 +871,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
ret = -1;
ret = 1;
break;
}
@ -880,7 +901,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
ca->mi.bucket_size, GFP_NOIO, 0);
if (push_invalidated_bucket(c, ca, bucket))
return -1;
return 1;
}
return 0;
@ -905,17 +926,32 @@ static int bch2_allocator_thread(void *arg)
while (1) {
while (1) {
cond_resched();
pr_debug("discarding %zu invalidated buckets",
ca->nr_invalidated);
ret = discard_invalidated_buckets(c, ca);
if (ret)
return 0;
goto stop;
if (fifo_empty(&ca->free_inc))
break;
pr_debug("invalidating %zu buckets",
fifo_used(&ca->free_inc));
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
if (ret)
return 0;
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
goto stop;
}
if (!ca->nr_invalidated) {
bch_err(ca, "allocator thread unable to make forward progress!");
goto stop;
}
if (ca->allocator_invalidating_data)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@ -927,22 +963,29 @@ static int bch2_allocator_thread(void *arg)
* journal error - buckets haven't actually been
* invalidated, can't discard them:
*/
if (ret)
return 0;
if (ret) {
bch_err(ca, "journal error: %i", ret);
goto stop;
}
}
pr_debug("free_inc now empty");
/* Reset front/back so we can easily sort fifo entries later: */
ca->free_inc.front = ca->free_inc.back = 0;
ca->allocator_journal_seq_flush = 0;
ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
return 0;
}
while (1) {
size_t prev = fifo_used(&ca->free_inc);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
goto stop;
}
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@ -950,7 +993,14 @@ static int bch2_allocator_thread(void *arg)
* another cache tier
*/
pr_debug("scanning for reclaimable buckets");
find_reclaimable_buckets(c, ca);
pr_debug("found %zu buckets (free_inc %zu/%zu)",
fifo_used(&ca->free_inc) - prev,
fifo_used(&ca->free_inc), ca->free_inc.size);
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@ -977,15 +1027,20 @@ static int bch2_allocator_thread(void *arg)
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
if (wait_buckets_available(c, ca)) {
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
return 0;
goto stop;
}
}
ca->allocator_blocked = false;
up_read(&c->gc_lock);
pr_debug("free_inc now %zu/%zu",
fifo_used(&ca->free_inc),
ca->free_inc.size);
sort_free_inc(c, ca);
/*
@ -993,6 +1048,10 @@ static int bch2_allocator_thread(void *arg)
* write out the new bucket gens:
*/
}
stop:
pr_debug("alloc thread stopping (ret %i)", ret);
return 0;
}
/* Allocation */
@ -1046,8 +1105,8 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
/* _only_ for allocating the journal and btree roots on a brand new fs: */
int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
struct bucket_array *buckets;
ssize_t b;
@ -1056,14 +1115,8 @@ int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
buckets = bucket_array(ca);
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
if (is_available_bucket(buckets->b[b].mark)) {
bch2_mark_alloc_bucket(c, ca, b, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
set_bit(b, ca->buckets_dirty);
if (is_available_bucket(buckets->b[b].mark))
goto success;
}
b = -1;
success:
rcu_read_unlock();
@ -1135,9 +1188,8 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
break;
}
if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) &&
(bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
goto out;
if (cl)
closure_wait(&c->freelist_wait, cl);
spin_unlock(&c->freelist_lock);
@ -1218,7 +1270,7 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
*v = *v < scale ? 0 : *v - scale;
}
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
unsigned nr_replicas,
enum alloc_reserve reserve,
@ -1284,52 +1336,22 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
break;
}
}
rcu_read_unlock();
EBUG_ON(reserve == RESERVE_MOVINGGC &&
ret != ALLOC_SUCCESS &&
ret != OPEN_BUCKETS_EMPTY);
rcu_read_unlock();
return ret;
}
static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
unsigned nr_replicas,
enum alloc_reserve reserve,
struct bch_devs_mask *devs,
struct closure *cl)
{
bool waiting = false;
while (1) {
switch (__bch2_bucket_alloc_set(c, wp, nr_replicas,
reserve, devs, cl)) {
case ALLOC_SUCCESS:
if (waiting)
closure_wake_up(&c->freelist_wait);
return 0;
case NO_DEVICES:
if (waiting)
closure_wake_up(&c->freelist_wait);
return -EROFS;
case FREELIST_EMPTY:
if (!cl)
return -ENOSPC;
if (waiting)
return -EAGAIN;
/* Retry allocation after adding ourself to waitlist: */
closure_wait(&c->freelist_wait, cl);
waiting = true;
break;
case OPEN_BUCKETS_EMPTY:
return cl ? -EAGAIN : -ENOSPC;
default:
BUG();
}
switch (ret) {
case ALLOC_SUCCESS:
return 0;
case NO_DEVICES:
return -EROFS;
case FREELIST_EMPTY:
case OPEN_BUCKETS_EMPTY:
return cl ? -EAGAIN : -ENOSPC;
default:
BUG();
}
}
@ -1530,11 +1552,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_ptrs_have = wp->first_ptr;
/* does writepoint have ptrs we don't want to use? */
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
if (target)
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
ret = open_bucket_add_buckets(c, target, wp, devs_have,
@ -1551,7 +1574,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_replicas, reserve, cl);
}
if (ret)
if (ret && ret != -EROFS)
goto err;
alloc_done:
/* check for more than one cache: */
@ -1584,6 +1607,13 @@ alloc_done:
nr_ptrs_effective += ca->mi.durability;
}
if (ret == -EROFS &&
nr_ptrs_effective >= nr_replicas_required)
ret = 0;
if (ret)
goto err;
if (nr_ptrs_effective > nr_replicas) {
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
@ -1749,14 +1779,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
if (c->capacity) {
bch2_io_timer_add(&c->io_clock[READ],
&c->prio_clock[READ].rescale);
&c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE],
&c->prio_clock[WRITE].rescale);
&c->bucket_clock[WRITE].rescale);
} else {
bch2_io_timer_del(&c->io_clock[READ],
&c->prio_clock[READ].rescale);
&c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE],
&c->prio_clock[WRITE].rescale);
&c->bucket_clock[WRITE].rescale);
}
/* Wake up case someone was waiting for buckets */
@ -1889,7 +1919,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
if (ca->alloc_thread)
return 0;
p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
p = kthread_create(bch2_allocator_thread, ca,
"bch_alloc[%s]", ca->name);
if (IS_ERR(p))
return PTR_ERR(p);
@ -1923,7 +1954,7 @@ static void allocator_start_issue_discards(struct bch_fs *c)
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
size_t bu, i, devs_have_enough = 0;
size_t bu, i;
unsigned dev_iter;
u64 journal_seq = 0;
bool invalidating_data = false;
@ -1964,16 +1995,21 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
devs_have_enough += (fifo_used(&ca->free_inc) >=
ca->free[RESERVE_BTREE].size);
if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
percpu_ref_put(&ca->io_ref);
goto not_enough;
}
if (devs_have_enough >= c->opts.metadata_replicas)
return 0;
return 0;
not_enough:
pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
pr_debug("scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
BUG_ON(!fifo_empty(&ca->free_inc));
ca->free_inc.front = ca->free_inc.back = 0;
@ -1988,6 +2024,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
break;
}
pr_debug("done scanning for reclaimable buckets");
/*
* We're moving buckets to freelists _before_ they've been marked as
* invalidated on disk - we have to so that we can allocate new btree
@ -1997,10 +2035,13 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
* have cached data in them, which is live until they're marked as
* invalidated on disk:
*/
if (invalidating_data)
if (invalidating_data) {
pr_debug("invalidating existing data");
set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
else
} else {
pr_debug("issuing discards");
allocator_start_issue_discards(c);
}
/*
* XXX: it's possible for this to deadlock waiting on journal reclaim,
@ -2017,13 +2058,15 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
}
if (invalidating_data) {
pr_debug("flushing journal");
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret)
return ret;
}
if (invalidating_data)
pr_debug("issuing discards");
allocator_start_issue_discards(c);
}
for_each_rw_member(ca, c, dev_iter)
while (ca->nr_invalidated) {
@ -2038,19 +2081,43 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool flush_updates;
size_t nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
flush_updates = false;
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
rcu_read_unlock();
six_lock_read(&b->lock);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
if (btree_node_may_write(b)) {
rcu_read_unlock();
six_lock_read(&b->lock);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
flush_updates = true;
}
}
rcu_read_unlock();
/*
* This is ugly, but it's needed to flush btree node writes
* without spinning...
*/
if (flush_updates) {
closure_wait_event(&c->btree_interior_update_wait,
bch2_btree_interior_updates_nr_pending(c) <
nr_pending_updates);
goto again;
}
}
return 0;
@ -2087,8 +2154,8 @@ void bch2_fs_allocator_init(struct bch_fs *c)
mutex_init(&c->write_points_hash_lock);
spin_lock_init(&c->freelist_lock);
bch2_prio_timer_init(c, READ);
bch2_prio_timer_init(c, WRITE);
bch2_bucket_clock_init(c, READ);
bch2_bucket_clock_init(c, WRITE);
/* open bucket 0 is a sentinal NULL: */
spin_lock_init(&c->open_buckets[0].lock);
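The reclaim changes read more easily with numbers plugged into the new bucket_sort_key(); a worked example with made-up values:

    /*
     * With ca->max_last_bucket_io[READ] == 1000 ticks:
     *
     *   bucket last read  100 ticks ago:  hotness = (1000 - 100) * 7 / 1000 = 6
     *   bucket last read  900 ticks ago:  hotness = (1000 - 900) * 7 / 1000 = 0
     *
     * Recently-read buckets get a larger key and are kept; cold buckets end up
     * at the front of the reclaim order. find_reclaimable_buckets_lru() now
     * also coalesces runs of adjacent buckets with the same key into a single
     * alloc_heap_entry (e.nr), and bucket_alloc_cmp() breaks ties on run
     * length, so free_inc tends to be filled with contiguous ranges.
     */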

View File

@ -9,6 +9,14 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_alloc_ops (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@ -30,6 +38,8 @@ enum bucket_alloc_ret {
NO_DEVICES = -3, /* -EROFS */
};
long bch2_bucket_alloc_new_fs(struct bch_dev *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
@ -127,6 +137,4 @@ int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
#endif /* _BCACHEFS_ALLOC_H */

View File

@ -8,7 +8,7 @@
#include "fifo.h"
/* There's two of these clocks, one for reads and one for writes: */
struct prio_clock {
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
@ -23,7 +23,7 @@ struct prio_clock {
* consistent.
*/
u16 hand;
u16 min_prio;
u16 max_last_io;
int rw;
@ -80,6 +80,7 @@ struct write_point_specifier {
struct alloc_heap_entry {
size_t bucket;
size_t nr;
unsigned long key;
};
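The prio_clock -> bucket_clock rename above changes what gets tracked: rather than a minimum priority, each clock tracks how stale the oldest bucket is. A short sketch of the relationship; the helper is the one added to buckets.h later in this diff:

    static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
    {
            /* ticks of (read or write) bucket-clock time since this bucket's last IO */
            return c->bucket_clock[rw].hand - g->io_time[rw];
    }

    /*
     * clock->max_last_io and ca->max_last_bucket_io[rw] cache the largest such
     * delta; bch2_inc_clock_hand() bumps them each tick, and once the maximum
     * nears U16_MAX, bch2_rescale_bucket_io_times() halves every bucket's
     * delta so the u16 arithmetic never wraps.
     */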

View File

@ -384,7 +384,7 @@ struct bch_dev {
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
spinlock_t freelist_lock;
unsigned nr_invalidated;
size_t nr_invalidated;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@ -392,7 +392,7 @@ struct bch_dev {
size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 min_prio[2];
u16 max_last_bucket_io[2];
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
@ -431,11 +431,11 @@ struct bch_dev {
*/
enum {
/* startup: */
BCH_FS_BRAND_NEW_FS,
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOCATOR_STARTED,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
/* shutdown: */
BCH_FS_EMERGENCY_RO,
@ -519,8 +519,7 @@ struct bch_fs {
u64 features;
} sb;
struct bch_sb *disk_sb;
unsigned disk_sb_order;
struct bch_sb_handle disk_sb;
unsigned short block_bits; /* ilog2(block_size) */
@ -595,7 +594,7 @@ struct bch_fs {
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct prio_clock prio_clock[2];
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];

View File

@ -955,8 +955,9 @@ struct bch_disk_group {
__le64 flags[2];
};
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;

View File

@ -10,20 +10,20 @@
#include "quota.h"
#include "xattr.h"
const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = &bch2_bkey_extent_ops,
[BKEY_TYPE_INODES] = &bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
const struct bkey_ops bch2_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
[BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
switch (k.k->type) {
case KEY_TYPE_DELETED:
@ -51,7 +51,7 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
@ -100,7 +100,7 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
enum bkey_type type = btree_node_type(b);
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
const char *invalid;
BUG_ON(!k.k->u64s);
@ -141,7 +141,7 @@ int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
switch (k.k->type) {
@ -182,7 +182,7 @@ void bch2_bkey_swab(enum bkey_type type,
const struct bkey_format *f,
struct bkey_packed *k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
bch2_bkey_swab_key(f, k);

View File

@ -81,6 +81,6 @@ int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
extern const struct bkey_ops *bch2_bkey_ops[];
extern const struct bkey_ops bch2_bkey_ops[];
#endif /* _BCACHEFS_BKEY_METHODS_H */
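The same pattern repeats for every key type (alloc.h and dirent.h above, and presumably the remaining per-type headers in the full 59-file diff): the ops move from an extern const struct referenced by pointer to a compound-literal macro, so the dispatch table can hold them by value:

    /* per-type header, e.g. alloc.h above: */
    #define bch2_bkey_alloc_ops (struct bkey_ops) {          \
            .key_invalid    = bch2_alloc_invalid,            \
            .val_to_text    = bch2_alloc_to_text,            \
    }

    /* bkey_methods.c: the table now holds structs, not pointers */
    const struct bkey_ops bch2_bkey_ops[] = {
            [BKEY_TYPE_ALLOC]       = bch2_bkey_alloc_ops,
            /* ... */
    };

    /* lookups switch from bch2_bkey_ops[type] to &bch2_bkey_ops[type] */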

View File

@ -18,6 +18,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include <linux/slab.h>
@ -317,7 +318,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
unsigned i;
u64 b;
lockdep_assert_held(&c->sb_lock);
if (c)
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
@ -331,7 +333,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
spin_lock(&c->journal.lock);
if (c)
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
@ -340,7 +343,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
gc_phase(GC_PHASE_SB), flags);
}
spin_unlock(&c->journal.lock);
if (c)
spin_unlock(&c->journal.lock);
}
static void bch2_mark_superblocks(struct bch_fs *c)
@ -1034,8 +1038,8 @@ static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
int ret;
mutex_lock(&c->sb_lock);
if (!bch2_sb_get_replicas(c->disk_sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb))
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb.sb))
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}

View File

@ -1290,16 +1290,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
struct bkey_s_c k;
k = bch2_btree_iter_peek_slot(iter);
if (btree_iter_err(k))
return k;
}
iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
/*
* XXX: when we just need to relock we should be able to avoid
* calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
* for that to work
*/
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
return bch2_btree_iter_peek_slot(iter);
}
if (!bkey_deleted(&iter->k))
__btree_iter_advance(&iter->l[0]);
@ -1318,6 +1321,8 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
iter->c = c;
iter->pos = pos;
bkey_init(&iter->k);
iter->k.p = pos;
iter->flags = flags;
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
@ -1330,6 +1335,10 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
iter->l[iter->level].b = BTREE_ITER_NOT_END;
iter->next = iter;
if (unlikely((flags & BTREE_ITER_IS_EXTENTS) &&
!bkey_cmp(pos, POS_MAX)))
iter->uptodate = BTREE_ITER_END;
prefetch(c->btree_roots[btree_id].b);
}

View File

@ -231,6 +231,20 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
return __btree_iter_cmp(l->btree_id, l->pos, r);
}
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
*/
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
{
if (need_resched()) {
bch2_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
bch2_btree_iter_unlock(iter);
}
}
#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
_locks_want, _depth, _flags, _b) \
for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
@ -253,6 +267,8 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
unsigned flags)
{
bch2_btree_iter_cond_resched(iter);
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_next_slot(iter)
: bch2_btree_iter_next(iter);
@ -275,18 +291,4 @@ static inline int btree_iter_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
*/
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
{
if (need_resched()) {
bch2_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
bch2_btree_iter_unlock(iter);
}
}
#endif /* _BCACHEFS_BTREE_ITER_H */

View File

@ -299,7 +299,7 @@ static inline enum bkey_type btree_node_type(struct btree *b)
static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
return bch2_bkey_ops[btree_node_type(b)];
return &bch2_bkey_ops[btree_node_type(b)];
}
static inline bool btree_node_has_ptrs(struct btree *b)

View File

@ -13,6 +13,7 @@
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"
#include <linux/random.h>
@ -2116,3 +2117,16 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
return out - buf;
}
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
{
size_t ret = 0;
struct list_head *i;
mutex_lock(&c->btree_interior_update_lock);
list_for_each(i, &c->btree_interior_update_list)
ret++;
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
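This counter exists so the allocator-start path (in the alloc.c hunks above) can flush btree nodes that aren't yet writable without spinning; paraphrasing that hunk:

    size_t nr_pending = bch2_btree_interior_updates_nr_pending(c);

    /* write out whichever dirty nodes btree_node_may_write() allows ... */

    /* ... then, if some were blocked on interior updates, wait for progress: */
    closure_wait_event(&c->btree_interior_update_wait,
                       bch2_btree_interior_updates_nr_pending(c) < nr_pending);
    /* and rescan the btree node cache (the "goto again" above) */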

View File

@ -343,4 +343,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */

View File

@ -443,8 +443,20 @@ split:
* potentially blocks the allocator:
*/
ret = bch2_btree_split_leaf(c, split, trans->flags);
/*
* This can happen when we insert part of an extent - with an update
* with multiple keys, we don't want to redo the entire update - that's
* just too confusing:
*/
if (!ret &&
(trans->flags & BTREE_INSERT_ATOMIC) &&
trans->did_work)
ret = -EINTR;
if (ret)
goto err;
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()

View File

@ -309,7 +309,7 @@ static bool bucket_became_unavailable(struct bch_fs *c,
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
c && c->gc_pos.phase == GC_PHASE_DONE;
(!c || c->gc_pos.phase == GC_PHASE_DONE);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@ -351,12 +351,16 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
{
struct bch_dev_usage *dev_usage;
lockdep_assert_held(&c->usage_lock);
if (c)
lockdep_assert_held(&c->usage_lock);
bch2_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type, c,
if (old.data_type && new.data_type &&
old.data_type != new.data_type) {
BUG_ON(!c);
bch2_fs_inconsistent(c,
"different types of data in same bucket: %u, %u",
old.data_type, new.data_type);
}
dev_usage = this_cpu_ptr(ca->usage_percpu);
@ -466,21 +470,29 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!type);
lg_local_lock(&c->usage_lock);
g = bucket(ca, b);
if (likely(c)) {
lg_local_lock(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
}
}
preempt_disable();
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
new.data_type = type;
}));
lg_local_unlock(&c->usage_lock);
preempt_enable();
if (likely(c))
lg_local_unlock(&c->usage_lock);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
@ -859,9 +871,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bch2_copygc_stop(ca);
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
lg_global_lock(&c->usage_lock);
if (resize) {
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
lg_global_lock(&c->usage_lock);
}
old_buckets = bucket_array(ca);
@ -885,7 +899,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_dirty, buckets_dirty);
lg_global_unlock(&c->usage_lock);
if (resize)
lg_global_unlock(&c->usage_lock);
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
@ -904,8 +919,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
nbuckets = ca->mi.nbuckets;
up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
if (resize) {
up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
if (start_copygc &&
bch2_copygc_start(c, ca))

View File

@ -31,6 +31,7 @@
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets,
!ca->fs ||
lockdep_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
@ -47,7 +48,12 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand;
bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
}
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*

View File

@ -31,12 +31,12 @@ struct bucket_mark {
};
struct bucket {
u16 prio[2];
union {
struct bucket_mark _mark;
const struct bucket_mark mark;
};
u16 io_time[2];
};
struct bucket_array {
@ -85,8 +85,9 @@ struct disk_reservation {
};
struct copygc_heap_entry {
u8 gen;
u32 sectors;
u64 offset;
struct bucket_mark mark;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;

View File

@ -372,6 +372,9 @@ static long bch2_ioctl_usage(struct bch_fs *c,
unsigned i, j;
int ret;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
@ -460,7 +463,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
sb = ca->disk_sb.sb;
} else {
sb = c->disk_sb;
sb = c->disk_sb.sb;
}
if (vstruct_bytes(sb) > arg.size) {
@ -535,13 +538,22 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
/* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_REMOVE:
@ -554,10 +566,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
case BCH_IOCTL_DATA:
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);

View File

@ -569,7 +569,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
if (!bch2_key_is_encrypted(&sb_key))
goto out;
ret = bch2_request_key(c->disk_sb, &user_key);
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
@ -623,7 +623,7 @@ int bch2_disable_encryption(struct bch_fs *c)
mutex_lock(&c->sb_lock);
crypt = bch2_sb_get_crypt(c->disk_sb);
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;
@ -639,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
crypt->key.magic = BCH_KEY_MAGIC;
crypt->key.key = key;
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
@ -657,7 +657,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
mutex_lock(&c->sb_lock);
/* Do we already have an encryption key? */
if (bch2_sb_get_crypt(c->disk_sb))
if (bch2_sb_get_crypt(c->disk_sb.sb))
goto err;
ret = bch2_alloc_ciphers(c);
@ -668,7 +668,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
get_random_bytes(&key.key, sizeof(key.key));
if (keyed) {
ret = bch2_request_key(c->disk_sb, &user_key);
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
@ -685,7 +685,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
if (ret)
goto err;
crypt = bch2_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
goto err;
@ -694,7 +694,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
crypt->key = key;
/* write superblock */
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
@ -728,7 +728,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
goto out;
}
crypt = bch2_sb_get_crypt(c->disk_sb);
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;

View File

@ -117,6 +117,7 @@ static const unsigned bch_crc_bytes[] = {
[BCH_CSUM_CHACHA20_POLY1305_128] = 16,
};
/* returns true if not equal */
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
{
/*

View File

@ -3,7 +3,7 @@
#include "util.h"
#define NR_IO_TIMERS 8
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
/*
* Clocks/timers in units of sectors of IO:

View File

@ -500,7 +500,7 @@ int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
return ret;
}
c->disk_sb->features[0] |= cpu_to_le64(f);
c->disk_sb.sb->features[0] |= cpu_to_le64(f);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

View File

@ -212,17 +212,20 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
return i->ret;
for_each_btree_key(&iter, i->c, i->id, i->from,
BTREE_ITER_PREFETCH, k) {
i->from = iter.pos;
bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
k = bch2_btree_iter_peek(&iter);
while (k.k && !(err = btree_iter_err(k))) {
bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
i->buf, sizeof(i->buf), k);
i->buf, sizeof(i->buf), k);
i->bytes = strlen(i->buf);
BUG_ON(i->bytes >= PAGE_SIZE);
i->buf[i->bytes] = '\n';
i->bytes++;
k = bch2_btree_iter_next(&iter);
i->from = iter.pos;
err = flush_buf(i);
if (err)
break;
@ -230,7 +233,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
break;
}
err = bch2_btree_iter_unlock(&iter) ?: err;
bch2_btree_iter_unlock(&iter);
return err < 0 ? err : i->ret;
}

View File

@ -79,8 +79,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.cmp_bkey = dirent_cmp_bkey,
};
static const char *bch2_dirent_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
unsigned len;
@ -116,8 +115,8 @@ static const char *bch2_dirent_invalid(const struct bch_fs *c,
}
}
static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_dirent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
size_t n = 0;
@ -136,11 +135,6 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_dirent_ops = {
.key_invalid = bch2_dirent_invalid,
.val_to_text = bch2_dirent_to_text,
};
static struct bkey_i_dirent *dirent_create_key(u8 type,
const struct qstr *name, u64 dst)
{

View File

@ -4,7 +4,14 @@
#include "str_hash.h"
extern const struct bch_hash_desc bch2_dirent_hash_desc;
extern const struct bkey_ops bch2_bkey_dirent_ops;
const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_dirent_ops (struct bkey_ops) { \
.key_invalid = bch2_dirent_invalid, \
.val_to_text = bch2_dirent_to_text, \
}
struct qstr;
struct file;

libbcachefs/disk_groups.c (new file, 462 lines added)
View File

@ -0,0 +1,462 @@
#include "bcachefs.h"
#include "disk_groups.h"
#include "super-io.h"
#include <linux/sort.h>
static int group_cmp(const void *_l, const void *_r)
{
const struct bch_disk_group *l = _l;
const struct bch_disk_group *r = _r;
return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
(BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
strncmp(l->label, r->label, sizeof(l->label));
}
const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
struct bch_sb_field_members *mi;
struct bch_member *m;
unsigned i, nr_groups, len;
const char *err = NULL;
mi = bch2_sb_get_members(sb);
groups = bch2_sb_get_disk_groups(sb);
nr_groups = disk_groups_nr(groups);
for (m = mi->members;
m < mi->members + sb->nr_devices;
m++) {
unsigned g;
if (!BCH_MEMBER_GROUP(m))
continue;
g = BCH_MEMBER_GROUP(m) - 1;
if (g >= nr_groups ||
BCH_GROUP_DELETED(&groups->entries[g]))
return "disk has invalid group";
}
if (!nr_groups)
return NULL;
for (g = groups->entries;
g < groups->entries + nr_groups;
g++) {
if (BCH_GROUP_DELETED(g))
continue;
len = strnlen(g->label, sizeof(g->label));
if (!len) {
err = "group with empty label";
goto err;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
return "cannot allocate memory";
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
for (i = 0; i + 1 < nr_groups; i++)
if (!BCH_GROUP_DELETED(sorted + i) &&
!group_cmp(sorted + i, sorted + i + 1)) {
err = "duplicate groups";
goto err;
}
err = NULL;
err:
kfree(sorted);
return err;
}
static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
struct bch_sb *sb,
struct bch_sb_field *f)
{
char *out = buf, *end = buf + size;
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g;
unsigned nr_groups = disk_groups_nr(groups);
for (g = groups->entries;
g < groups->entries + nr_groups;
g++) {
if (g != groups->entries)
out += scnprintf(out, end - out, " ");
if (BCH_GROUP_DELETED(g))
out += scnprintf(out, end - out, "[deleted]");
else
out += scnprintf(out, end - out,
"[parent %llu name %s]",
BCH_GROUP_PARENT(g),
g->label);
}
return out - buf;
}
const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
.validate = bch2_sb_disk_groups_validate,
.to_text = bch2_sb_disk_groups_to_text
};
int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *groups;
struct bch_disk_groups_cpu *cpu_g, *old_g;
unsigned i, g, nr_groups;
lockdep_assert_held(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
nr_groups = disk_groups_nr(groups);
if (!groups)
return 0;
cpu_g = kzalloc(sizeof(*cpu_g) +
sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
if (!cpu_g)
return -ENOMEM;
cpu_g->nr = nr_groups;
for (i = 0; i < nr_groups; i++) {
struct bch_disk_group *src = &groups->entries[i];
struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
struct bch_member *m = mi->members + i;
struct bch_disk_group_cpu *dst =
&cpu_g->entries[BCH_MEMBER_GROUP(m)];
if (!bch2_member_exists(m))
continue;
g = BCH_MEMBER_GROUP(m);
while (g) {
dst = &cpu_g->entries[g - 1];
__set_bit(i, dst->devs.d);
g = dst->parent;
}
}
old_g = c->disk_groups;
rcu_assign_pointer(c->disk_groups, cpu_g);
if (old_g)
kfree_rcu(old_g, rcu);
return 0;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
return ca ? &ca->self : NULL;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
return t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
}
default:
BUG();
}
}
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
unsigned parent,
const char *name, unsigned namelen)
{
unsigned i, nr_groups = disk_groups_nr(groups);
if (!namelen || namelen > BCH_SB_LABEL_SIZE)
return -EINVAL;
for (i = 0; i < nr_groups; i++) {
struct bch_disk_group *g = groups->entries + i;
if (BCH_GROUP_DELETED(g))
continue;
if (!BCH_GROUP_DELETED(g) &&
BCH_GROUP_PARENT(g) == parent &&
strnlen(g->label, sizeof(g->label)) == namelen &&
!memcmp(name, g->label, namelen))
return i;
}
return -1;
}
static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
const char *name, unsigned namelen)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
unsigned i, nr_groups = disk_groups_nr(groups);
struct bch_disk_group *g;
if (!namelen || namelen > BCH_SB_LABEL_SIZE)
return -EINVAL;
for (i = 0;
i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
i++)
;
if (i == nr_groups) {
unsigned u64s =
(sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
groups = bch2_sb_resize_disk_groups(sb, u64s);
if (!groups)
return -ENOSPC;
nr_groups = disk_groups_nr(groups);
}
BUG_ON(i >= nr_groups);
g = &groups->entries[i];
memcpy(g->label, name, namelen);
if (namelen < sizeof(g->label))
g->label[namelen] = '\0';
SET_BCH_GROUP_DELETED(g, 0);
SET_BCH_GROUP_PARENT(g, parent);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
return i;
}
int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
int v = -1;
do {
const char *next = strchrnul(name, '.');
unsigned len = next - name;
if (*next == '.')
next++;
v = __bch2_disk_group_find(groups, v + 1, name, len);
name = next;
} while (*name && v >= 0);
return v;
}
int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups;
unsigned parent = 0;
int v = -1;
do {
const char *next = strchrnul(name, '.');
unsigned len = next - name;
if (*next == '.')
next++;
groups = bch2_sb_get_disk_groups(sb->sb);
v = __bch2_disk_group_find(groups, parent, name, len);
if (v < 0)
v = __bch2_disk_group_add(sb, parent, name, len);
if (v < 0)
return v;
parent = v + 1;
name = next;
} while (*name && v >= 0);
return v;
}
int bch2_disk_path_print(struct bch_sb_handle *sb,
char *buf, size_t len, unsigned v)
{
char *out = buf, *end = out + len;
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
while (1) {
if (nr == ARRAY_SIZE(path))
goto inval;
if (v >= disk_groups_nr(groups))
goto inval;
g = groups->entries + v;
if (BCH_GROUP_DELETED(g))
goto inval;
path[nr++] = v;
if (!BCH_GROUP_PARENT(g))
break;
v = BCH_GROUP_PARENT(g) - 1;
}
while (nr) {
unsigned b = 0;
v = path[--nr];
g = groups->entries + v;
if (end != out)
b = min_t(size_t, end - out,
strnlen(g->label, sizeof(g->label)));
memcpy(out, g->label, b);
if (b < end - out)
out[b] = '\0';
out += b;
if (nr)
out += scnprintf(out, end - out, ".");
}
return out - buf;
inval:
return scnprintf(buf, len, "invalid group %u", v);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int v = -1;
mutex_lock(&c->sb_lock);
if (!strlen(name) || !strcmp(name, "none"))
goto write_sb;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
if (v < 0) {
mutex_unlock(&c->sb_lock);
return v;
}
write_sb:
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
{
struct bch_dev *ca;
int g;
if (!strlen(buf) || !strcmp(buf, "none")) {
*v = 0;
return 0;
}
/* Is it a device? */
ca = bch2_dev_lookup(c, buf);
if (!IS_ERR(ca)) {
*v = dev_to_target(ca->dev_idx);
percpu_ref_put(&ca->ref);
return 0;
}
mutex_lock(&c->sb_lock);
g = bch2_disk_path_find(&c->disk_sb, buf);
mutex_unlock(&c->sb_lock);
if (g >= 0) {
*v = group_to_target(g);
return 0;
}
return -EINVAL;
}
int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
{
struct target t = target_decode(v);
int ret;
switch (t.type) {
case TARGET_NULL:
return scnprintf(buf, len, "none");
case TARGET_DEV: {
struct bch_dev *ca;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
char b[BDEVNAME_SIZE];
ret = scnprintf(buf, len, "/dev/%s",
bdevname(ca->disk_sb.bdev, b));
percpu_ref_put(&ca->io_ref);
} else if (ca) {
ret = scnprintf(buf, len, "offline device %u", t.dev);
} else {
ret = scnprintf(buf, len, "invalid device %u", t.dev);
}
rcu_read_unlock();
break;
}
case TARGET_GROUP:
mutex_lock(&c->sb_lock);
ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
mutex_unlock(&c->sb_lock);
break;
default:
BUG();
}
return ret;
}
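/*
 * Editor's note: bch2_disk_path_find() above resolves dotted labels one
 * component at a time with strchrnul().  A minimal sketch of that split,
 * assuming a GNU libc environment for strchrnul(); the label "ssd.fast"
 * is hypothetical and the snippet is not part of the commit:
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *name = "ssd.fast";

	while (*name) {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		printf("component: %.*s\n", (int) len, name);	/* "ssd", then "fast" */

		name = *next == '.' ? next + 1 : next;
	}
	return 0;
}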

libbcachefs/disk_groups.h (new file, 99 lines)
View File

@ -0,0 +1,99 @@
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
return groups
? (vstruct_end(&groups->field) -
(void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
: 0;
}
struct target {
enum {
TARGET_NULL,
TARGET_DEV,
TARGET_GROUP,
} type;
union {
unsigned dev;
unsigned group;
};
};
#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)
static inline u16 dev_to_target(unsigned dev)
{
return TARGET_DEV_START + dev;
}
static inline u16 group_to_target(unsigned group)
{
return TARGET_GROUP_START + group;
}
static inline struct target target_decode(unsigned target)
{
if (target >= TARGET_GROUP_START)
return (struct target) {
.type = TARGET_GROUP,
.group = target - TARGET_GROUP_START
};
if (target >= TARGET_DEV_START)
return (struct target) {
.type = TARGET_DEV,
.group = target - TARGET_DEV_START
};
return (struct target) { .type = TARGET_NULL };
}
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
return ca->mi.group && ca->mi.group - 1 == t.group;
default:
BUG();
}
}
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
struct bch_sb_field *);
#endif /* _BCACHEFS_DISK_GROUPS_H */
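/*
 * Editor's note: a standalone sketch of the target number space defined in
 * the header above -- 0 means no target, device targets occupy
 * [TARGET_DEV_START, TARGET_GROUP_START) and group targets start at
 * TARGET_GROUP_START.  The two constants are copied from the header; the
 * rest is illustrative and not part of the commit:
 */
#include <assert.h>
#include <stdio.h>

#define TARGET_DEV_START	1
#define TARGET_GROUP_START	(256 + TARGET_DEV_START)

int main(void)
{
	unsigned dev_target   = TARGET_DEV_START + 3;	/* device index 3 */
	unsigned group_target = TARGET_GROUP_START + 0;	/* disk group index 0 */

	/* decode the same way target_decode() does: */
	assert(dev_target >= TARGET_DEV_START && dev_target < TARGET_GROUP_START);
	assert(group_target >= TARGET_GROUP_START);

	printf("target %u -> device %u\n", dev_target, dev_target - TARGET_DEV_START);
	printf("target %u -> group %u\n", group_target, group_target - TARGET_GROUP_START);
	return 0;
}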

View File

@ -14,10 +14,12 @@
#include "checksum.h"
#include "debug.h"
#include "dirent.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "util.h"
@ -25,9 +27,6 @@
#include <trace/events/bcachefs.h>
static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
struct bkey_i *, struct bkey_i *);
static void sort_key_next(struct btree_node_iter_large *iter,
struct btree *b,
struct btree_node_iter_set *i)
@ -160,9 +159,13 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
{
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
if (dev_in_target(c->devs[ptr->dev], target))
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (dev_in_target(ca, target) &&
(!ptr->cached || !ptr_stale(ca, ptr)))
return ptr;
}
return NULL;
}
@ -356,11 +359,25 @@ restart_narrow_pointers:
return true;
}
/* returns true if not equal */
static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
struct bch_extent_crc_unpacked r)
{
return (l.csum_type != r.csum_type ||
l.compression_type != r.compression_type ||
l.compressed_size != r.compressed_size ||
l.uncompressed_size != r.uncompressed_size ||
l.offset != r.offset ||
l.live_size != r.live_size ||
l.nonce != r.nonce ||
bch2_crc_cmp(l.csum, r.csum));
}
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
struct bch_extent_crc_unpacked u, prev_u;
struct bch_extent_crc_unpacked u, prev_u = { 0 };
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
@ -382,7 +399,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto drop;
}
if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) {
/* identical to previous crc entry: */
goto drop;
}
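/*
 * Editor's note: the hunk above replaces a raw memcmp() of two unpacked CRC
 * structs with a field-by-field comparison and zero-initializes prev_u;
 * memcmp() can report a difference on padding bytes even when every field
 * matches.  A minimal illustration -- struct layout invented, not from the
 * commit; typical ABIs insert padding after the char member:
 */
#include <stdio.h>
#include <string.h>

struct padded {
	char	a;	/* compilers typically insert padding after this */
	int	b;
};

int main(void)
{
	struct padded x, y;

	memset(&x, 0xff, sizeof(x));	/* dirty x's padding bytes */
	memset(&y, 0x00, sizeof(y));

	x.a = y.a = 1;
	x.b = y.b = 2;

	printf("fields equal, memcmp says: %s\n",
	       memcmp(&x, &y, sizeof(x)) ? "different" : "equal");
	return 0;
}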
@ -439,13 +456,12 @@ static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
bch2_extent_drop_redundant_crcs(e);
}
static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
struct bkey_s k)
bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
{
return bch2_extent_normalize(c, k);
}
static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
{
switch (k->type) {
case BCH_EXTENT:
@ -628,8 +644,7 @@ use:
/* Btree ptrs */
static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (bkey_extent_is_cached(k.k))
return "cached";
@ -671,8 +686,8 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
}
}
static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
{
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
@ -727,8 +742,8 @@ err:
mark.gen, (unsigned) mark.counter);
}
static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
const char *invalid;
@ -756,13 +771,6 @@ bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
return pick;
}
const struct bkey_ops bch2_bkey_btree_ops = {
.key_invalid = bch2_btree_ptr_invalid,
.key_debugcheck = btree_ptr_debugcheck,
.val_to_text = bch2_btree_ptr_to_text,
.swab = bch2_ptr_swab,
};
/* Extents */
static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
@ -1436,7 +1444,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
}
static enum btree_insert_ret
bch2_delete_fixup_extent(struct extent_insert_state *s)
__bch2_delete_fixup_extent(struct extent_insert_state *s)
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
@ -1450,8 +1458,7 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
s->whiteout = *insert;
s->do_journal = false;
s->whiteout = *insert;
while (bkey_cmp(s->committed, insert->k.p) < 0 &&
(ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
@ -1474,12 +1481,12 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
overlap = bch2_extent_overlap(&insert->k, k.k);
ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
if (ret != BTREE_INSERT_OK)
goto stop;
if (ret)
break;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
goto stop;
break;
s->do_journal = true;
@ -1520,25 +1527,65 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s->committed, insert->k.p) < 0)
ret = extent_insert_advance_pos(s, bkey_s_c_null);
stop:
extent_insert_committed(s);
return ret;
}
bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
gc_pos_btree_node(b));
static enum btree_insert_ret
__bch2_insert_fixup_extent(struct extent_insert_state *s)
{
struct btree_iter *iter = s->insert->iter;
struct btree_iter_level *l = &iter->l[0];
struct btree *b = l->b;
struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
struct bkey_i *insert = s->insert->k;
enum btree_insert_ret ret = BTREE_INSERT_OK;
EBUG_ON(bkey_cmp(iter->pos, s->committed));
EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
!!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
while (bkey_cmp(s->committed, insert->k.p) < 0 &&
(ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
(_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
struct bset_tree *t = bch2_bkey_to_bset(b, _k);
struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
enum bch_extent_overlap overlap;
bch2_cut_front(iter->pos, insert);
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
break;
EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
overlap = bch2_extent_overlap(&insert->k, k.k);
ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
if (ret)
break;
if (!k.k->size)
goto squash;
/*
* Only call advance pos & call hook for nonzero size extents:
*/
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
break;
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
insert->k.needs_whiteout = true;
if (overlap == BCH_EXTENT_OVERLAP_ALL &&
bkey_whiteout(k.k) &&
k.k->needs_whiteout) {
unreserve_whiteout(b, t, _k);
_k->needs_whiteout = false;
}
squash:
ret = extent_squash(s, insert, t, _k, k, overlap);
if (ret != BTREE_INSERT_OK)
break;
}
return ret;
}
@ -1590,9 +1637,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
struct btree_iter *iter = insert->iter;
struct btree_iter_level *l = &iter->l[0];
struct btree *b = l->b;
struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
enum btree_insert_ret ret = BTREE_INSERT_OK;
struct extent_insert_state s = {
@ -1605,9 +1649,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
EBUG_ON(iter->level);
EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
if (s.deleting)
return bch2_delete_fixup_extent(&s);
/*
* As we process overlapping extents, we advance @iter->pos both to
* signal to our caller (btree_insert_key()) how much of @insert->k has
@ -1616,67 +1657,32 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
*/
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
if (!s.deleting &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
bkey_start_offset(&insert->k->k),
insert->k->k.size);
while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
(ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
(_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
struct bset_tree *t = bch2_bkey_to_bset(b, _k);
struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
enum bch_extent_overlap overlap;
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
break;
overlap = bch2_extent_overlap(&insert->k->k, k.k);
ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
if (ret != BTREE_INSERT_OK)
goto stop;
if (!k.k->size)
goto squash;
/*
* Only call advance pos & call hook for nonzero size extents:
*/
ret = extent_insert_advance_pos(&s, k.s_c);
if (ret != BTREE_INSERT_OK)
goto stop;
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
insert->k->k.needs_whiteout = true;
if (overlap == BCH_EXTENT_OVERLAP_ALL &&
bkey_whiteout(k.k) &&
k.k->needs_whiteout) {
unreserve_whiteout(b, t, _k);
_k->needs_whiteout = false;
}
squash:
ret = extent_squash(&s, insert->k, t, _k, k, overlap);
if (ret != BTREE_INSERT_OK)
goto stop;
}
ret = !s.deleting
? __bch2_insert_fixup_extent(&s)
: __bch2_delete_fixup_extent(&s);
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s.committed, insert->k->k.p) < 0)
ret = extent_insert_advance_pos(&s, bkey_s_c_null);
stop:
extent_insert_committed(&s);
if (s.deleting)
bch2_cut_front(iter->pos, insert->k);
/*
* Subtract any remaining sectors from @insert, if we bailed out early
* and didn't fully insert @insert:
*/
if (insert->k->k.size &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
if (!s.deleting &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
insert->k->k.size)
bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
bkey_start_offset(&insert->k->k),
insert->k->k.size);
@ -1692,13 +1698,13 @@ stop:
if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
"ret %u insert->k.size %u", ret, insert->k->k.size);
return ret;
}
static const char *bch2_extent_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
return "value too big";
@ -1865,8 +1871,7 @@ bad_ptr:
return;
}
static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_EXTENT:
@ -1880,8 +1885,8 @@ static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
}
}
static void bch2_extent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_extent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
const char *invalid;
@ -1963,7 +1968,7 @@ void bch2_extent_crc_append(struct bkey_i_extent *e,
extent_for_each_crc(extent_i_to_s(e), crc, i)
;
if (!memcmp(&crc, &new, sizeof(crc)))
if (!bch2_crc_unpacked_cmp(crc, new))
return;
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
@ -2089,9 +2094,8 @@ void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
}
}
static enum merge_result bch2_extent_merge(struct bch_fs *c,
struct btree *bk,
struct bkey_i *l, struct bkey_i *r)
enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
struct bkey_i *l, struct bkey_i *r)
{
struct bkey_s_extent el, er;
union bch_extent_entry *en_l, *en_r;
@ -2410,13 +2414,3 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
return ret;
}
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,
.val_to_text = bch2_extent_to_text,
.swab = bch2_ptr_swab,
.key_normalize = bch2_ptr_normalize,
.key_merge = bch2_extent_merge,
.is_extents = true,
};

View File

@ -15,6 +15,36 @@ struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;
const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
struct bkey_s_c);
void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
#define bch2_bkey_btree_ops (struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
.key_debugcheck = bch2_btree_ptr_debugcheck, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
}
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
struct bkey_i *, struct bkey_i *);
#define bch2_bkey_extent_ops (struct bkey_ops) { \
.key_invalid = bch2_extent_invalid, \
.key_debugcheck = bch2_extent_debugcheck, \
.val_to_text = bch2_extent_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_ptr_normalize, \
.key_merge = bch2_extent_merge, \
.is_extents = true, \
}
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
struct btree *,
struct btree_node_iter_large *);
@ -23,9 +53,6 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
extern const struct bkey_ops bch2_bkey_btree_ops;
extern const struct bkey_ops bch2_bkey_extent_ops;
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid);

View File

@ -468,7 +468,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
}
BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
BUG_ON(!ret != !k->k.size);
if (WARN_ONCE(!ret != !k->k.size,
"ret %i k->size %u", ret, k->k.size))
ret = k->k.size ? -EINTR : 0;
err:
if (ret == -EINTR)
continue;

View File

@ -175,8 +175,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
return 0;
}
static const char *bch2_inode_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.offset)
return "nonzero offset";
@ -224,8 +223,8 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
}
}
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
@ -247,11 +246,6 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_inode_ops = {
.key_invalid = bch2_inode_invalid,
.val_to_text = bch2_inode_to_text,
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)

View File

@ -5,7 +5,13 @@
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_inode_ops (struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
}
struct bch_inode_unpacked {
u64 bi_inum;

View File

@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "tier.h"
@ -196,8 +197,6 @@ static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
BUG_ON(!(op->flags & BCH_WRITE_DONE));
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
@ -205,7 +204,6 @@ static void bch2_write_done(struct closure *cl)
bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
closure_return(cl);
}
@ -232,9 +230,8 @@ int bch2_write_index_default(struct bch_write_op *op)
/**
* bch_write_index - after a write, update index to point to new data
*/
static void bch2_write_index(struct closure *cl)
static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
@ -242,8 +239,6 @@ static void bch2_write_index(struct closure *cl)
struct bkey_i *src, *dst = keys->keys, *n, *k;
int ret;
op->flags |= BCH_WRITE_LOOPED;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bkey_copy(dst, src);
@ -292,9 +287,19 @@ static void bch2_write_index(struct closure *cl)
}
out:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
return;
err:
keys->top = keys->keys;
op->error = ret;
goto out;
}
if (!(op->flags & BCH_WRITE_DONE))
continue_at(cl, __bch2_write, op->io_wq);
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
__bch2_write_index(op);
if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
bch2_journal_flush_seq_async(&c->journal,
@ -304,12 +309,6 @@ out:
} else {
continue_at_nobarrier(cl, bch2_write_done, NULL);
}
return;
err:
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
goto out;
}
static void bch2_write_endio(struct bio *bio)
@ -730,18 +729,18 @@ static void __bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
struct write_point *wp;
int ret;
again:
do {
/* +1 for possible cache device: */
if (op->open_buckets_nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets))
continue_at(cl, bch2_write_index, index_update_wq(op));
goto flush_io;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch2_write_index, index_update_wq(op));
goto flush_io;
wp = bch2_alloc_sectors_start(c,
op->target,
@ -760,33 +759,7 @@ static void __bch2_write(struct closure *cl)
goto err;
}
/*
* If we already have some keys, must insert them first
* before allocating another open bucket. We only hit
* this case if open_bucket_nr > 1.
*/
if (!bch2_keylist_empty(&op->insert_keys))
continue_at(cl, bch2_write_index,
index_update_wq(op));
/*
* If we've looped, we're running out of a workqueue -
* not the bch2_write() caller's context - and we don't
* want to block the workqueue:
*/
if (op->flags & BCH_WRITE_LOOPED)
continue_at(cl, __bch2_write, op->io_wq);
/*
* Otherwise, we do want to block the caller on alloc
* failure instead of letting it queue up more and more
* writes:
* XXX: this technically needs a try_to_freeze() -
* except that that's not safe because caller may have
* issued other IO... hmm..
*/
closure_sync(cl);
continue;
goto flush_io;
}
ret = bch2_write_extent(op, wp);
@ -802,28 +775,24 @@ static void __bch2_write(struct closure *cl)
goto err;
} while (ret);
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
err:
/*
* Right now we can only error here if we went RO - the
* allocation failed, but we already checked for -ENOSPC when we
* got our reservation.
*
* XXX capacity might have changed, but we don't check for that
* yet:
*/
op->error = ret;
op->flags |= BCH_WRITE_DONE;
/*
* No reason not to insert keys for whatever data was successfully
* written (especially for a cmpxchg operation that's moving data
* around)
*/
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
? bch2_write_index
: bch2_write_done, index_update_wq(op));
flush_io:
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
if (op->error)
continue_at_nobarrier(cl, bch2_write_done, NULL);
}
goto again;
}
/**
@ -969,7 +938,7 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
if (percpu_ref_is_dying(&c->writes))
return false;
return bch2_extent_has_target(c, e, target);
return bch2_extent_has_target(c, e, target) == NULL;
}
/* Read */

View File

@ -36,8 +36,6 @@ enum bch_write_flags {
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
BCH_WRITE_DONE = (1 << 10),
BCH_WRITE_LOOPED = (1 << 11),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)

View File

@ -19,6 +19,7 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"
#include "vstructs.h"
@ -1582,40 +1583,19 @@ err:
return ret;
}
/*
* Allocate more journal space at runtime - not currently making use if it, but
* the code works:
*/
static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bool new_fs, struct closure *cl)
{
struct journal *j = &c->journal;
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets;
struct disk_reservation disk_res = { 0, 0 };
struct closure cl;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
int ret = 0;
closure_init_stack(&cl);
/* don't handle reducing nr of buckets yet: */
if (nr <= ja->nr)
return 0;
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
* we don't need the disk reservation to avoid the BUG_ON() in buckets.c
* when space used goes up without a reservation - but we do need the
* reservation to ensure we'll actually be able to allocate:
*/
if (bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0))
return -ENOSPC;
mutex_lock(&c->sb_lock);
ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
@ -1627,29 +1607,41 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
if (!journal_buckets)
goto err;
spin_lock(&j->lock);
if (c)
spin_lock(&c->journal.lock);
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
spin_unlock(&j->lock);
if (c)
spin_unlock(&c->journal.lock);
while (ja->nr < nr) {
struct open_bucket *ob;
size_t bucket;
int ob_idx;
struct open_bucket *ob = NULL;
long bucket;
ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
if (ob_idx < 0) {
if (!closure_wait(&c->freelist_wait, &cl))
closure_sync(&cl);
continue;
if (new_fs) {
bucket = bch2_bucket_alloc_new_fs(ca);
if (bucket < 0) {
ret = -ENOSPC;
goto err;
}
} else {
int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
if (ob_idx < 0) {
ret = cl ? -EAGAIN : -ENOSPC;
goto err;
}
ob = c->open_buckets + ob_idx;
bucket = sector_to_bucket(ca, ob->ptr.offset);
}
ob = c->open_buckets + ob_idx;
bucket = sector_to_bucket(ca, ob->ptr.offset);
if (c)
spin_lock(&c->journal.lock);
spin_lock(&j->lock);
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
@ -1664,34 +1656,77 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
ja->last_idx++;
}
ja->nr++;
spin_unlock(&j->lock);
if (c)
spin_unlock(&c->journal.lock);
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB), 0);
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
new_fs
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
bch2_open_bucket_put(c, ob);
if (!new_fs)
bch2_open_bucket_put(c, ob);
}
bch2_write_super(c);
ret = 0;
err:
mutex_unlock(&c->sb_lock);
kfree(new_bucket_seq);
kfree(new_buckets);
bch2_disk_reservation_put(c, &disk_res);
if (!ret)
bch2_dev_allocator_add(c, ca);
closure_sync(&cl);
return ret;
}
int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
/*
* Allocate more journal space at runtime - not currently making use if it, but
* the code works:
*/
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
{
struct journal_device *ja = &ca->journal;
struct closure cl;
unsigned current_nr;
int ret;
closure_init_stack(&cl);
do {
struct disk_reservation disk_res = { 0, 0 };
closure_sync(&cl);
mutex_lock(&c->sb_lock);
current_nr = ja->nr;
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
* we don't need the disk reservation to avoid the BUG_ON() in buckets.c
* when space used goes up without a reservation - but we do need the
* reservation to ensure we'll actually be able to allocate:
*/
if (bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
bch2_disk_reservation_put(c, &disk_res);
if (ja->nr != current_nr)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
} while (ret == -EAGAIN);
return ret;
}
int bch2_dev_journal_alloc(struct bch_dev *ca)
{
unsigned nr;
@ -1707,7 +1742,7 @@ int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
min(1 << 10,
(1 << 20) / ca->mi.bucket_size));
return bch2_set_nr_journal_buckets(c, ca, nr);
return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
}
/* Journalling */
@ -2320,8 +2355,8 @@ static void journal_write(struct closure *cl)
journal_write_compact(jset);
jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = cpu_to_le32(BCACHE_JSET_VERSION);

View File

@ -400,7 +400,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *);
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);

View File

@ -11,6 +11,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,

View File

@ -6,6 +6,7 @@
#include "inode.h"
#include "io.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "keylist.h"

View File

@ -9,6 +9,7 @@
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
@ -51,7 +52,7 @@ static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
return (l.sectors > r.sectors) - (l.sectors < r.sectors);
}
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
@ -78,7 +79,7 @@ static bool __copygc_pred(struct bch_dev *ca,
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
ptr->gen == h->data[i].mark.gen);
ptr->gen == h->data[i].gen);
}
return false;
@ -154,8 +155,9 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
continue;
e = (struct copygc_heap_entry) {
.offset = bucket_to_sector(ca, b),
.mark = m
.gen = m.gen,
.sectors = bucket_sectors_used(m),
.offset = bucket_to_sector(ca, b),
};
heap_add_or_replace(h, e, -sectors_used_cmp);
}
@ -163,11 +165,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
up_read(&c->gc_lock);
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += bucket_sectors_used(i->mark);
sectors_to_move += i->sectors;
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
sectors_to_move -= bucket_sectors_used(e.mark);
sectors_to_move -= e.sectors;
}
buckets_to_move = h->used;
@ -191,7 +193,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
size_t b = sector_to_bucket(ca, i->offset);
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
if (i->gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
@ -284,7 +286,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
t = kthread_create(bch2_copygc_thread, ca,
"bch_copygc[%s]", ca->name);
if (IS_ERR(t))
return PTR_ERR(t);

View File

@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include "bcachefs.h"
#include "disk_groups.h"
#include "opts.h"
#include "super-io.h"
#include "util.h"

View File

@ -4,7 +4,22 @@
#include "quota.h"
#include "super-io.h"
static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
if (vstruct_bytes(&q->field) != sizeof(*q))
return "invalid field quota: wrong size";
return NULL;
}
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.validate = bch2_sb_validate_quota,
};
const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_quota dq;
@ -30,8 +45,8 @@ static const char * const bch2_quota_counters[] = {
"inodes",
};
static void bch2_quota_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_quota_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end= buf + size;
struct bkey_s_c_quota dq;
@ -50,11 +65,6 @@ static void bch2_quota_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_quota_ops = {
.key_invalid = bch2_quota_invalid,
.val_to_text = bch2_quota_to_text,
};
#ifdef CONFIG_BCACHEFS_QUOTA
#include <linux/cred.h>
@ -399,7 +409,7 @@ static void bch2_sb_quota_read(struct bch_fs *c)
struct bch_sb_field_quota *sb_quota;
unsigned i, j;
sb_quota = bch2_sb_get_quota(c->disk_sb);
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota)
return;
@ -476,13 +486,13 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb, true);
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -499,13 +509,13 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb, false);
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb, false);
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb, false);
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -616,9 +626,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
q = &c->quotas[type];
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_quota(c->disk_sb);
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota) {
sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64));
sb_quota = bch2_sb_resize_quota(&c->disk_sb,
sizeof(*sb_quota) / sizeof(u64));
if (!sb_quota)
return -ENOSPC;
}

View File

@ -1,9 +1,18 @@
#ifndef _BCACHEFS_QUOTA_H
#define _BCACHEFS_QUOTA_H
#include "inode.h"
#include "quota_types.h"
extern const struct bkey_ops bch2_bkey_quota_ops;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_quota_ops (struct bkey_ops) { \
.key_invalid = bch2_quota_invalid, \
.val_to_text = bch2_quota_to_text, \
}
enum quota_acct_mode {
BCH_QUOTA_PREALLOC,

libbcachefs/replicas.c (new file, 698 lines)
View File

@ -0,0 +1,698 @@
#include "bcachefs.h"
#include "replicas.h"
#include "super-io.h"
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Replicas tracking - in memory: */
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}
static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
e->devs[dev >> 3] |= 1 << (dev & 7);
}
static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
return (r->entry_size -
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
char *buf, size_t size)
{
char *out = buf, *end = out + size;
struct bch_replicas_cpu_entry *e;
bool first = true;
unsigned i;
for_each_cpu_replicas_entry(r, e) {
bool first_e = true;
if (!first)
out += scnprintf(out, end - out, " ");
first = false;
out += scnprintf(out, end - out, "%u: [", e->data_type);
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i)) {
if (!first_e)
out += scnprintf(out, end - out, " ");
first_e = false;
out += scnprintf(out, end - out, "%u", i);
}
out += scnprintf(out, end - out, "]");
}
return out - buf;
}
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
nr++;
}
return nr;
}
static inline void devlist_to_replicas(struct bch_devs_list devs,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
unsigned i;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
for (i = 0; i < devs.nr; i++) {
*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
replicas_set_dev(r, devs.devs[i]);
}
}
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, old->entry_size);
nr = old->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return NULL;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(old, i),
min(new->entry_size, old->entry_size));
memcpy(cpu_replicas_entry(new, old->nr),
&new_entry,
new->entry_size);
bch2_cpu_replicas_sort(new);
return new;
}
static bool replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
int ret = -ENOMEM;
mutex_lock(&c->sb_lock);
old_gc = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
if (!new_gc)
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
if (!replicas_has_entry(old_r, new_entry, max_dev)) {
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
if (!new_r)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
if (ret)
goto err;
}
/* allocations done, now commit: */
if (new_r)
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
if (new_gc) {
rcu_assign_pointer(c->replicas_gc, new_gc);
kfree_rcu(old_gc, rcu);
}
if (new_r) {
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
}
mutex_unlock(&c->sb_lock);
return 0;
err:
mutex_unlock(&c->sb_lock);
if (new_gc)
kfree(new_gc);
if (new_r)
kfree(new_r);
return ret;
}
int bch2_mark_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
struct bch_replicas_cpu *r, *gc_r;
unsigned max_dev;
bool marked;
if (!devs.nr)
return 0;
BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
marked = replicas_has_entry(r, search, max_dev) &&
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
return likely(marked) ? 0
: bch2_mark_replicas_slowpath(c, search, max_dev);
}
int bch2_mark_bkey_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bkey_s_c k)
{
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
int ret;
for (i = 0; i < cached.nr; i++)
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i]))))
return ret;
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_replicas_cpu *new_r, *old_r;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(new_r, rcu);
goto err;
}
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *dst, *src;
struct bch_replicas_cpu_entry *e;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!dst) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
dst->nr = 0;
dst->entry_size = src->entry_size;
for_each_cpu_replicas_entry(src, e)
if (!((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(dst, dst->nr++),
e, dst->entry_size);
bch2_cpu_replicas_sort(dst);
rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Replicas tracking - superblock: */
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
unsigned *max_dev)
{
struct bch_replicas_entry *i;
unsigned j;
*nr = 0;
*bytes = sizeof(*r);
*max_dev = 0;
if (!r)
return;
for_each_replicas_entry(r, i) {
for (j = 0; j < i->nr; j++)
*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
(*nr)++;
}
*bytes = (void *) i - (void *) r;
}
static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
struct bch_replicas_cpu *cpu_r;
unsigned i, nr, bytes, max_dev, entry_size;
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!cpu_r)
return NULL;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
if (nr) {
struct bch_replicas_cpu_entry *dst =
cpu_replicas_entry(cpu_r, 0);
struct bch_replicas_entry *src = sb_r->entries;
while (dst < cpu_replicas_entry(cpu_r, nr)) {
dst->data_type = src->data_type;
for (i = 0; i < src->nr; i++)
replicas_set_dev(dst, src->devs[i]);
src = replicas_entry_next(src);
dst = (void *) dst + entry_size;
}
}
bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
return 0;
}
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *sb_e;
struct bch_replicas_cpu_entry *e;
size_t i, bytes;
bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, e) {
bytes += sizeof(struct bch_replicas_entry);
for (i = 0; i < r->entry_size - 1; i++)
bytes += hweight8(e->devs[i]);
}
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r)
return -ENOSPC;
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
sb_e = sb_r->entries;
for_each_cpu_replicas_entry(r, e) {
sb_e->data_type = e->data_type;
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i))
sb_e->devs[sb_e->nr++] = i;
sb_e = replicas_entry_next(sb_e);
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
}
return 0;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: no devices";
if (!e->nr)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
.validate = bch2_sb_validate_replicas,
};
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
char *out = buf, *end = out + size;
struct bch_replicas_entry *e;
bool first = true;
unsigned i;
if (!r) {
out += scnprintf(out, end - out, "(no replicas section found)");
return out - buf;
}
for_each_replicas_entry(r, e) {
if (!first)
out += scnprintf(out, end - out, " ");
first = false;
out += scnprintf(out, end - out, "%u: [", e->data_type);
for (i = 0; i < e->nr; i++)
out += scnprintf(out, end - out,
i ? " %u" : "%u", e->devs[i]);
out += scnprintf(out, end - out, "]");
}
return out - buf;
}
/* Query replicas: */
bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
bool ret;
if (!devs.nr)
return true;
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
ret = replicas_has_entry(rcu_dereference(c->replicas),
search, max_dev);
rcu_read_unlock();
return ret;
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bkey_s_c k)
{
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
for (i = 0; i < cached.nr; i++)
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i])))
return false;
return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
struct replicas_status ret;
memset(&ret, 0, sizeof(ret));
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
mi = bch2_sb_get_members(c->disk_sb.sb);
rcu_read_lock();
r = rcu_dereference(c->replicas);
dev_slots = replicas_dev_slots(r);
for_each_cpu_replicas_entry(r, e) {
if (e->data_type >= ARRAY_SIZE(ret.replicas))
panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
for (dev = 0; dev < dev_slots; dev++) {
if (!replicas_test_dev(e, dev))
continue;
BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
if (test_bit(dev, online_devs.d))
nr_online++;
else
nr_offline++;
}
ret.replicas[e->data_type].nr_online =
min(ret.replicas[e->data_type].nr_online,
nr_online);
ret.replicas[e->data_type].nr_offline =
max(ret.replicas[e->data_type].nr_offline,
nr_offline);
}
rcu_read_unlock();
return ret;
}
struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
return __bch2_replicas_status(c, bch2_online_devs(c));
}
static bool have_enough_devs(struct replicas_status s,
enum bch_data_type type,
bool force_if_degraded,
bool force_if_lost)
{
return (!s.replicas[type].nr_offline || force_if_degraded) &&
(s.replicas[type].nr_online || force_if_lost);
}
bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
return (have_enough_devs(s, BCH_DATA_JOURNAL,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
have_enough_devs(s, BCH_DATA_BTREE,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
have_enough_devs(s, BCH_DATA_USER,
flags & BCH_FORCE_IF_DATA_DEGRADED,
flags & BCH_FORCE_IF_DATA_LOST));
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
struct replicas_status s = bch2_replicas_status(c);
return meta
? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_BTREE].nr_online)
: s.replicas[BCH_DATA_USER].nr_online;
}
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx))
ret |= 1 << e->data_type;
out:
rcu_read_unlock();
return ret;
}
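/*
 * Editor's note: replicas_set_dev()/replicas_test_dev() above pack one bit
 * per device index, eight devices per byte.  A standalone sketch of the same
 * packing, not part of the commit:
 */
#include <assert.h>
#include <stdint.h>

static void set_dev(uint8_t *devs, unsigned dev)
{
	devs[dev >> 3] |= 1 << (dev & 7);
}

static int test_dev(const uint8_t *devs, unsigned dev)
{
	return (devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

int main(void)
{
	uint8_t devs[4] = { 0 };	/* slots for 32 device indexes */

	set_dev(devs, 0);
	set_dev(devs, 9);		/* byte 1, bit 1 */

	assert(test_dev(devs, 0));
	assert(test_dev(devs, 9));
	assert(!test_dev(devs, 10));
	return 0;
}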

libbcachefs/replicas.h (new file, 51 lines)
View File

@ -0,0 +1,51 @@
#ifndef _BCACHEFS_REPLICAS_H
#define _BCACHEFS_REPLICAS_H
bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
struct replicas_status {
struct {
unsigned nr_online;
unsigned nr_offline;
} replicas[BCH_DATA_NR];
};
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
bool bch2_have_enough_devs(struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
#endif /* _BCACHEFS_REPLICAS_H */

File diff suppressed because it is too large

View File

@ -11,8 +11,6 @@
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
enum bch_sb_field_type, unsigned);
struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *,
enum bch_sb_field_type, unsigned);
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)
@ -30,13 +28,6 @@ bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \
{ \
return field_to_type(bch2_sb_field_resize(sb, \
BCH_SB_FIELD_##_name, u64s), _name); \
} \
\
static inline struct bch_sb_field_##_name * \
bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \
{ \
return field_to_type(bch2_fs_sb_field_resize(c, \
BCH_SB_FIELD_##_name, u64s), _name); \
}
BCH_SB_FIELDS()
@ -44,6 +35,12 @@ BCH_SB_FIELDS()
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
size_t (*to_text)(char *, size_t, struct bch_sb *,
struct bch_sb_field *);
};
static inline bool bch2_sb_test_feature(struct bch_sb *sb,
enum bch_sb_features f)
{
@ -90,7 +87,7 @@ int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_super_realloc(struct bch_sb_handle *, unsigned);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
const char *bch2_sb_validate(struct bch_sb_handle *);
@ -139,135 +136,4 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
/* BCH_SB_FIELD_replicas: */
bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
struct replicas_status {
struct {
unsigned nr_online;
unsigned nr_offline;
} replicas[BCH_DATA_NR];
};
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
bool bch2_have_enough_devs(struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
/* disk groups: */
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
return groups
? (vstruct_end(&groups->field) -
(void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
: 0;
}
struct target {
enum {
TARGET_NULL,
TARGET_DEV,
TARGET_GROUP,
} type;
union {
unsigned dev;
unsigned group;
};
};
#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)
static inline u16 dev_to_target(unsigned dev)
{
return TARGET_DEV_START + dev;
}
static inline u16 group_to_target(unsigned group)
{
return TARGET_GROUP_START + group;
}
static inline struct target target_decode(unsigned target)
{
if (target >= TARGET_GROUP_START)
return (struct target) {
.type = TARGET_GROUP,
.group = target - TARGET_GROUP_START
};
if (target >= TARGET_DEV_START)
return (struct target) {
.type = TARGET_DEV,
.group = target - TARGET_DEV_START
};
return (struct target) { .type = TARGET_NULL };
}
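A worked example of the target encoding above, with device targets occupying [TARGET_DEV_START, TARGET_GROUP_START) and group targets following (values chosen only to illustrate the arithmetic):

	dev_to_target(0)   == 1      /* TARGET_DEV_START + 0   */
	group_to_target(0) == 257    /* TARGET_GROUP_START + 0 */
	target_decode(5)   == (struct target) { .type = TARGET_DEV,   .dev   = 4 }
	target_decode(260) == (struct target) { .type = TARGET_GROUP, .group = 3 }
	target_decode(0)   == (struct target) { .type = TARGET_NULL }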
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
return ca->mi.group && ca->mi.group - 1 == t.group;
default:
BUG();
}
}
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
#endif /* _BCACHEFS_SUPER_IO_H */

View File

@ -18,6 +18,7 @@
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
@ -30,6 +31,7 @@
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@ -122,7 +124,7 @@ static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
lockdep_assert_held(&bch_fs_list_lock);
list_for_each_entry(c, &bch_fs_list, list)
if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)))
return c;
return NULL;
@ -203,23 +205,12 @@ static void bch_fs_mark_clean(struct bch_fs *c)
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
}
static bool btree_interior_updates_done(struct bch_fs *c)
{
bool ret;
mutex_lock(&c->btree_interior_update_lock);
ret = list_empty(&c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
@ -251,7 +242,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
btree_interior_updates_done(c));
!bch2_btree_interior_updates_nr_pending(c));
if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_verify_flushed(c);
@ -433,7 +424,8 @@ static void bch2_fs_free(struct bch_fs *c)
if (c->wq)
destroy_workqueue(c->wq);
free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
free_pages((unsigned long) c->disk_sb.sb,
c->disk_sb.page_order);
kvpfree(c, sizeof(*c));
module_put(THIS_MODULE);
}
@ -501,11 +493,54 @@ void bch2_fs_stop(struct bch_fs *c)
kobject_put(&c->kobj);
}
static const char *bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
const char *err = NULL;
unsigned i;
int ret;
lockdep_assert_held(&bch_fs_list_lock);
if (!list_empty(&c->list))
return NULL;
if (__bch2_uuid_to_fs(c->sb.uuid))
return "filesystem UUID already open";
ret = bch2_fs_chardev_init(c);
if (ret)
return "error creating character device";
bch2_fs_debug_init(c);
if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch2_opts_create_sysfs_files(&c->opts_dir))
return "error creating sysfs objects";
mutex_lock(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
if (bch2_dev_sysfs_online(c, ca))
goto err;
list_add(&c->list, &bch_fs_list);
err = NULL;
err:
mutex_unlock(&c->state_lock);
return err;
}
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
unsigned i, iter_size;
const char *err;
pr_verbose_init(opts, "");
@ -516,6 +551,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
__module_get(THIS_MODULE);
c->minor = -1;
c->disk_sb.fs_sb = true;
mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
@ -627,9 +663,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
bch2_dev_alloc(c, i))
goto err;
@ -644,6 +680,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
mutex_lock(&bch_fs_list_lock);
err = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
if (err) {
bch_err(c, "bch2_fs_online() error: %s", err);
goto err;
}
out:
pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
return c;
@ -653,60 +697,7 @@ err:
goto out;
}
static const char *__bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
const char *err = NULL;
unsigned i;
int ret;
lockdep_assert_held(&bch_fs_list_lock);
if (!list_empty(&c->list))
return NULL;
if (__bch2_uuid_to_fs(c->sb.uuid))
return "filesystem UUID already open";
ret = bch2_fs_chardev_init(c);
if (ret)
return "error creating character device";
bch2_fs_debug_init(c);
if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch2_opts_create_sysfs_files(&c->opts_dir))
return "error creating sysfs objects";
mutex_lock(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
if (bch2_dev_sysfs_online(c, ca))
goto err;
list_add(&c->list, &bch_fs_list);
err = NULL;
err:
mutex_unlock(&c->state_lock);
return err;
}
static const char *bch2_fs_online(struct bch_fs *c)
{
const char *err;
mutex_lock(&bch_fs_list_lock);
err = __bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
return err;
}
static const char *__bch2_fs_start(struct bch_fs *c)
const char *bch2_fs_start(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
@ -730,15 +721,15 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (BCH_SB_INITIALIZED(c->disk_sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
@ -824,21 +815,18 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
err = "unable to allocate journal buckets";
for_each_rw_member(ca, c, i)
if (bch2_dev_journal_alloc(c, ca)) {
for_each_online_member(ca, c, i)
if (bch2_dev_journal_alloc(ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
@ -889,18 +877,20 @@ recovery_done:
}
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
now = ktime_get_seconds();
for_each_member_device(ca, c, i)
mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
SET_BCH_SB_INITIALIZED(c->disk_sb, true);
SET_BCH_SB_CLEAN(c->disk_sb, false);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_STARTED, &c->flags);
err = NULL;
out:
mutex_unlock(&c->state_lock);
@ -939,11 +929,6 @@ fsck_err:
goto out;
}
const char *bch2_fs_start(struct bch_fs *c)
{
return __bch2_fs_start(c) ?: bch2_fs_online(c);
}
static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
struct bch_sb_field_members *sb_mi;
@ -956,7 +941,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
return "mismatched block size";
if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
return "new cache bucket size is too small";
return NULL;
@ -1082,28 +1067,19 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
return 0;
}
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
struct bch_member *member)
{
struct bch_member *member;
struct bch_dev *ca = NULL;
int ret = 0;
pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
goto err;
struct bch_dev *ca;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto err;
return NULL;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
init_rwsem(&ca->bucket_lock);
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
@ -1113,14 +1089,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
if (bch2_fs_init_fault("dev_alloc"))
goto err;
member = bch2_sb_get_members(c->disk_sb)->members + dev_idx;
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
@ -1132,11 +1102,43 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
return ca;
err:
bch2_dev_free(ca);
return NULL;
}
static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
unsigned dev_idx)
{
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
if (bch2_dev_sysfs_online(c, ca))
pr_warn("error creating sysfs objects");
}
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member *member =
bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
struct bch_dev *ca = NULL;
int ret = 0;
pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
goto err;
ca = __bch2_dev_alloc(c, member);
if (!ca)
goto err;
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
@ -1147,21 +1149,9 @@ err:
goto out;
}
static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
struct bch_dev *ca;
int ret;
lockdep_assert_held(&c->state_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
ca = bch_dev_locked(c, sb->sb->dev_idx);
unsigned ret;
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
@ -1179,7 +1169,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(c, "device too small");
bch_err(ca, "device too small");
return -EINVAL;
}
@ -1187,35 +1177,50 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
/*
* Increase journal write timeout if flushes to this device are
* expensive:
*/
if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
journal_flushes_device(ca))
c->journal.write_delay_ms =
max(c->journal.write_delay_ms, 1000U);
/* Commit: */
ca->disk_sb = *sb;
if (sb->mode & FMODE_EXCL)
ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
if (ca->fs)
mutex_lock(&ca->fs->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->fs)
mutex_unlock(&ca->fs->sb_lock);
percpu_ref_reinit(&ca->io_ref);
return 0;
}
static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
{
struct bch_dev *ca;
int ret;
lockdep_assert_held(&c->state_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
ca = bch_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb);
if (ret)
return ret;
if (c->sb.nr_devices == 1)
bdevname(ca->disk_sb.bdev, c->name);
bdevname(ca->disk_sb.bdev, ca->name);
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
mutex_unlock(&c->sb_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
rebalance_wakeup(c);
percpu_ref_reinit(&ca->io_ref);
return 0;
}
@ -1289,10 +1294,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
if (!c->opts.degraded) {
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->disk_sb->nr_devices; i++) {
if (!bch2_dev_exists(c->disk_sb, mi, i))
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
continue;
ca = bch_dev_locked(c, i);
@ -1360,7 +1365,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_dev_state[new_state]);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -1470,7 +1475,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch2_write_super(c);
@ -1492,8 +1497,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_sb_handle sb;
const char *err;
struct bch_dev *ca = NULL;
struct bch_sb_field_members *mi, *dev_mi;
struct bch_member saved_mi;
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
int ret;
@ -1505,24 +1510,52 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (err)
return -EINVAL;
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
err = bch2_dev_may_add(sb.sb, c);
if (err)
return -EINVAL;
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
bch2_free_super(&sb);
return -ENOMEM;
}
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
return ret;
}
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
if (ret)
goto err;
mutex_lock(&c->state_lock);
mutex_lock(&c->sb_lock);
/* Grab member info for new disk: */
dev_mi = bch2_sb_get_members(sb.sb);
saved_mi = dev_mi->members[sb.sb->dev_idx];
saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
err = "insufficient space in new superblock";
ret = bch2_sb_from_fs(c, ca);
if (ret)
goto err_unlock;
mi = bch2_sb_get_members(ca->disk_sb.sb);
if (!bch2_sb_resize_members(&ca->disk_sb,
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
ret = -ENOSPC;
goto err_unlock;
}
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
goto have_slot;
no_slot:
err = "no slots available in superblock";
@ -1533,64 +1566,47 @@ have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
err = "no space in superblock for member info";
ret = -ENOSPC;
dev_mi = bch2_sb_resize_members(&sb, u64s);
if (!dev_mi)
goto err_unlock;
mi = bch2_fs_sb_resize_members(c, u64s);
mi = bch2_sb_resize_members(&c->disk_sb, u64s);
if (!mi)
goto err_unlock;
memcpy(dev_mi, mi, u64s * sizeof(u64));
dev_mi->members[dev_idx] = saved_mi;
/* success: */
sb.sb->uuid = c->disk_sb->uuid;
sb.sb->dev_idx = dev_idx;
sb.sb->nr_devices = nr_devices;
mi->members[dev_idx] = dev_mi;
mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds());
c->disk_sb.sb->nr_devices = nr_devices;
/* commit new member info */
memcpy(mi, dev_mi, u64s * sizeof(u64));
c->disk_sb->nr_devices = nr_devices;
c->sb.nr_devices = nr_devices;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (bch2_dev_alloc(c, dev_idx)) {
err = "cannot allocate memory";
ret = -ENOMEM;
goto err;
}
if (__bch2_dev_online(c, &sb)) {
err = "bch2_dev_online() error";
ret = -ENOMEM;
goto err;
}
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
goto err;
err = "journal alloc failed";
if (bch2_dev_journal_alloc(c, ca))
goto err;
goto err_late;
}
mutex_unlock(&c->state_lock);
return 0;
err_unlock:
mutex_unlock(&c->sb_lock);
err:
mutex_unlock(&c->state_lock);
err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
return ret ?: -EINVAL;
return ret;
err_late:
bch_err(c, "Error going rw after adding device: %s", err);
return -EINVAL;
}
/* Hot add existing device to running filesystem: */
@ -1613,12 +1629,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
err = bch2_dev_in_fs(c->disk_sb, sb.sb);
err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
if (err)
goto err;
if (__bch2_dev_online(c, &sb)) {
err = "__bch2_dev_online() error";
if (bch2_dev_attach_bdev(c, &sb)) {
err = "bch2_dev_attach_bdev() error";
goto err;
}
@ -1688,7 +1704,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
mi->nbuckets = cpu_to_le64(nbuckets);
bch2_write_super(c);
@ -1721,74 +1737,6 @@ found:
return ca;
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label)
{
struct bch_sb_field_disk_groups *groups;
struct bch_disk_group *g;
struct bch_member *mi;
unsigned i, v, nr_groups;
int ret;
if (strlen(label) > BCH_SB_LABEL_SIZE)
return -EINVAL;
mutex_lock(&c->sb_lock);
groups = bch2_sb_get_disk_groups(c->disk_sb);
nr_groups = disk_groups_nr(groups);
if (!strcmp(label, "none")) {
v = 0;
goto write_sb;
}
ret = __bch2_disk_group_find(groups, label);
if (ret >= 0) {
v = ret + 1;
goto write_sb;
}
/* not found - create a new disk group: */
for (i = 0;
i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
i++)
;
if (i == nr_groups) {
unsigned u64s =
(sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
groups = bch2_fs_sb_resize_disk_groups(c, u64s);
if (!groups) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
nr_groups = disk_groups_nr(groups);
}
BUG_ON(i >= nr_groups);
g = &groups->entries[i];
v = i + 1;
memcpy(g->label, label, strlen(label));
if (strlen(label) < sizeof(g->label))
g->label[strlen(label)] = '\0';
SET_BCH_GROUP_DELETED(g, 0);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
write_sb:
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Filesystem open: */
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
@ -1845,7 +1793,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
err = "bch2_dev_online() error";
mutex_lock(&c->state_lock);
for (i = 0; i < nr_devices; i++)
if (__bch2_dev_online(c, &sb[i])) {
if (bch2_dev_attach_bdev(c, &sb[i])) {
mutex_unlock(&c->state_lock);
goto err_print;
}
@ -1856,15 +1804,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err_print;
if (!c->opts.nostart) {
err = __bch2_fs_start(c);
err = bch2_fs_start(c);
if (err)
goto err_print;
}
err = bch2_fs_online(c);
if (err)
goto err_print;
out:
kfree(sb);
module_put(THIS_MODULE);
@ -1900,7 +1843,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
if (c) {
closure_get(&c->cl);
err = bch2_dev_in_fs(c->disk_sb, sb->sb);
err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
if (err)
goto err;
} else {
@ -1915,22 +1858,18 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
err = "bch2_dev_online() error";
mutex_lock(&c->sb_lock);
if (__bch2_dev_online(c, sb)) {
if (bch2_dev_attach_bdev(c, sb)) {
mutex_unlock(&c->sb_lock);
goto err;
}
mutex_unlock(&c->sb_lock);
if (!c->opts.nostart && bch2_fs_may_start(c)) {
err = __bch2_fs_start(c);
err = bch2_fs_start(c);
if (err)
goto err;
}
err = __bch2_fs_online(c);
if (err)
goto err;
closure_put(&c->cl);
mutex_unlock(&bch_fs_list_lock);

View File

@ -195,7 +195,6 @@ int bch2_dev_online(struct bch_fs *, const char *);
int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);

View File

@ -7,6 +7,9 @@ struct bch_sb_handle {
struct bio *bio;
unsigned page_order;
fmode_t mode;
unsigned have_layout:1;
unsigned have_bio:1;
unsigned fs_sb:1;
};
struct bch_devs_mask {
@ -44,8 +47,9 @@ struct bch_replicas_cpu {
};
struct bch_disk_group_cpu {
struct bch_devs_mask devs;
bool deleted;
u16 parent;
struct bch_devs_mask devs;
};
struct bch_disk_groups_cpu {

View File

@ -18,11 +18,13 @@
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "disk_groups.h"
#include "inode.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "opts.h"
#include "replicas.h"
#include "super-io.h"
#include "tier.h"
@ -140,10 +142,10 @@ read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(iostats);
read_attribute(read_priority_stats);
read_attribute(write_priority_stats);
read_attribute(fragmentation_stats);
read_attribute(oldest_gen_stats);
read_attribute(last_read_quantiles);
read_attribute(last_write_quantiles);
read_attribute(fragmentation_quantiles);
read_attribute(oldest_gen_quantiles);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@ -167,7 +169,7 @@ rw_attribute(journal_reclaim_delay_ms);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
rw_attribute(group);
rw_attribute(label);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
@ -546,7 +548,7 @@ STORE(bch2_fs_opts_dir)
if (opt->set_sb != SET_NO_SB_OPT) {
mutex_lock(&c->sb_lock);
opt->set_sb(c->disk_sb, v);
opt->set_sb(c->disk_sb.sb, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
@ -621,36 +623,41 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *);
typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
size_t, void *);
static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b,
void *private)
static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
struct bucket *g = bucket(ca, b);
int rw = (private ? 1 : 0);
return ca->fs->prio_clock[rw].hand - g->prio[rw];
return bucket_last_io(c, bucket(ca, b), rw);
}
static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b,
void *private)
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
struct bucket *g = bucket(ca, b);
return bucket_sectors_used(g->mark);
}
static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b,
void *private)
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
return bucket_gc_gen(ca, b);
}
static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
bucket_map_fn *fn, void *private)
static int unsigned_cmp(const void *_l, const void *_r)
{
int cmp(const void *l, const void *r)
{ return *((unsigned *) r) - *((unsigned *) l); }
unsigned l = *((unsigned *) _l);
unsigned r = *((unsigned *) _r);
return (l > r) - (l < r);
}
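unsigned_cmp() uses the (l > r) - (l < r) idiom, which always returns -1, 0 or 1 and cannot misbehave the way subtracting two unsigneds can when their difference exceeds INT_MAX. A standalone userspace sketch of the same comparator (purely illustrative, using qsort() rather than the kernel's sort()):

	#include <stdio.h>
	#include <stdlib.h>

	static int unsigned_cmp(const void *_l, const void *_r)
	{
		unsigned l = *((const unsigned *) _l);
		unsigned r = *((const unsigned *) _r);

		return (l > r) - (l < r);
	}

	int main(void)
	{
		unsigned q[] = { 7, 3000000000u, 0, 42 };
		unsigned i;

		qsort(q, sizeof(q) / sizeof(q[0]), sizeof(q[0]), unsigned_cmp);

		for (i = 0; i < sizeof(q) / sizeof(q[0]); i++)
			printf("%u\n", q[i]);	/* 0, 7, 42, 3000000000 */
		return 0;
	}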
static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
char *buf, bucket_map_fn *fn, void *private)
{
size_t i, n;
/* Compute 31 quantiles */
unsigned q[31], *p;
@ -666,9 +673,9 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
}
for (i = ca->mi.first_bucket; i < n; i++)
p[i] = fn(ca, i, private);
p[i] = fn(c, ca, i, private);
sort(p, n, sizeof(unsigned), cmp, NULL);
sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
up_read(&ca->bucket_lock);
while (n &&
@ -804,24 +811,18 @@ SHOW(bch2_dev)
sysfs_print(durability, ca->mi.durability);
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_group) {
struct bch_sb_field_disk_groups *groups;
struct bch_disk_group *g;
unsigned len;
if (attr == &sysfs_label) {
if (ca->mi.group) {
mutex_lock(&c->sb_lock);
out += bch2_disk_path_print(&c->disk_sb, out, end - out,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
} else {
out += scnprintf(out, end - out, "none");
}
if (!ca->mi.group)
return scnprintf(out, end - out, "none\n");
mutex_lock(&c->sb_lock);
groups = bch2_sb_get_disk_groups(c->disk_sb);
g = &groups->entries[ca->mi.group - 1];
len = strnlen(g->label, sizeof(g->label));
memcpy(buf, g->label, len);
mutex_unlock(&c->sb_lock);
buf[len++] = '\n';
return len;
out += scnprintf(out, end - out, "\n");
return out - buf;
}
if (attr == &sysfs_has_data) {
@ -852,14 +853,16 @@ SHOW(bch2_dev)
if (attr == &sysfs_iostats)
return show_dev_iostats(ca, buf);
if (attr == &sysfs_read_priority_stats)
return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
if (attr == &sysfs_write_priority_stats)
return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
if (attr == &sysfs_fragmentation_stats)
return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
if (attr == &sysfs_oldest_gen_stats)
return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_last_read_quantiles)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
if (attr == &sysfs_last_write_quantiles)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
if (attr == &sysfs_fragmentation_quantiles)
return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
if (attr == &sysfs_oldest_gen_quantiles)
return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_reserve_stats)
return show_reserve_stats(ca, buf);
if (attr == &sysfs_alloc_debug)
@ -880,7 +883,7 @@ STORE(bch2_dev)
bool v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
if (v != BCH_MEMBER_DISCARD(mi)) {
SET_BCH_MEMBER_DISCARD(mi, v);
@ -896,7 +899,7 @@ STORE(bch2_dev)
return v;
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
SET_BCH_MEMBER_REPLACEMENT(mi, v);
@ -905,7 +908,7 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
if (attr == &sysfs_group) {
if (attr == &sysfs_label) {
char *tmp;
int ret;
@ -938,16 +941,16 @@ struct attribute *bch2_dev_files[] = {
&sysfs_discard,
&sysfs_cache_replacement_policy,
&sysfs_state_rw,
&sysfs_group,
&sysfs_label,
&sysfs_has_data,
&sysfs_iostats,
/* alloc info - other stats: */
&sysfs_read_priority_stats,
&sysfs_write_priority_stats,
&sysfs_fragmentation_stats,
&sysfs_oldest_gen_stats,
&sysfs_last_read_quantiles,
&sysfs_last_write_quantiles,
&sysfs_fragmentation_quantiles,
&sysfs_oldest_gen_quantiles,
&sysfs_reserve_stats,
/* debug: */

View File

@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"

View File

@ -86,8 +86,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
static const char *bch2_xattr_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
@ -126,8 +125,8 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c,
}
}
static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_xattr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
@ -159,11 +158,6 @@ static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_xattr_ops = {
.key_invalid = bch2_xattr_invalid,
.val_to_text = bch2_xattr_to_text,
};
int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
const char *name, void *buffer, size_t size, int type)
{

View File

@ -4,7 +4,14 @@
#include "str_hash.h"
extern const struct bch_hash_desc bch2_xattr_hash_desc;
extern const struct bkey_ops bch2_bkey_xattr_ops;
const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_xattr_ops (struct bkey_ops) { \
.key_invalid = bch2_xattr_invalid, \
.val_to_text = bch2_xattr_to_text, \
}
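Since bch2_bkey_xattr_ops is now a compound-literal macro rather than an extern object, each use expands to a struct bkey_ops value. A minimal, assumed usage (c and k stand for a struct bch_fs * and a struct bkey_s_c the caller already has; not taken from this patch):

	struct bkey_ops ops = bch2_bkey_xattr_ops;
	const char *invalid = ops.key_invalid(c, k);

	if (invalid)
		pr_err("invalid xattr key: %s", invalid);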
struct dentry;
struct xattr_handler;