Update bcachefs sources to e57b5958cf bcachefs: fix for building in userspace

Kent Overstreet 2017-12-13 16:01:18 -05:00
parent f2feceddae
commit ea83a3985d
50 changed files with 3422 additions and 3284 deletions

View File

@ -1 +1 @@
192d759a491f50d92c89c2e842639d2307c815a5
e57b5958cf4e8530d26f7c36a6e1427fb284cc70

View File

@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c,
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
bch2_write_op_init(&op, c, res, NULL, 0,
bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);

View File

@ -98,23 +98,6 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
DECLARE_EVENT_CLASS(page_alloc_fail,
TP_PROTO(struct bch_fs *c, u64 size),
TP_ARGS(c, size),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, size )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->size = size;
),
TP_printk("%pU size %llu", __entry->uuid, __entry->size)
);
/* io.c: */
DEFINE_EVENT(bio, read_split,
@ -137,34 +120,6 @@ DEFINE_EVENT(bio, promote,
TP_ARGS(bio)
);
TRACE_EVENT(write_throttle,
TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay),
TP_ARGS(c, inode, bio, delay),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, inode )
__field(sector_t, sector )
__field(unsigned int, nr_sector )
__array(char, rwbs, 6 )
__field(u64, delay )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->inode = inode;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
__entry->delay = delay;
),
TP_printk("%pU inode %llu %s %llu + %u delay %llu",
__entry->uuid, __entry->inode,
__entry->rwbs, (unsigned long long)__entry->sector,
__entry->nr_sector, __entry->delay)
);
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,
@ -439,16 +394,6 @@ TRACE_EVENT(alloc_batch,
__entry->uuid, __entry->free, __entry->total)
);
DEFINE_EVENT(bch_dev, prio_write_start,
TP_PROTO(struct bch_dev *ca),
TP_ARGS(ca)
);
DEFINE_EVENT(bch_dev, prio_write_end,
TP_PROTO(struct bch_dev *ca),
TP_ARGS(ca)
);
TRACE_EVENT(invalidate,
TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
TP_ARGS(ca, offset, sectors),
@ -502,151 +447,29 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
TP_ARGS(ca, reserve)
);
TRACE_EVENT(freelist_empty_fail,
TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve,
struct closure *cl),
TP_ARGS(c, reserve, cl),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(enum alloc_reserve, reserve )
__field(struct closure *, cl )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->reserve = reserve;
__entry->cl = cl;
),
TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve,
__entry->cl)
);
DECLARE_EVENT_CLASS(open_bucket_alloc,
TP_PROTO(struct bch_fs *c, struct closure *cl),
TP_ARGS(c, cl),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(struct closure *, cl )
),
TP_fast_assign(
memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->cl = cl;
),
TP_printk("%pU cl %p",
__entry->uuid, __entry->cl)
);
DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc,
TP_PROTO(struct bch_fs *c, struct closure *cl),
TP_ARGS(c, cl)
);
DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail,
TP_PROTO(struct bch_fs *c, struct closure *cl),
TP_ARGS(c, cl)
DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
TP_ARGS(ca, reserve)
);
/* Moving IO */
DECLARE_EVENT_CLASS(moving_io,
TP_PROTO(struct bkey *k),
TP_ARGS(k),
TP_STRUCT__entry(
__field(__u32, inode )
__field(__u64, offset )
__field(__u32, sectors )
),
TP_fast_assign(
__entry->inode = k->p.inode;
__entry->offset = k->p.offset;
__entry->sectors = k->size;
),
TP_printk("%u:%llu sectors %u",
__entry->inode, __entry->offset, __entry->sectors)
);
DEFINE_EVENT(moving_io, move_read,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(moving_io, move_read_done,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(moving_io, move_write,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(moving_io, copy_collision,
TP_PROTO(struct bkey *k),
TP_ARGS(k)
);
/* Copy GC */
DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail,
TP_PROTO(struct bch_fs *c, u64 size),
TP_ARGS(c, size)
);
DEFINE_EVENT(bch_dev, moving_gc_start,
TP_PROTO(struct bch_dev *ca),
TP_ARGS(ca)
);
TRACE_EVENT(moving_gc_end,
TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved,
u64 buckets_moved),
TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, sectors_moved )
__field(u64, keys_moved )
__field(u64, buckets_moved )
),
TP_fast_assign(
memcpy(__entry->uuid, ca->uuid.b, 16);
__entry->sectors_moved = sectors_moved;
__entry->keys_moved = keys_moved;
__entry->buckets_moved = buckets_moved;
),
TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu",
__entry->uuid, __entry->sectors_moved, __entry->keys_moved,
__entry->buckets_moved)
);
DEFINE_EVENT(bkey, gc_copy,
DEFINE_EVENT(bkey, move_extent,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
/* Tiering */
DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail,
TP_PROTO(struct bch_fs *c, u64 size),
TP_ARGS(c, size)
DEFINE_EVENT(bkey, move_alloc_fail,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
DEFINE_EVENT(bch_fs, tiering_start,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
DEFINE_EVENT(bkey, move_race,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
TRACE_EVENT(tiering_end,
TRACE_EVENT(move_data,
TP_PROTO(struct bch_fs *c, u64 sectors_moved,
u64 keys_moved),
TP_ARGS(c, sectors_moved, keys_moved),
@ -667,9 +490,34 @@ TRACE_EVENT(tiering_end,
__entry->uuid, __entry->sectors_moved, __entry->keys_moved)
);
DEFINE_EVENT(bkey, tiering_copy,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
TRACE_EVENT(copygc,
TP_PROTO(struct bch_dev *ca,
u64 sectors_moved, u64 sectors_not_moved,
u64 buckets_moved, u64 buckets_not_moved),
TP_ARGS(ca,
sectors_moved, sectors_not_moved,
buckets_moved, buckets_not_moved),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, sectors_moved )
__field(u64, sectors_not_moved )
__field(u64, buckets_moved )
__field(u64, buckets_not_moved )
),
TP_fast_assign(
memcpy(__entry->uuid, ca->uuid.b, 16);
__entry->sectors_moved = sectors_moved;
__entry->sectors_not_moved = sectors_not_moved;
__entry->buckets_moved = buckets_moved;
__entry->buckets_not_moved = buckets_moved;
),
TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
__entry->uuid,
__entry->sectors_moved, __entry->sectors_not_moved,
__entry->buckets_moved, __entry->buckets_not_moved)
);
#endif /* _TRACE_BCACHE_H */

File diff suppressed because it is too large

View File

@ -8,7 +8,7 @@ struct bkey;
struct bucket;
struct bch_dev;
struct bch_fs;
struct dev_group;
struct bch_devs_list;
struct dev_alloc_list {
unsigned nr;
@ -24,33 +24,61 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
enum bucket_alloc_ret {
ALLOC_SUCCESS = 0,
OPEN_BUCKETS_EMPTY = -1,
FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
NO_DEVICES = -3, /* -EROFS */
};
void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
if (atomic_dec_and_test(&ob->pin))
__bch2_open_bucket_put(c, ob);
}
static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
unsigned i;
for (i = 0; i < *nr; i++)
bch2_open_bucket_put(c, c->open_buckets + refs[i]);
*nr = 0;
}
static inline void bch2_open_bucket_get(struct bch_fs *c,
struct write_point *wp,
u8 *nr, u8 *refs)
{
unsigned i;
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
atomic_inc(&ob->pin);
refs[(*nr)++] = ob - c->open_buckets;
}
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
enum bch_data_type,
struct bch_devs_mask *,
unsigned long,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
unsigned, struct open_bucket *, unsigned);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
struct open_bucket *bch2_alloc_sectors(struct bch_fs *,
enum bch_data_type,
struct bch_devs_mask *,
unsigned long,
struct bkey_i_extent *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
struct task_struct *p;
@ -61,10 +89,20 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_unlock();
}
#define open_bucket_for_each_ptr(_ob, _ptr) \
for ((_ptr) = (_ob)->ptrs; \
(_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
(_ptr)++)
#define writepoint_for_each_ptr(_wp, _ob, _i) \
for ((_i) = 0; \
(_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
(_i)++)
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
return (struct write_point_specifier) { .v = v | 1 };
}
static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
return (struct write_point_specifier) { .v = (unsigned long) wp };
}
void bch2_recalc_capacity(struct bch_fs *);
@ -74,6 +112,13 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
static inline void writepoint_init(struct write_point *wp,
enum bch_data_type type)
{
mutex_init(&wp->lock);
wp->type = type;
}
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
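
The new interface replaces the old all-in-one bch2_alloc_sectors() with a start/append/done sequence keyed by a struct write_point_specifier: writepoint_hashed() tags an arbitrary value (forcing the low bit on), while writepoint_ptr() wraps a specific write point such as c->btree_write_point; the write_data() hunk near the top now passes writepoint_hashed(0) into bch2_write_op_init(). Below is a minimal userspace sketch of just the specifier encoding, reusing the two helpers shown in this header; the dummy write_point type and the low-bit interpretation are assumptions made for the sketch, not bcachefs code.

/* Illustrative sketch only - mirrors the two helpers shown above. */
#include <stdio.h>

struct write_point { int dummy; };

struct write_point_specifier {
        unsigned long v;
};

static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
        return (struct write_point_specifier) { .v = v | 1 };
}

static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
        return (struct write_point_specifier) { .v = (unsigned long) wp };
}

int main(void)
{
        static struct write_point btree_write_point;

        /*
         * Assumption: write_point structs are always at least 2-byte aligned,
         * so the low bit can distinguish hashed specifiers (bit set) from
         * direct write_point pointers (bit clear).
         */
        struct write_point_specifier a = writepoint_hashed(0);
        struct write_point_specifier b = writepoint_ptr(&btree_write_point);

        printf("hashed(0): v=%#lx low bit=%lu\n", a.v, a.v & 1);
        printf("ptr(wp):   v=%#lx low bit=%lu\n", b.v, b.v & 1);
        return 0;
}

Presumably the hashed form is what indexes the per-filesystem write_points_hash table declared in the bch_fs struct elsewhere in this commit, while the pointer form selects a dedicated write point (btree, copygc, tiering) directly.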

View File

@ -47,19 +47,14 @@ enum alloc_reserve {
#define OPEN_BUCKETS_COUNT 256
#define WRITE_POINT_COUNT 32
struct open_bucket_ptr {
struct bch_extent_ptr ptr;
unsigned sectors_free;
};
struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
u8 new_ob;
u8 nr_ptrs;
struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2];
bool valid;
bool on_partial_list;
unsigned sectors_free;
struct bch_extent_ptr ptr;
};
struct write_point {
@ -69,13 +64,23 @@ struct write_point {
unsigned long write_point;
enum bch_data_type type;
u8 nr_ptrs;
/*
* number of pointers in @ob we can't use, because we already had
* pointers to those devices:
*/
u8 nr_ptrs_can_use;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
struct open_bucket *ob;
struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
struct write_point_specifier {
unsigned long v;
};
struct alloc_heap_entry {
size_t bucket;
unsigned long key;

View File

@ -251,9 +251,6 @@ do { \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
BCH_DEBUG_PARAM(version_stress_test, \
"Assigns random version numbers to newly written " \
"extents, to test overlapping extent cases") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
@ -310,8 +307,9 @@ struct crypto_blkcipher;
struct crypto_ahash;
enum gc_phase {
GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
GC_PHASE_SB = BTREE_ID_NR + 1,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
@ -321,30 +319,6 @@ struct gc_pos {
unsigned level;
};
struct bch_member_cpu {
u64 nbuckets; /* device size */
u16 first_bucket; /* index of first bucket used */
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
u8 replacement;
u8 discard;
u8 data_allowed;
u8 valid;
};
struct bch_replicas_cpu_entry {
u8 data_type;
u8 devs[BCH_SB_MEMBERS_MAX / 8];
};
struct bch_replicas_cpu {
struct rcu_head rcu;
unsigned nr;
unsigned entry_size;
struct bch_replicas_cpu_entry entries[];
};
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
@ -372,7 +346,7 @@ struct bch_dev {
struct bch_devs_mask self;
/* biosets used in cloned bios for replicas and moving_gc */
/* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set;
struct task_struct *alloc_thread;
@ -392,7 +366,7 @@ struct bch_dev {
unsigned nr_invalidated;
bool alloc_thread_started;
struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT];
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
size_t fifo_last_bucket;
@ -422,18 +396,20 @@ struct bch_dev {
bool allocator_invalidating_data;
alloc_heap alloc_heap;
bucket_heap copygc_heap;
/* Moving GC: */
struct task_struct *moving_gc_read;
struct bch_pd_controller moving_gc_pd;
/* Copying GC: */
struct task_struct *copygc_thread;
copygc_heap copygc_heap;
struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
struct journal_device journal;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic_t latency[2];
struct io_count __percpu *io_done;
};
@ -473,6 +449,7 @@ struct bch_tier {
struct bch_pd_controller pd;
struct bch_devs_mask devs;
struct write_point wp;
};
enum bch_fs_state {
@ -557,10 +534,7 @@ struct bch_fs {
* when allocating btree reserves fail halfway through) - instead, we
* can stick them here:
*/
struct btree_alloc {
struct open_bucket *ob;
BKEY_PADDED(k);
} btree_reserve_cache[BTREE_NODE_RESERVE * 2];
struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
unsigned btree_reserve_cache_nr;
struct mutex btree_reserve_cache_lock;
@ -573,15 +547,9 @@ struct bch_fs {
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
struct rw_semaphore alloc_gc_lock;
struct bch_pd_controller foreground_write_pd;
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
spinlock_t foreground_write_pd_lock;
struct bch_write_op *write_wait_head;
struct bch_write_op *write_wait_tail;
struct timer_list foreground_write_wakeup;
/*
* These contain all r/w devices - i.e. devices we can currently
@ -622,8 +590,8 @@ struct bch_fs {
struct io_clock io_clock[2];
/* SECTOR ALLOCATOR */
spinlock_t open_buckets_lock;
/* ALLOCATOR */
spinlock_t freelist_lock;
u8 open_buckets_freelist;
u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
@ -635,15 +603,6 @@ struct bch_fs {
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
struct mutex write_points_hash_lock;
/*
* This write point is used for migrating data off a device
* and can point to any other device.
* We can't use the normal write points because those will
* gang up n replicas, and for migration we want only one new
* replica.
*/
struct write_point migration_write_point;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
@ -688,6 +647,11 @@ struct bch_fs {
atomic64_t key_version;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
@ -728,19 +692,14 @@ struct bch_fs {
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
unsigned btree_gc_periodic:1;
unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;
unsigned tiering_enabled:1;
unsigned tiering_percent;
/*
* foreground writes will be throttled when the number of free
* buckets is below this percentage
*/
unsigned foreground_target_percent;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM

View File

@ -344,11 +344,13 @@ struct bch_csum {
enum bch_csum_type {
BCH_CSUM_NONE = 0,
BCH_CSUM_CRC32C = 1,
BCH_CSUM_CRC64 = 2,
BCH_CSUM_CRC32C_NONZERO = 1,
BCH_CSUM_CRC64_NONZERO = 2,
BCH_CSUM_CHACHA20_POLY1305_80 = 3,
BCH_CSUM_CHACHA20_POLY1305_128 = 4,
BCH_CSUM_NR = 5,
BCH_CSUM_CRC32C = 5,
BCH_CSUM_CRC64 = 6,
BCH_CSUM_NR = 7,
};
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
@ -550,7 +552,7 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
/* Maximum possible size of an entire extent value: */
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
(BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX)
(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@ -734,11 +736,13 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC);
/*
* Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
* BCH_MEMBER_DATA_ALLOWED
* Version 9: incompatible extent nonce change
*/
#define BCH_SB_VERSION_MIN 7
#define BCH_SB_VERSION_EXTENT_MAX 8
#define BCH_SB_VERSION_MAX 8
#define BCH_SB_VERSION_EXTENT_NONCE_V1 9
#define BCH_SB_VERSION_MAX 9
#define BCH_SB_SECTOR 8
#define BCH_SB_LABEL_SIZE 32
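
The checksum enum now distinguishes the old CRC variants (renamed BCH_CSUM_CRC32C_NONZERO / BCH_CSUM_CRC64_NONZERO) from new plain CRC32C/CRC64 types, and BCH_SB_VERSION_EXTENT_NONCE_V1 bumps the superblock version for the incompatible extent nonce change. Judging by the checksum hunks at the end of this diff, the only difference between the variants is the seed and the final xor. A standalone sketch of that convention follows; the bitwise CRC-32C helper and the sample buffer are illustrative, not bcachefs code.

/* Sketch of the _NONZERO vs plain CRC32C seed/finalization convention. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC-32C (Castagnoli), reflected polynomial 0x82F63B78 */
static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return crc;
}

int main(void)
{
        const char *buf = "bcachefs";
        size_t len = strlen(buf);

        /* BCH_CSUM_CRC32C_NONZERO: init U32_MAX, final xor with U32_MAX */
        uint32_t nonzero = crc32c_update(~0u, buf, len) ^ ~0u;

        /* BCH_CSUM_CRC32C: init 0, no final xor */
        uint32_t plain = crc32c_update(0, buf, len);

        printf("crc32c_nonzero=%#x crc32c=%#x\n", nonzero, plain);
        return 0;
}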

View File

@ -4,6 +4,14 @@
#include "bset.h"
#include "util.h"
#undef EBUG_ON
#ifdef DEBUG_BKEYS
#define EBUG_ON(cond) BUG_ON(cond)
#else
#define EBUG_ON(cond)
#endif
const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,

View File

@ -146,6 +146,17 @@
* first key in that range of bytes again.
*/
extern bool bch2_expensive_debug_checks;
static inline bool btree_keys_expensive_checks(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
return false;
#endif
}
struct btree_node_iter;
struct btree_node_iter_set;
@ -188,7 +199,7 @@ bkey_unpack_key_format_checked(const struct btree *b,
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(&dst, src);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
/*
@ -260,17 +271,6 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
extern bool bch2_expensive_debug_checks;
static inline bool btree_keys_expensive_checks(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
return false;
#endif
}
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;

View File

@ -24,6 +24,7 @@
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
@ -111,19 +112,35 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
/*
* For runtime mark and sweep:
*/
static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k, unsigned flags)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage *stats;
u8 ret = 0;
preempt_disable();
stats = this_cpu_ptr(c->usage_percpu);
switch (type) {
case BKEY_TYPE_BTREE:
bch2_gc_mark_key(c, k, c->opts.btree_node_size, true, flags);
return 0;
bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
0, flags|
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
break;
case BKEY_TYPE_EXTENTS:
bch2_gc_mark_key(c, k, k.k->size, false, flags);
return bch2_btree_key_recalc_oldest_gen(c, k);
bch2_mark_key(c, k, k.k->size, false, pos, stats,
0, flags|
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
ret = bch2_btree_key_recalc_oldest_gen(c, k);
break;
default:
BUG();
}
preempt_enable();
return ret;
}
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
@ -182,7 +199,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
max_t(u64, k.k->version.lo,
atomic64_read(&c->key_version)));
bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
fsck_err:
return ret;
}
@ -200,7 +217,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
btree_node_is_extents(b),
&unpacked) {
bch2_bkey_debugcheck(c, b, k);
stale = max(stale, bch2_btree_mark_key(c, type, k, 0));
stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
}
return stale;
@ -267,123 +284,79 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
mutex_unlock(&c->btree_root_lock);
return 0;
}
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct open_bucket *ob;
const struct open_bucket_ptr *ptr;
size_t i, j, iter;
unsigned ci;
down_write(&c->alloc_gc_lock);
for_each_member_device(ca, c, ci) {
spin_lock(&ca->freelist_lock);
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
for (ptr = ca->open_buckets_partial;
ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr;
ptr++)
bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
spin_unlock(&ca->freelist_lock);
}
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
open_bucket_for_each_ptr(ob, ptr) {
ca = c->devs[ptr->ptr.dev];
bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
}
spin_unlock(&ob->lock);
}
up_write(&c->alloc_gc_lock);
}
static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
enum bucket_data_type type)
static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
u64 start, u64 end,
enum bucket_data_type type,
unsigned flags)
{
u64 b = sector_to_bucket(ca, start);
do {
bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true);
bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type,
gc_phase(GC_PHASE_SB), flags);
b++;
} while (b < sector_to_bucket(ca, end));
}
static void bch2_dev_mark_superblocks(struct bch_dev *ca)
void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
unsigned flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
BUCKET_SB);
mark_metadata_sectors(ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
BUCKET_SB);
}
}
/*
* Mark non btree metadata - prios, journal
*/
void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
u64 b;
lockdep_assert_held(&c->sb_lock);
bch2_dev_mark_superblocks(ca);
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
mark_metadata_sectors(c, ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
bch2_mark_metadata_bucket(ca, ca->buckets + b,
BUCKET_JOURNAL, true);
bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
BUCKET_JOURNAL,
gc_phase(GC_PHASE_SB), flags);
}
spin_unlock(&c->journal.lock);
}
static void bch2_mark_metadata(struct bch_fs *c)
static void bch2_mark_superblocks(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA));
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
bch2_mark_dev_metadata(c, ca);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
mutex_unlock(&c->sb_lock);
}
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@ -393,10 +366,11 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
__bch2_mark_key(c, bkey_i_to_s_c(&d->key),
c->opts.btree_node_size, true,
bch2_mark_key(c, bkey_i_to_s_c(&d->key),
c->opts.btree_node_size, true, pos,
&stats, 0,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@ -405,6 +379,51 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
mutex_unlock(&c->btree_interior_update_lock);
}
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct open_bucket *ob;
size_t i, j, iter;
unsigned ci;
spin_lock(&c->freelist_lock);
gc_pos_set(c, gc_pos_alloc(c, NULL));
for_each_member_device(ca, c, ci) {
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
}
spin_unlock(&c->freelist_lock);
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
ca = c->devs[ob->ptr.dev];
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
}
spin_unlock(&ob->lock);
}
}
void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
@ -495,9 +514,6 @@ void bch2_gc(struct bch_fs *c)
bch2_gc_start(c);
/* Walk allocator's references: */
bch2_mark_allocator_buckets(c);
/* Walk btree: */
while (c->gc_pos.phase < (int) BTREE_ID_NR) {
int ret = c->btree_roots[c->gc_pos.phase].b
@ -513,8 +529,9 @@ void bch2_gc(struct bch_fs *c)
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
}
bch2_mark_metadata(c);
bch2_mark_superblocks(c);
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
for_each_member_device(ca, c, i)
atomic_long_set(&ca->saturated_count, 0);
@ -570,7 +587,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
struct bkey_format new_format;
memset(new_nodes, 0, sizeof(new_nodes));
bch2_keylist_init(&keylist, NULL, 0);
bch2_keylist_init(&keylist, NULL);
/* Count keys that are not deleted */
for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
@ -1023,8 +1040,6 @@ again:
if (ret)
return ret;
bch2_mark_metadata(c);
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
@ -1043,6 +1058,8 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
bch2_mark_superblocks(c);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

View File

@ -13,7 +13,7 @@ int bch2_initial_gc(struct bch_fs *, struct list_head *);
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *);
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@ -88,6 +88,14 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
};
}
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
{
return (struct gc_pos) {
.phase = GC_PHASE_ALLOC,
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
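
gc_pos_alloc() gives each allocator reference (keyed by open bucket index) a slot in the total order that mark and sweep walks, which is what lets bch2_mark_alloc_bucket() decide whether GC will still visit it. The comparison itself is not shown in this hunk; the sketch below is one plausible lexicographic ordering (phase, then position, then level) over simplified stand-in types - the helper name, the enum values and the reduced struct bpos are all assumptions, not the kernel definitions.

/* Illustrative only - minimal stand-ins for the kernel types. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

struct bpos { u64 inode, offset; };     /* simplified */

/* values illustrative; the real enum starts at BTREE_ID_NR + 1 */
enum gc_phase_sketch { GC_PHASE_SB = 1, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, GC_PHASE_DONE };

struct gc_pos {
        enum gc_phase_sketch phase;
        struct bpos pos;
        unsigned level;
};

/* Hypothetical: order by phase, then key position, then btree level. */
static int gc_pos_cmp_sketch(struct gc_pos l, struct gc_pos r)
{
        if (l.phase != r.phase)
                return l.phase < r.phase ? -1 : 1;
        if (l.pos.inode != r.pos.inode)
                return l.pos.inode < r.pos.inode ? -1 : 1;
        if (l.pos.offset != r.pos.offset)
                return l.pos.offset < r.pos.offset ? -1 : 1;
        if (l.level != r.level)
                return l.level < r.level ? -1 : 1;
        return 0;
}

int main(void)
{
        struct gc_pos gc_cur = { .phase = GC_PHASE_SB };
        struct gc_pos alloc  = { .phase = GC_PHASE_ALLOC, .pos = { 0, 0 } };

        /* GC is still marking superblocks, so it will reach the allocator buckets later: */
        printf("gc will visit allocator buckets: %d\n",
               gc_pos_cmp_sketch(gc_cur, alloc) < 0);
        return 0;
}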

View File

@ -146,9 +146,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
BUG_ON(iter->data->k > iter->data->end);
if (iter->data->k == iter->data->end)
memmove(&iter->data[0],
&iter->data[1],
sizeof(iter->data[0]) * --iter->used);
array_remove_item(iter->data, iter->used, 0);
else
sort_iter_sift(iter, cmp);
}
@ -1307,6 +1305,8 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
INIT_WORK(&rb->work, btree_node_read_work);
schedule_work(&rb->work);
}
@ -1471,6 +1471,8 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);

View File

@ -10,6 +10,7 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
unsigned submit_time_us;
u64 start_time;
struct extent_pick_ptr pick;
struct work_struct work;

View File

@ -91,7 +91,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
EBUG_ON(iter->flags & BTREE_ITER_UPTODATE);
EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->nodes[level]->lock, lock_type);

View File

@ -55,6 +55,16 @@ struct btree_write {
struct closure_waitlist wait;
};
struct btree_ob_ref {
u8 nr;
u8 refs[BCH_REPLICAS_MAX];
};
struct btree_alloc {
struct btree_ob_ref ob;
BKEY_PADDED(k);
};
struct btree {
/* Hottest entries first */
struct rhash_head hash;
@ -118,7 +128,7 @@ struct btree {
*/
struct btree_update *will_make_reachable;
struct open_bucket *ob;
struct btree_ob_ref ob;
/* lru list */
struct list_head list;
@ -317,18 +327,6 @@ struct btree_root {
struct btree_iter;
struct btree_node_iter;
enum extent_insert_hook_ret {
BTREE_HOOK_DO_INSERT,
BTREE_HOOK_NO_INSERT,
BTREE_HOOK_RESTART_TRANS,
};
struct extent_insert_hook {
enum extent_insert_hook_ret
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
struct bkey_s_c, const struct bkey_i *);
};
enum btree_insert_ret {
BTREE_INSERT_OK,
/* extent spanned multiple leaf nodes: have to traverse to next node: */
@ -342,6 +340,12 @@ enum btree_insert_ret {
BTREE_INSERT_NEED_GC_LOCK,
};
struct extent_insert_hook {
enum btree_insert_ret
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
struct bkey_s_c, const struct bkey_i *);
};
enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
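
struct btree_ob_ref replaces the btree node's single struct open_bucket pointer with a count plus a small array of indices into c->open_buckets, released via bch2_open_bucket_put_refs() as the btree interior update hunks below show. A toy userspace version of that index-plus-refcount pattern follows; the plain int pin stands in for the kernel's atomic_t, and BCH_REPLICAS_MAX is assumed to be 4 here.

/* Sketch of the index-plus-refcount scheme, not the kernel implementation. */
#include <stdio.h>

#define OPEN_BUCKETS_COUNT      256
#define BCH_REPLICAS_MAX        4       /* assumed for the sketch */

struct open_bucket { int pin; };        /* atomic_t in the real code */

struct btree_ob_ref {
        unsigned char nr;
        unsigned char refs[BCH_REPLICAS_MAX];   /* indices into c->open_buckets */
};

static struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];

/* mirrors bch2_open_bucket_put_refs(): drop each pin, then clear the count */
static void put_refs(struct btree_ob_ref *ob)
{
        unsigned i;

        for (i = 0; i < ob->nr; i++)
                open_buckets[ob->refs[i]].pin--;
        ob->nr = 0;
}

int main(void)
{
        struct btree_ob_ref ob = { .nr = 2, .refs = { 3, 7 } };

        open_buckets[3].pin = open_buckets[7].pin = 1;
        put_refs(&ob);
        printf("pins now: %d %d, nr=%d\n",
               open_buckets[3].pin, open_buckets[7].pin, ob.nr);
        return 0;
}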

View File

@ -211,7 +211,7 @@ found:
-c->opts.btree_node_size, true, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
&tmp, 0);
&tmp, 0, 0);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
BUG_ON(b == btree_node_root(c, b));
BUG_ON(b->ob);
BUG_ON(b->ob.nr);
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable);
@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
{
struct open_bucket *ob = b->ob;
struct btree_ob_ref ob = b->ob;
btree_update_drop_new_node(c, b);
b->ob = NULL;
b->ob.nr = 0;
clear_btree_node_dirty(b);
__btree_node_free(c, b, NULL);
bch2_open_bucket_put(c, ob);
bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
}
void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
@ -287,7 +287,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-c->opts.btree_node_size, true,
gc_phase(GC_PHASE_PENDING_DELETE),
&stats, 0);
&stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
{
bch2_open_bucket_put(c, b->ob);
b->ob = NULL;
bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct closure *cl,
unsigned flags)
{
BKEY_PADDED(k) tmp;
struct open_bucket *ob;
struct write_point *wp;
struct btree *b;
BKEY_PADDED(k) tmp;
struct bkey_i_extent *e;
struct btree_ob_ref ob;
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
@ -335,31 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
/* alloc_sectors is weird, I suppose */
bkey_extent_init(&tmp.k);
tmp.k.k.size = c->opts.btree_node_size,
ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0,
bkey_i_to_extent(&tmp.k),
wp = bch2_alloc_sectors_start(c, NULL,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
c->opts.metadata_replicas_required,
alloc_reserve, 0, cl);
if (IS_ERR(ob))
return ERR_CAST(ob);
if (IS_ERR(wp))
return ERR_CAST(wp);
if (tmp.k.k.size < c->opts.btree_node_size) {
bch2_open_bucket_put(c, ob);
if (wp->sectors_free < c->opts.btree_node_size) {
struct open_bucket *ob;
unsigned i;
writepoint_for_each_ptr(wp, ob, i)
if (ob->sectors_free < c->opts.btree_node_size)
ob->sectors_free = 0;
bch2_alloc_sectors_done(c, wp);
goto retry;
}
e = bkey_extent_init(&tmp.k);
bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
ob.nr = 0;
bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(c);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
BUG_ON(b->ob);
BUG_ON(b->ob.nr);
bkey_copy(&b->key, &tmp.k);
b->key.k.size = 0;
b->ob = ob;
return b;
@ -466,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
a->ob = b->ob;
b->ob = NULL;
b->ob.nr = 0;
bkey_copy(&a->k, &b->key);
} else {
bch2_open_bucket_put(c, b->ob);
b->ob = NULL;
bch2_btree_open_bucket_put(c, b);
}
__btree_node_free(c, b, NULL);
@ -857,10 +868,7 @@ static void __btree_interior_update_drop_new_node(struct btree *b)
BUG();
found:
as->nr_new_nodes--;
memmove(&as->new_nodes[i],
&as->new_nodes[i + 1],
sizeof(struct btree *) * (as->nr_new_nodes - i));
array_remove_item(as->new_nodes, as->nr_new_nodes, i);
b->will_make_reachable = NULL;
}
@ -1000,8 +1008,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
as->reserve = reserve;
INIT_LIST_HEAD(&as->write_blocked_list);
bch2_keylist_init(&as->parent_keys, as->inline_keys,
ARRAY_SIZE(as->inline_keys));
bch2_keylist_init(&as->parent_keys, as->inline_keys);
mutex_lock(&c->btree_interior_update_lock);
list_add(&as->list, &c->btree_interior_update_list);
@ -1037,7 +1044,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
bch2_mark_key(c, bkey_i_to_s_c(&b->key),
c->opts.btree_node_size, true,
gc_pos_btree_root(b->btree_id),
&stats, 0);
&stats, 0, 0);
if (old)
bch2_btree_node_free_index(as, NULL,
@ -1121,7 +1128,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
if (bkey_extent_is_data(&insert->k))
bch2_mark_key(c, bkey_i_to_s_c(insert),
c->opts.btree_node_size, true,
gc_pos_btree_node(b), &stats, 0);
gc_pos_btree_node(b), &stats, 0, 0);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
!btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
@ -1479,6 +1486,13 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
struct closure cl;
int ret = 0;
/*
* We already have a disk reservation and open buckets pinned; this
* allocation must not block:
*/
if (iter->btree_id == BTREE_ID_EXTENTS)
btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
@ -1519,6 +1533,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_set_locks_want(iter, 1);
out:
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
@ -1904,7 +1919,7 @@ retry:
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
gc_pos_btree_root(b->btree_id),
&stats, 0);
&stats, 0, 0);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
&stats);
@ -1928,6 +1943,7 @@ out:
}
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
err:
if (as)
@ -1965,13 +1981,13 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
closure_sync(&cl);
if (!IS_ERR(as))
break;
if (PTR_ERR(as) == -ENOSPC)
return PTR_ERR(as);
closure_sync(&cl);
}
b = __btree_root_alloc(as, 0);

View File

@ -355,6 +355,11 @@ retry:
multi_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
goto unlock;
}
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */

View File

@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
stats.online_reserved);
}
static void bch2_dev_stats_verify(struct bch_dev *ca)
{
struct bch_dev_usage stats =
__bch2_dev_usage_read(ca);
u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
BUG_ON(stats.buckets[S_META] > n);
BUG_ON(stats.buckets[S_DIRTY] > n);
BUG_ON(stats.buckets_cached > n);
BUG_ON(stats.buckets_alloc > n);
BUG_ON(stats.buckets_unavailable > n);
}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
{
if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
u64 used = __bch2_fs_sectors_used(c);
u64 cached = 0;
u64 avail = atomic64_read(&c->sectors_available);
int cpu;
for_each_possible_cpu(cpu)
cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
if (used + avail + cached > c->capacity)
panic("used %llu avail %llu cached %llu capacity %llu\n",
used, avail, cached, c->capacity);
}
}
#else
static void bch2_fs_stats_verify(struct bch_fs *c) {}
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
#endif
@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
return bch2_usage_read_raw(ca->usage_percpu);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_cached(ca->fs,
ca->usage_cached,
ca->usage_percpu);
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
}
struct bch_fs_usage
@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m)
!m.dirty_sectors && !!m.cached_sectors;
}
static inline int is_unavailable_bucket(struct bucket_mark m)
{
return !is_available_bucket(m);
}
static inline enum s_alloc bucket_type(struct bucket_mark m)
{
return is_meta_bucket(m) ? S_META : S_DIRTY;
@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c,
memset(stats, 0, sizeof(*stats));
}
static void bch2_dev_usage_update(struct bch_dev *ca,
struct bucket_mark old, struct bucket_mark new)
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, struct bucket_mark old,
struct bucket_mark new)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage *dev_usage;
BUG_ON((g - ca->buckets) < ca->mi.first_bucket ||
(g - ca->buckets) >= ca->mi.nbuckets);
bch2_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type, c,
"different types of metadata in same bucket: %u, %u",
@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
dev_usage->buckets[S_META] +=
is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_DIRTY] +=
is_dirty_bucket(new) - is_dirty_bucket(old);
dev_usage->buckets_cached +=
is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
bch2_dev_stats_verify(ca);
}
#define bucket_data_cmpxchg(ca, g, new, expr) \
#define bucket_data_cmpxchg(c, ca, g, new, expr) \
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bch2_dev_usage_update(ca, _old, new); \
bch2_dev_usage_update(c, ca, g, _old, new); \
_old; \
})
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
struct bucket_mark *old)
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, struct bucket_mark *old)
{
struct bucket_mark new;
*old = bucket_data_cmpxchg(ca, g, new, ({
lg_local_lock(&c->usage_lock);
*old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new))
return false;
@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
new.dirty_sectors = 0;
new.gen++;
}));
lg_local_unlock(&c->usage_lock);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
@ -319,11 +364,13 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
return true;
}
bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g)
{
struct bucket_mark new, old;
old = bucket_data_cmpxchg(ca, g, new, ({
lg_local_lock(&c->usage_lock);
old = bucket_data_cmpxchg(c, ca, g, new, ({
if (new.touched_this_mount ||
!is_available_bucket(new))
return false;
@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
new.owned_by_allocator = 1;
new.touched_this_mount = 1;
}));
lg_local_unlock(&c->usage_lock);
return true;
}
void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
struct bucket_mark old, new;
old = bucket_data_cmpxchg(ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
BUG_ON(bucket_became_unavailable(ca->fs, old, new));
lg_local_lock(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
}
void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
bool owned_by_allocator)
{
struct bucket_mark old, new;
old = bucket_data_cmpxchg(ca, g, new, ({
old = bucket_data_cmpxchg(c, ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator;
}));
lg_local_unlock(&c->usage_lock);
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
ca->fs->gc_pos.phase == GC_PHASE_DONE);
c->gc_pos.phase == GC_PHASE_DONE);
}
#define saturated_add(ca, dst, src, max) \
@ -377,41 +419,49 @@ do { \
} \
} while (0)
void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
enum bucket_data_type type,
bool may_make_unavailable)
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, enum bucket_data_type type,
struct gc_pos pos, unsigned flags)
{
struct bucket_mark old, new;
BUG_ON(!type);
old = bucket_data_cmpxchg(ca, g, new, ({
lg_local_lock(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
}
old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
GC_MAX_SECTORS_USED);
new.data_type = type;
new.touched_this_mount = 1;
}));
lg_local_unlock(&c->usage_lock);
if (old.data_type != type &&
(old.data_type ||
old.cached_sectors ||
old.dirty_sectors))
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!may_make_unavailable &&
bucket_became_unavailable(ca->fs, old, new));
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
}
/* Reverting this until the copygc + compression issue is fixed: */
static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
{
if (!sectors)
return 0;
return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
crc_uncompressed_size(NULL, crc)));
return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
crc.uncompressed_size));
}
/*
@ -421,8 +471,8 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
*/
static void bch2_mark_pointer(struct bch_fs *c,
struct bkey_s_c_extent e,
const union bch_extent_crc *crc,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
s64 sectors, enum s_alloc type,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
? BUCKET_BTREE : BUCKET_DATA;
u64 v;
if (crc_compression_type(crc)) {
if (crc.compression_type) {
unsigned old_sectors, new_sectors;
if (sectors > 0) {
@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.counter,
new.counter)) != old.counter);
bch2_dev_usage_update(ca, old, new);
bch2_dev_usage_update(c, ca, g, old, new);
if (old.data_type != data_type &&
(old.data_type ||
old.cached_sectors ||
old.dirty_sectors))
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
}
}
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
s64 sectors, bool metadata,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, crc, ptr, sectors, type,
stats, journal_seq, flags);
replicas += !ptr->cached;
}
BUG_ON(replicas >= BCH_REPLICAS_MAX);
if (replicas)
stats->s[replicas - 1].data[type] += sectors;
}
void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
stats, journal_seq, flags);
break;
case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
if (r.v->nr_replicas)
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
break;
}
}
}
void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, unsigned flags)
{
struct bch_fs_usage stats = { 0 };
__bch2_mark_key(c, k, sectors, metadata, &stats, 0,
flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
preempt_disable();
bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
preempt_enable();
}
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, struct gc_pos gc_pos,
struct bch_fs_usage *stats, u64 journal_seq)
s64 sectors, bool metadata,
struct gc_pos pos,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
unsigned flags = gc_will_visit(c, gc_pos)
? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
/*
* synchronization w.r.t. GC:
*
@ -614,50 +605,87 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
* compare the position of the reference we're marking - @gc_pos - with
* compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
* current position will be less than @gc_pos; if GC's current position
* is greater than @gc_pos GC has either already walked this position,
* or isn't running.
* current position will be less than @pos; if GC's current position is
* greater than @pos GC has either already walked this position, or
* isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
* - GC's position overtaking @gc_pos: we guard against this with
* - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
lg_local_lock(&c->usage_lock);
__bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
bch2_fs_stats_verify(c);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, ptr, crc, sectors, type,
stats, journal_seq, flags);
replicas += !ptr->cached;
}
BUG_ON(replicas >= BCH_REPLICAS_MAX);
if (replicas)
stats->s[replicas - 1].data[type] += sectors;
break;
}
case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
if (r.v->nr_replicas)
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
break;
}
}
lg_local_unlock(&c->usage_lock);
}
/* Disk reservations: */
static u64 __recalc_sectors_available(struct bch_fs *c)
{
return c->capacity - bch2_fs_sectors_used(c);
u64 avail;
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
avail = c->capacity - bch2_fs_sectors_used(c);
avail <<= RESERVE_FACTOR;
avail /= (1 << RESERVE_FACTOR) + 1;
return avail;
}
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
int cpu;
lg_global_lock(&c->usage_lock);
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
atomic64_set(&c->sectors_available,
__recalc_sectors_available(c));
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
lg_global_unlock(&c->usage_lock);
}
void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
if (res->sectors) {
lg_local_lock(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
res->sectors);
@ -667,16 +695,14 @@ void bch2_disk_reservation_put(struct bch_fs *c,
res->sectors = 0;
}
}
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c,
struct disk_reservation *res,
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
unsigned sectors, int flags)
{
struct bch_fs_usage *stats;
u64 old, new, v;
u64 old, v, get;
s64 sectors_available;
int ret;
@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c,
lg_local_lock(&c->usage_lock);
stats = this_cpu_ptr(c->usage_percpu);
if (sectors >= stats->available_cache)
if (sectors <= stats->available_cache)
goto out;
v = atomic64_read(&c->sectors_available);
do {
old = v;
if (old < sectors) {
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
lg_local_unlock(&c->usage_lock);
goto recalculate;
}
new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
} while ((v = atomic64_cmpxchg(&c->sectors_available,
old, new)) != old);
old, old - get)) != old);
stats->available_cache += get;
stats->available_cache += old - new;
out:
stats->available_cache -= sectors;
stats->online_reserved += sectors;
res->sectors += sectors;
bch2_disk_reservations_verify(c, flags);
bch2_fs_stats_verify(c);
lg_local_unlock(&c->usage_lock);
return 0;
@ -738,6 +766,8 @@ recalculate:
stats->online_reserved += sectors;
res->sectors += sectors;
ret = 0;
bch2_disk_reservations_verify(c, flags);
} else {
atomic64_set(&c->sectors_available, sectors_available);
ret = -ENOSPC;
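
The reworked bch2_disk_reservation_add() keeps a per-cpu available_cache so small reservations rarely touch the shared sectors_available atomic: on a cache miss it grabs the request plus SECTORS_CACHE sectors in one cmpxchg and banks the surplus locally. A single-threaded sketch of that fast path follows, with plain globals standing in for the per-cpu and atomic counters; with the numbers below, the first 100-sector reservation leaves 1024 sectors cached and the second is served entirely from the cache.

/* Userspace sketch of the per-cpu reservation cache fast path. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

#define SECTORS_CACHE   1024

static u64 sectors_available = 10000;   /* stands in for c->sectors_available */
static u64 available_cache;             /* stands in for this cpu's available_cache */

static int reserve(unsigned sectors)
{
        u64 get;

        if (sectors <= available_cache)
                goto out;

        /* refill: take the request plus SECTORS_CACHE from the global pool */
        get = sectors + SECTORS_CACHE;
        if (get > sectors_available)
                get = sectors_available;        /* take whatever is left */
        if (get < sectors)
                return -1;                      /* slow path: recalculate from usage */

        sectors_available -= get;
        available_cache += get;
out:
        available_cache -= sectors;
        return 0;
}

int main(void)
{
        reserve(100);
        printf("global %llu, per-cpu cache %llu\n",
               (unsigned long long) sectors_available,
               (unsigned long long) available_cache);

        reserve(100);
        printf("global %llu, per-cpu cache %llu\n",
               (unsigned long long) sectors_available,
               (unsigned long long) available_cache);
        return 0;
}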

View File

@ -95,24 +95,26 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Per device stats: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
return max_t(s64, 0,
ca->mi.nbuckets - ca->mi.first_bucket -
stats.buckets[S_META] -
stats.buckets[S_DIRTY] -
stats.buckets_alloc);
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow\n"))
return 0;
return total - stats.buckets_unavailable;
}
/*
* Number of reclaimable buckets - only for use by the allocator thread:
*/
static inline u64 dev_buckets_available(struct bch_dev *ca)
static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
}
static inline u64 __dev_buckets_free(struct bch_dev *ca,
@ -123,9 +125,9 @@ static inline u64 __dev_buckets_free(struct bch_dev *ca,
fifo_used(&ca->free_inc);
}
static inline u64 dev_buckets_free(struct bch_dev *ca)
static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
{
return __dev_buckets_free(ca, bch2_dev_usage_read(ca));
return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
}
/* Cache set stats: */
@ -155,11 +157,18 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
return sum;
}
#define RESERVE_FACTOR 6
static u64 reserve_factor(u64 r)
{
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
static inline u64 __bch2_fs_sectors_used(struct bch_fs *c)
{
struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c));
return sum.data + sum.reserved + (sum.reserved >> 7);
return sum.data + reserve_factor(sum.reserved);
}
static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
@ -184,30 +193,35 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
void bch2_bucket_seq_cleanup(struct bch_fs *);
bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
struct bucket_mark *);
bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
enum bucket_data_type, bool);
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
struct bucket *, struct bucket_mark *);
bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *,
struct bucket *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
struct bucket *, bool,
struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct bucket *, enum bucket_data_type,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c,
s64, bool, unsigned);
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct gc_pos, struct bch_fs_usage *, u64);
void bch2_recalc_sectors_available(struct bch_fs *);
void bch2_disk_reservation_put(struct bch_fs *,
struct disk_reservation *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
if (res->sectors)
__bch2_disk_reservation_put(c, res);
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
#define BCH_DISK_RESERVATION_METADATA (1 << 1)

@ -59,6 +59,7 @@ struct bch_dev_usage {
u64 buckets[S_ALLOC_NR];
u64 buckets_cached;
u64 buckets_alloc;
u64 buckets_unavailable;
/* _compressed_ sectors: */
u64 sectors[S_ALLOC_NR];
@ -79,13 +80,6 @@ struct bch_fs_usage {
u64 available_cache;
};
struct bucket_heap_entry {
size_t bucket;
struct bucket_mark mark;
};
typedef HEAP(struct bucket_heap_entry) bucket_heap;
/*
* A reservation for space on disk:
*/
@ -95,4 +89,11 @@ struct disk_reservation {
unsigned nr_replicas;
};
struct copygc_heap_entry {
u64 offset;
struct bucket_mark mark;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;
#endif /* _BUCKETS_TYPES_H */

@ -141,10 +141,14 @@ static u64 bch2_checksum_init(unsigned type)
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC32C_NONZERO:
return U32_MAX;
case BCH_CSUM_CRC64:
case BCH_CSUM_CRC64_NONZERO:
return U64_MAX;
case BCH_CSUM_CRC32C:
return 0;
case BCH_CSUM_CRC64:
return 0;
default:
BUG();
}
@ -155,10 +159,14 @@ static u64 bch2_checksum_final(unsigned type, u64 crc)
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC32C_NONZERO:
return crc ^ U32_MAX;
case BCH_CSUM_CRC64:
case BCH_CSUM_CRC64_NONZERO:
return crc ^ U64_MAX;
case BCH_CSUM_CRC32C:
return crc;
case BCH_CSUM_CRC64:
return crc;
default:
BUG();
}
@ -169,8 +177,10 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC32C:
return crc32c(crc, data, len);
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC64:
return bch2_crc64_update(crc, data, len);
default:
@ -243,6 +253,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
{
switch (type) {
case BCH_CSUM_NONE:
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
@ -250,7 +262,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
crc = bch2_checksum_update(type, crc, data, len);
crc = bch2_checksum_final(type, crc);
return (struct bch_csum) { .lo = crc };
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
@ -281,28 +293,36 @@ void bch2_encrypt(struct bch_fs *c, unsigned type,
do_encrypt(c->chacha20, nonce, data, len);
}
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio,
struct bvec_iter *iter)
{
struct bio_vec bv;
struct bvec_iter iter;
switch (type) {
case BCH_CSUM_NONE:
return (struct bch_csum) { 0 };
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
bio_for_each_contig_segment(bv, bio, iter) {
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crc = bch2_checksum_update(type,
crc, p, bv.bv_len);
kunmap_atomic(p);
}
#else
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
crc = bch2_checksum_final(type, crc);
return (struct bch_csum) { .lo = crc };
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
@ -313,13 +333,19 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
gen_poly_key(c, desc, nonce);
bio_for_each_contig_segment(bv, bio, iter) {
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crypto_shash_update(desc, p, bv.bv_len);
kunmap_atomic(p);
}
#else
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
crypto_shash_final(desc, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
@ -330,6 +356,14 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
}
}
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
struct bvec_iter iter = bio->bi_iter;
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
@ -343,12 +377,12 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_contig_segment(bv, bio, iter) {
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
nonce = nonce_add(nonce, bytes);
bytes = 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
@ -357,13 +391,115 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
bytes += bv.bv_len;
}
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
static inline bool bch2_checksum_mergeable(unsigned type)
{
switch (type) {
case BCH_CSUM_NONE:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64:
return true;
default:
return false;
}
}
static struct bch_csum bch2_checksum_merge(unsigned type,
struct bch_csum a,
struct bch_csum b, size_t b_len)
{
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
unsigned b = min(b_len, PAGE_SIZE);
a.lo = bch2_checksum_update(type, a.lo,
page_address(ZERO_PAGE(0)), b);
b_len -= b;
}
a.lo ^= b.lo;
a.hi ^= b.hi;
return a;
}
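
Why this merge is valid (editorial sketch, not from the patch): a CRC with zero initial state and no final inversion — which is what the plain CRC32C/CRC64 variants above now provide, as opposed to the _NONZERO ones — is linear over GF(2), so crc(A || B) equals crc(A) extended over len(B) zero bytes, XORed with crc(B). The bit-at-a-time CRC32C below is only a stand-in used to demonstrate the identity:

#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* plain CRC32C: zero init, no final xor (stand-in for bch2_checksum_update) */
static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char a[] = "hello ", b[] = "world";
	uint8_t zeros[sizeof(b) - 1] = { 0 };
	char ab[sizeof(a) + sizeof(b) - 2];

	memcpy(ab, a, sizeof(a) - 1);
	memcpy(ab + sizeof(a) - 1, b, sizeof(b) - 1);

	uint32_t crc_a  = crc32c_update(0, a, sizeof(a) - 1);
	uint32_t crc_b  = crc32c_update(0, b, sizeof(b) - 1);
	uint32_t crc_ab = crc32c_update(0, ab, sizeof(ab));

	/* extend a's crc over len(b) zero bytes, then xor in b's crc: */
	uint32_t merged = crc32c_update(crc_a, zeros, sizeof(zeros)) ^ crc_b;

	assert(merged == crc_ab);
	return 0;
}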
int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
struct bversion version,
struct bch_extent_crc_unpacked crc_old,
struct bch_extent_crc_unpacked *crc_a,
struct bch_extent_crc_unpacked *crc_b,
unsigned len_a, unsigned len_b,
unsigned new_csum_type)
{
struct bvec_iter iter = bio->bi_iter;
struct nonce nonce = extent_nonce(version, crc_old);
struct bch_csum merged = { 0 };
struct crc_split {
struct bch_extent_crc_unpacked *crc;
unsigned len;
unsigned csum_type;
struct bch_csum csum;
} splits[3] = {
{ crc_a, len_a, new_csum_type },
{ crc_b, len_b, new_csum_type },
{ NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
}, *i;
bool mergeable = crc_old.csum_type == new_csum_type &&
bch2_checksum_mergeable(new_csum_type);
unsigned crc_nonce = crc_old.nonce;
BUG_ON(len_a + len_b > bio_sectors(bio));
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
BUG_ON(crc_old.compression_type);
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
bch2_csum_type_is_encryption(new_csum_type));
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
iter.bi_size = i->len << 9;
if (mergeable || i->crc)
i->csum = __bch2_checksum_bio(c, i->csum_type,
nonce, bio, &iter);
else
bio_advance_iter(bio, &iter, i->len << 9);
nonce = nonce_add(nonce, i->len << 9);
}
if (mergeable)
for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
merged = bch2_checksum_merge(new_csum_type, merged,
i->csum, i->len << 9);
else
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum))
return -EIO;
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
if (i->crc)
*i->crc = (struct bch_extent_crc_unpacked) {
.csum_type = i->csum_type,
.compressed_size = i->len,
.uncompressed_size = i->len,
.offset = 0,
.live_size = i->len,
.nonce = crc_nonce,
.csum = i->csum,
};
if (bch2_csum_type_is_encryption(new_csum_type))
crc_nonce += i->len;
}
return 0;
}
#ifdef __KERNEL__
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{

@ -2,6 +2,7 @@
#define _BCACHEFS_CHECKSUM_H
#include "bcachefs.h"
#include "extents_types.h"
#include "super-io.h"
#include <crypto/chacha20.h>
@ -37,6 +38,13 @@ void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked,
struct bch_extent_crc_unpacked *,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
void bch2_encrypt_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
@ -49,15 +57,16 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type)
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
bool data)
{
switch (type) {
case BCH_CSUM_OPT_NONE:
return BCH_CSUM_NONE;
case BCH_CSUM_OPT_CRC32C:
return BCH_CSUM_CRC32C;
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
case BCH_CSUM_OPT_CRC64:
return BCH_CSUM_CRC64;
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
default:
BUG();
}
@ -70,7 +79,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
return bch2_csum_opt_to_type(c->opts.data_checksum);
return bch2_csum_opt_to_type(c->opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@ -78,7 +87,7 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
if (c->sb.encryption_type)
return BCH_CSUM_CHACHA20_POLY1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum);
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
static inline enum bch_compression_type
@ -134,6 +143,21 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
struct nonce nonce = (struct nonce) {{
[0] = cpu_to_le32(size << 22),
[1] = cpu_to_le32(version.lo),
[2] = cpu_to_le32(version.lo >> 32),
[3] = cpu_to_le32(version.hi|
(crc.compression_type << 24))^BCH_NONCE_EXTENT,
}};
return nonce_add(nonce, crc.nonce << 9);
}
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
{
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;

@ -1,4 +1,5 @@
#include "bcachefs.h"
#include "checksum.h"
#include "compress.h"
#include "extents.h"
#include "io.h"
@ -145,11 +146,11 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace)
}
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *dst_data, struct bch_extent_crc128 crc)
void *dst_data, struct bch_extent_crc_unpacked crc)
{
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
size_t dst_len = crc.uncompressed_size << 9;
int ret;
src_data = bio_map_or_bounce(c, src, READ);
@ -212,65 +213,58 @@ err:
}
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
unsigned live_data_sectors,
struct bch_extent_crc128 crc)
struct bch_extent_crc_unpacked *crc)
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
struct bbuf data = { NULL };
size_t dst_len = crc->uncompressed_size << 9;
BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
/* bio must own its pages: */
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
if (crc->uncompressed_size > c->sb.encoded_extent_max ||
crc->compressed_size > c->sb.encoded_extent_max) {
bch_err(c, "error rewriting existing data: extent too big");
return -EIO;
dst_data = __bounce_alloc(c, dst_len, WRITE);
ret = __bio_uncompress(c, bio, dst_data.b, crc);
if (ret)
goto err;
while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
bv->bv_page = alloc_page(GFP_NOIO);
if (!bv->bv_page)
goto use_mempool;
bv->bv_len = PAGE_SIZE;
bv->bv_offset = 0;
bio->bi_vcnt++;
}
bio->bi_iter.bi_size = live_data_sectors << 9;
copy_data:
memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data);
return ret;
use_mempool:
/*
* We already allocated from mempool, we can't allocate from it again
* without freeing the pages we already allocated or else we could
* deadlock:
*/
data = __bounce_alloc(c, dst_len, WRITE);
bch2_bio_free_pages_pool(c, bio);
bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
goto copy_data;
if (__bio_uncompress(c, bio, data.b, *crc)) {
bch_err(c, "error rewriting existing data: decompression error");
bio_unmap_or_unbounce(c, data);
return -EIO;
}
/*
* might have to free existing pages and retry allocation from mempool -
* do this _after_ decompressing:
*/
bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
crc->csum_type = 0;
crc->compression_type = 0;
crc->compressed_size = crc->live_size;
crc->uncompressed_size = crc->live_size;
crc->offset = 0;
crc->csum = (struct bch_csum) { 0, 0 };
bio_unmap_or_unbounce(c, data);
return 0;
}
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
struct bio *dst, struct bvec_iter dst_iter,
struct bch_extent_crc128 crc)
struct bch_extent_crc_unpacked crc)
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
size_t dst_len = crc.uncompressed_size << 9;
int ret = -ENOMEM;
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
if (crc.uncompressed_size > c->sb.encoded_extent_max ||
crc.compressed_size > c->sb.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
@ -288,21 +282,25 @@ err:
return ret;
}
static int __bio_compress(struct bch_fs *c,
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned *compression_type)
unsigned compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
unsigned pad;
int ret = 0;
/* If it's only one block, don't bother trying to compress: */
if (bio_sectors(src) <= c->opts.block_size)
goto err;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
switch (*compression_type) {
switch (compression_type) {
case BCH_COMPRESSION_LZ4_OLD:
*compression_type = BCH_COMPRESSION_LZ4;
compression_type = BCH_COMPRESSION_LZ4;
case BCH_COMPRESSION_LZ4: {
void *workspace;
@ -403,19 +401,24 @@ zlib_err:
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
out:
bio_unmap_or_unbounce(c, src_data);
bio_unmap_or_unbounce(c, dst_data);
return ret;
return compression_type;
err:
ret = -1;
compression_type = 0;
goto out;
}
void bch2_bio_compress(struct bch_fs *c,
unsigned bch2_bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned *compression_type)
unsigned compression_type)
{
unsigned orig_dst = dst->bi_iter.bi_size;
unsigned orig_src = src->bi_iter.bi_size;
@ -423,29 +426,15 @@ void bch2_bio_compress(struct bch_fs *c,
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
c->sb.encoded_extent_max << 9);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size =
min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
/* If it's only one block, don't bother trying to compress: */
if (*compression_type != BCH_COMPRESSION_NONE &&
bio_sectors(src) > c->opts.block_size &&
!__bio_compress(c, dst, dst_len, src, src_len, compression_type))
goto out;
compression_type =
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
/* If compressing failed (didn't get smaller), just copy: */
*compression_type = BCH_COMPRESSION_NONE;
*dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
bio_copy_data(dst, src);
out:
dst->bi_iter.bi_size = orig_dst;
src->bi_iter.bi_size = orig_src;
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
return compression_type;
}
/* doesn't write superblock: */

@ -1,12 +1,14 @@
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
#include "extents_types.h"
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
unsigned, struct bch_extent_crc128);
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
struct bvec_iter, struct bch_extent_crc128);
void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
struct bio *, size_t *, unsigned *);
struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
struct bio *, size_t *, unsigned);
int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);

@ -19,6 +19,7 @@
#include "inode.h"
#include "journal.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
#include <trace/events/bcachefs.h>
@ -155,6 +156,44 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
unsigned ret = 0;
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (!ptr->cached &&
crc.compression_type != BCH_COMPRESSION_NONE &&
crc.compressed_size < crc.live_size)
ret = max_t(unsigned, ret, crc.compressed_size);
}
return ret;
}
bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
struct bch_extent_ptr m, u64 offset)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc)
if (ptr->dev == m.dev &&
ptr->gen == m.gen &&
(s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
(s64) m.offset - offset)
return ptr;
return NULL;
}
/* Doesn't clean up redundant crcs */
void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
{
@ -186,24 +225,30 @@ found:
bch2_extent_drop_ptr(e, ptr);
}
/* returns true if equal */
static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
{
return extent_crc_type(l) == extent_crc_type(r) &&
!memcmp(l, r, extent_entry_bytes(to_entry(l)));
return !u.compression_type &&
u.csum_type &&
u.uncompressed_size > u.live_size &&
bch2_csum_type_is_encryption(u.csum_type) ==
bch2_csum_type_is_encryption(n.csum_type);
}
/* Increment pointers after @crc by crc's offset until the next crc entry: */
void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
struct bch_extent_crc_unpacked n)
{
union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
if (!extent_entry_is_ptr(entry))
return;
if (!n.csum_type)
return false;
entry->ptr.offset += crc_offset(crc);
}
extent_for_each_crc(e, crc, i)
if (can_narrow_crc(crc, n))
return true;
return false;
}
/*
@ -214,96 +259,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr
* not compressed, we can modify them to point to only the data that is
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
*
* XXX: to guard against data being corrupted while in memory, instead of
* recomputing the checksum here it would be better to have the read path,
* rather than computing the checksum of the entire extent:
*
* | extent |
*
* compute the checksums of the live and dead data separately
* | dead data || live data || dead data |
*
* and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
* use crc_live here (that we verified was correct earlier)
*
* note: doesn't work with encryption
*/
void bch2_extent_narrow_crcs(struct bkey_s_extent e)
bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
struct bch_extent_crc_unpacked n)
{
union bch_extent_crc *crc;
bool have_wide = false, have_narrow = false;
struct bch_csum csum = { 0 };
unsigned csum_type = 0;
struct bch_extent_crc_unpacked u;
struct bch_extent_ptr *ptr;
union bch_extent_entry *i;
extent_for_each_crc(e, crc) {
if (crc_compression_type(crc) ||
bch2_csum_type_is_encryption(crc_csum_type(crc)))
continue;
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
have_wide = true;
} else {
have_narrow = true;
csum = crc_csum(crc);
csum_type = crc_csum_type(crc);
}
}
if (!have_wide || !have_narrow)
return;
extent_for_each_crc(e, crc) {
if (crc_compression_type(crc))
continue;
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
BUG();
case BCH_EXTENT_CRC32:
if (bch_crc_bytes[csum_type] > 4)
continue;
bch2_extent_crc_narrow_pointers(e, crc);
crc->crc32._compressed_size = e.k->size - 1;
crc->crc32._uncompressed_size = e.k->size - 1;
crc->crc32.offset = 0;
crc->crc32.csum_type = csum_type;
crc->crc32.csum = csum.lo;
break;
case BCH_EXTENT_CRC64:
if (bch_crc_bytes[csum_type] > 10)
continue;
bch2_extent_crc_narrow_pointers(e, crc);
crc->crc64._compressed_size = e.k->size - 1;
crc->crc64._uncompressed_size = e.k->size - 1;
crc->crc64.offset = 0;
crc->crc64.csum_type = csum_type;
crc->crc64.csum_lo = csum.lo;
crc->crc64.csum_hi = csum.hi;
break;
case BCH_EXTENT_CRC128:
if (bch_crc_bytes[csum_type] > 16)
continue;
bch2_extent_crc_narrow_pointers(e, crc);
crc->crc128._compressed_size = e.k->size - 1;
crc->crc128._uncompressed_size = e.k->size - 1;
crc->crc128.offset = 0;
crc->crc128.csum_type = csum_type;
crc->crc128.csum = csum;
/* Find a checksum entry that covers only live data: */
if (!n.csum_type)
extent_for_each_crc(extent_i_to_s(e), u, i)
if (!u.compression_type &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
n = u;
break;
}
if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
return false;
BUG_ON(n.compression_type);
BUG_ON(n.offset);
BUG_ON(n.live_size != e->k.size);
bch2_extent_crc_append(e, n);
restart_narrow_pointers:
extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
if (can_narrow_crc(u, n)) {
ptr->offset += u.offset;
extent_ptr_append(e, *ptr);
__bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
goto restart_narrow_pointers;
}
}
bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
return true;
}
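
Roughly what narrowing does to the unpacked crc of a split extent, as a self-contained sketch (made-up numbers; the struct mirrors bch_extent_crc_unpacked from extents_types.h further down):

#include <assert.h>
#include <stdint.h>

struct crc_unpacked_example {		/* mirrors bch_extent_crc_unpacked */
	uint8_t  csum_type;
	uint8_t  compression_type;
	uint16_t compressed_size;
	uint16_t uncompressed_size;
	uint16_t offset;
	uint16_t live_size;
};

int main(void)
{
	struct crc_unpacked_example wide = {
		.csum_type	   = 1,		/* some non-none checksum type */
		.compressed_size   = 128,
		.uncompressed_size = 128,
		.offset		   = 32,	/* key was split: live data starts 32 sectors in */
		.live_size	   = 64,
	};

	/* can_narrow_crc(): uncompressed, checksummed, wider than the live region */
	assert(!wide.compression_type &&
	       wide.csum_type &&
	       wide.uncompressed_size > wide.live_size);

	/* the appended narrow entry covers exactly the live sectors: */
	struct crc_unpacked_example narrow = {
		.csum_type	   = wide.csum_type,
		.compressed_size   = wide.live_size,
		.uncompressed_size = wide.live_size,
		.offset		   = 0,
		.live_size	   = wide.live_size,
	};
	assert(narrow.uncompressed_size == narrow.live_size);
	return 0;
}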
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
struct bch_extent_crc_unpacked u, prev_u;
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
@ -313,6 +312,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto next;
crc = entry_to_crc(entry);
u = bch2_extent_crc_unpack(e.k, crc);
if (next == extent_entry_last(e)) {
/* crc entry with no pointers after it: */
@ -324,20 +324,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto drop;
}
if (prev && crc_cmp(crc, prev)) {
if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
/* identical to previous crc entry: */
goto drop;
}
if (!prev &&
!crc_csum_type(crc) &&
!crc_compression_type(crc)) {
!u.csum_type &&
!u.compression_type) {
/* null crc entry: */
bch2_extent_crc_narrow_pointers(e, crc);
union bch_extent_entry *e2;
extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
if (!extent_entry_is_ptr(e2))
break;
e2->ptr.offset += u.offset;
}
goto drop;
}
prev = crc;
prev_u = u;
next:
entry = next;
continue;
@ -453,7 +461,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
{
char *out = buf, *end = buf + size;
const union bch_extent_entry *entry;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
struct bch_dev *ca;
bool first = true;
@ -468,13 +476,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = entry_to_crc(entry);
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
p("crc: c_size %u size %u offset %u csum %u compress %u",
crc_compressed_size(e.k, crc),
crc_uncompressed_size(e.k, crc),
crc_offset(crc), crc_csum_type(crc),
crc_compression_type(crc));
p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
crc.csum_type,
crc.compression_type);
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
@ -499,13 +508,24 @@ out:
return out - buf;
}
static inline bool dev_latency_better(struct bch_dev *dev1,
struct bch_dev *dev2)
{
unsigned l1 = atomic_read(&dev1->latency[READ]);
unsigned l2 = atomic_read(&dev2->latency[READ]);
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
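
The selection above keeps a candidate device with probability roughly proportional to the other device's latency, i.e. reads land on each device with probability inversely proportional to its measured latency. A userspace sketch (rand() standing in for bch2_rand_range(), which is assumed to return a uniform value in [0, n)):

#include <stdio.h>
#include <stdlib.h>

static unsigned rand_range(unsigned n)
{
	return rand() % n;		/* stand-in for bch2_rand_range() */
}

/* true when the candidate (latency l1) should replace the current pick (latency l2) */
static int dev_latency_better_sketch(unsigned l1, unsigned l2)
{
	return rand_range(l1 + l2) > l1;
}

int main(void)
{
	unsigned l1 = 100, l2 = 300, wins = 0, trials = 100000;

	for (unsigned i = 0; i < trials; i++)
		wins += dev_latency_better_sketch(l1, l2);

	/* expect roughly l2 / (l1 + l2) = 75% */
	printf("faster device preferred in %.1f%% of trials\n",
	       100.0 * wins / trials);
	return 0;
}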
static void extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
@ -516,12 +536,18 @@ static void extent_pick_read_device(struct bch_fs *c,
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (avoid && test_bit(ca->dev_idx, avoid->d))
if (avoid) {
if (test_bit(ca->dev_idx, avoid->d))
continue;
if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
continue;
if (pick->ca &&
test_bit(pick->ca->dev_idx, avoid->d))
goto use;
}
if (pick->ca && !dev_latency_better(ca, pick->ca))
continue;
use:
if (!percpu_ref_tryget(&ca->io_ref))
continue;
@ -530,11 +556,9 @@ static void extent_pick_read_device(struct bch_fs *c,
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.crc = crc,
.ca = ca,
};
if (e.k->size)
pick->crc = crc_to_128(e.k, crc);
}
}
@ -557,14 +581,17 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
const char *reason;
extent_for_each_entry(e, entry)
extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
extent_for_each_ptr_crc(e, ptr, crc) {
if (extent_entry_is_crc(entry))
return "has crc field";
}
extent_for_each_ptr(e, ptr) {
reason = extent_ptr_invalid(c, e, ptr,
c->opts.btree_node_size,
true);
@ -572,9 +599,6 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
return reason;
}
if (crc)
return "has crc field";
return NULL;
}
@ -699,28 +723,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
__set_bkey_deleted(k.k);
else if (bkey_extent_is_data(k.k)) {
struct bkey_s_extent e = bkey_s_to_extent(k);
struct bch_extent_ptr *ptr;
union bch_extent_crc *crc, *prev_crc = NULL;
union bch_extent_entry *entry;
bool seen_crc = false;
extent_for_each_ptr_crc(e, ptr, crc) {
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
ptr->offset += e.k->size - len;
extent_for_each_entry(e, entry) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
if (!seen_crc)
entry->ptr.offset += e.k->size - len;
break;
case BCH_EXTENT_CRC32:
if (prev_crc != crc)
crc->crc32.offset += e.k->size - len;
case BCH_EXTENT_ENTRY_crc32:
entry->crc32.offset += e.k->size - len;
break;
case BCH_EXTENT_CRC64:
if (prev_crc != crc)
crc->crc64.offset += e.k->size - len;
case BCH_EXTENT_ENTRY_crc64:
entry->crc64.offset += e.k->size - len;
break;
case BCH_EXTENT_CRC128:
if (prev_crc != crc)
crc->crc128.offset += e.k->size - len;
case BCH_EXTENT_ENTRY_crc128:
entry->crc128.offset += e.k->size - len;
break;
}
prev_crc = crc;
if (extent_entry_is_crc(entry))
seen_crc = true;
}
}
@ -989,7 +1013,7 @@ static void bch2_add_sectors(struct extent_insert_state *s,
return;
bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
&s->stats, s->trans->journal_res.seq);
&s->stats, s->trans->journal_res.seq, 0);
}
static void bch2_subtract_sectors(struct extent_insert_state *s,
@ -1123,7 +1147,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
bkey_cmp(s->committed, insert->k.p) &&
bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
/* XXX: possibly need to increase our reservation? */
bch2_cut_subtract_back(s, s->committed,
bkey_i_to_s(&split.k));
@ -1152,46 +1176,24 @@ done:
s->trans->did_work = true;
}
static enum extent_insert_hook_ret
static enum btree_insert_ret
__extent_insert_advance_pos(struct extent_insert_state *s,
struct bpos next_pos,
struct bkey_s_c k)
{
struct extent_insert_hook *hook = s->trans->hook;
enum extent_insert_hook_ret ret;
#if 0
/*
* Currently disabled for encryption - broken with fcollapse. Will have
* to reenable when versions are exposed for send/receive - versions
* will have to be monotonic then:
*/
if (k.k && k.k->size &&
!bversion_zero(s->insert->k->k.version) &&
bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
ret = BTREE_HOOK_NO_INSERT;
} else
#endif
enum btree_insert_ret ret;
if (hook)
ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
else
ret = BTREE_HOOK_DO_INSERT;
ret = BTREE_INSERT_OK;
EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
switch (ret) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
extent_insert_committed(s);
bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
break;
case BTREE_HOOK_RESTART_TRANS:
return ret;
}
if (ret == BTREE_INSERT_OK)
s->committed = next_pos;
return ret;
}
@ -1199,39 +1201,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
* Update iter->pos, marking how much of @insert we've processed, and call hook
* fn:
*/
static enum extent_insert_hook_ret
static enum btree_insert_ret
extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
{
struct btree *b = s->insert->iter->nodes[0];
struct bpos next_pos = bpos_min(s->insert->k->k.p,
k.k ? k.k->p : b->key.k.p);
enum btree_insert_ret ret;
if (race_fault())
return BTREE_INSERT_NEED_TRAVERSE;
/* hole? */
if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
bool have_uncommitted = bkey_cmp(s->committed,
bkey_start_pos(&s->insert->k->k)) > 0;
switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
bkey_s_c_null)) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
/*
* we had to split @insert and insert the committed
* part - need to bail out and recheck journal
* reservation/btree node before we advance pos past @k:
*/
if (have_uncommitted)
return BTREE_HOOK_NO_INSERT;
break;
case BTREE_HOOK_RESTART_TRANS:
return BTREE_HOOK_RESTART_TRANS;
}
ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
bkey_s_c_null);
if (ret != BTREE_INSERT_OK)
return ret;
}
/* avoid redundant calls to hook fn: */
if (!bkey_cmp(s->committed, next_pos))
return BTREE_HOOK_DO_INSERT;
return BTREE_INSERT_OK;
return __extent_insert_advance_pos(s, next_pos, k);
}
@ -1245,7 +1236,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
unsigned sectors;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bkey_extent_is_compressed(k))) {
(sectors = bch2_extent_is_compressed(k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
if (s->trans->flags & BTREE_INSERT_NOFAIL)
@ -1277,6 +1268,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
struct btree_iter *iter = s->insert->iter;
struct btree *b = iter->nodes[0];
struct btree_node_iter *node_iter = &iter->node_iters[0];
enum btree_insert_ret ret;
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
@ -1322,9 +1314,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
k.k->p = orig_pos;
extent_save(b, node_iter, _k, k.k);
if (extent_insert_advance_pos(s, k.s_c) ==
BTREE_HOOK_RESTART_TRANS)
return BTREE_INSERT_NEED_TRAVERSE;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret != BTREE_INSERT_OK)
return ret;
extent_insert_committed(s);
/*
@ -1420,15 +1412,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
if (ret != BTREE_INSERT_OK)
goto stop;
switch (extent_insert_advance_pos(s, k.s_c)) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
continue;
case BTREE_HOOK_RESTART_TRANS:
ret = BTREE_INSERT_NEED_TRAVERSE;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
goto stop;
}
s->do_journal = true;
@ -1469,10 +1455,9 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
if (bkey_cmp(s->committed, insert->k.p) < 0 &&
ret == BTREE_INSERT_OK &&
extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
ret = BTREE_INSERT_NEED_TRAVERSE;
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s->committed, insert->k.p) < 0)
ret = extent_insert_advance_pos(s, bkey_s_c_null);
stop:
extent_insert_committed(s);
@ -1594,18 +1579,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
/*
* Only call advance pos & call hook for nonzero size extents:
* If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
* overlaps with @k:
*/
switch (extent_insert_advance_pos(&s, k.s_c)) {
case BTREE_HOOK_DO_INSERT:
break;
case BTREE_HOOK_NO_INSERT:
continue;
case BTREE_HOOK_RESTART_TRANS:
ret = BTREE_INSERT_NEED_TRAVERSE;
ret = extent_insert_advance_pos(&s, k.s_c);
if (ret != BTREE_INSERT_OK)
goto stop;
}
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
@ -1623,10 +1600,9 @@ squash:
goto stop;
}
if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
ret == BTREE_INSERT_OK &&
extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
ret = BTREE_INSERT_NEED_TRAVERSE;
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s.committed, insert->k->k.p) < 0)
ret = extent_insert_advance_pos(&s, bkey_s_c_null);
stop:
extent_insert_committed(&s);
/*
@ -1669,29 +1645,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c,
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
unsigned size_ondisk = e.k->size;
const char *reason;
unsigned nonce = UINT_MAX;
extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
crc = entry_to_crc(entry);
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
if (crc_offset(crc) + e.k->size >
crc_uncompressed_size(e.k, crc))
if (crc.offset + e.k->size >
crc.uncompressed_size)
return "checksum offset + key size > uncompressed size";
size_ondisk = crc_compressed_size(e.k, crc);
size_ondisk = crc.compressed_size;
if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
if (!bch2_checksum_type_valid(c, crc.csum_type))
return "invalid checksum type";
if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
if (crc.compression_type >= BCH_COMPRESSION_NR)
return "invalid compression type";
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
else if (nonce != crc.offset + crc.nonce)
return "incorrect nonce";
}
} else {
ptr = entry_to_ptr(entry);
@ -1864,102 +1848,75 @@ static unsigned PTR_TIER(struct bch_fs *c,
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
unsigned nonce,
struct bch_csum csum, unsigned csum_type)
struct bch_extent_crc_unpacked new)
{
if (bch_crc_bytes[csum_type] <= 4 &&
uncompressed_size <= CRC32_SIZE_MAX &&
nonce <= CRC32_NONCE_MAX) {
#define common_fields(_crc) \
.csum_type = _crc.csum_type, \
.compression_type = _crc.compression_type, \
._compressed_size = _crc.compressed_size - 1, \
._uncompressed_size = _crc.uncompressed_size - 1, \
.offset = _crc.offset
if (bch_crc_bytes[new.csum_type] <= 4 &&
new.uncompressed_size <= CRC32_SIZE_MAX &&
new.nonce <= CRC32_NONCE_MAX) {
crc->crc32 = (struct bch_extent_crc32) {
.type = 1 << BCH_EXTENT_ENTRY_crc32,
._compressed_size = compressed_size - 1,
._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.compression_type = compression_type,
.csum_type = csum_type,
.csum = *((__le32 *) &csum.lo),
common_fields(new),
.csum = *((__le32 *) &new.csum.lo),
};
return;
}
if (bch_crc_bytes[csum_type] <= 10 &&
uncompressed_size <= CRC64_SIZE_MAX &&
nonce <= CRC64_NONCE_MAX) {
if (bch_crc_bytes[new.csum_type] <= 10 &&
new.uncompressed_size <= CRC64_SIZE_MAX &&
new.nonce <= CRC64_NONCE_MAX) {
crc->crc64 = (struct bch_extent_crc64) {
.type = 1 << BCH_EXTENT_ENTRY_crc64,
._compressed_size = compressed_size - 1,
._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.nonce = nonce,
.compression_type = compression_type,
.csum_type = csum_type,
.csum_lo = csum.lo,
.csum_hi = *((__le16 *) &csum.hi),
common_fields(new),
.nonce = new.nonce,
.csum_lo = new.csum.lo,
.csum_hi = *((__le16 *) &new.csum.hi),
};
return;
}
if (bch_crc_bytes[csum_type] <= 16 &&
uncompressed_size <= CRC128_SIZE_MAX &&
nonce <= CRC128_NONCE_MAX) {
if (bch_crc_bytes[new.csum_type] <= 16 &&
new.uncompressed_size <= CRC128_SIZE_MAX &&
new.nonce <= CRC128_NONCE_MAX) {
crc->crc128 = (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
._compressed_size = compressed_size - 1,
._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.nonce = nonce,
.compression_type = compression_type,
.csum_type = csum_type,
.csum = csum,
common_fields(new),
.nonce = new.nonce,
.csum = new.csum,
};
return;
}
#undef common_fields
BUG();
}
void bch2_extent_crc_append(struct bkey_i_extent *e,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
unsigned nonce,
struct bch_csum csum, unsigned csum_type)
struct bch_extent_crc_unpacked new)
{
union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
BUG_ON(compressed_size > uncompressed_size);
BUG_ON(uncompressed_size != e->k.size);
BUG_ON(!compressed_size || !uncompressed_size);
BUG_ON(new.compressed_size > new.uncompressed_size);
BUG_ON(new.live_size != e->k.size);
BUG_ON(!new.compressed_size || !new.uncompressed_size);
/*
* Look up the last crc entry, so we can check if we need to add
* another:
*/
extent_for_each_crc(extent_i_to_s(e), crc)
extent_for_each_crc(extent_i_to_s(e), crc, i)
;
if (!crc && !csum_type && !compression_type)
if (!memcmp(&crc, &new, sizeof(crc)))
return;
if (crc &&
crc_compressed_size(&e->k, crc) == compressed_size &&
crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
crc_offset(crc) == 0 &&
crc_nonce(crc) == nonce &&
crc_csum_type(crc) == csum_type &&
crc_compression_type(crc) == compression_type &&
crc_csum(crc).lo == csum.lo &&
crc_csum(crc).hi == csum.hi)
return;
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
compressed_size,
uncompressed_size,
compression_type,
nonce, csum, csum_type);
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
__extent_entry_push(e);
}
@ -2011,16 +1968,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
}
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e,
unsigned nr_cached)
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0, nr_good = 0;
bool have_higher_tier;
unsigned tier = 0;
if (!nr_cached)
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
nr_good++;
if (nr_good <= c->opts.data_replicas)
return;
nr_cached = nr_good - c->opts.data_replicas;
do {
have_higher_tier = false;

@ -3,7 +3,7 @@
#include "bcachefs.h"
#include "bkey.h"
#include "io_types.h"
#include "extents_types.h"
struct bch_fs;
struct journal_res;
@ -38,11 +38,17 @@ bch2_insert_fixup_extent(struct btree_insert *,
struct btree_insert_entry *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_mark_replicas_cached(struct bch_fs *,
struct bkey_s_extent, unsigned);
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
struct bch_extent_ptr, u64);
static inline bool bkey_extent_is_data(const struct bkey *k)
{
@ -67,6 +73,12 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
}
}
static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
{
return bkey_extent_is_allocation(k.k) &&
!bch2_extent_is_compressed(k);
}
static inline bool bkey_extent_is_cached(const struct bkey *k)
{
return k->type == BCH_EXTENT_CACHED;
@ -170,6 +182,8 @@ union bch_extent_crc {
(struct bch_extent_ptr *) (_entry)); \
})
/* checksum entries: */
enum bch_extent_crc_type {
BCH_EXTENT_CRC_NONE,
BCH_EXTENT_CRC32,
@ -208,6 +222,50 @@ __extent_crc_type(const union bch_extent_crc *crc)
: __extent_crc_type((union bch_extent_crc *) _crc); \
})
static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
#define common_fields(_crc) \
.csum_type = _crc.csum_type, \
.compression_type = _crc.compression_type, \
.compressed_size = _crc._compressed_size + 1, \
.uncompressed_size = _crc._uncompressed_size + 1, \
.offset = _crc.offset, \
.live_size = k->size
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_extent_crc_unpacked) {
.compressed_size = k->size,
.uncompressed_size = k->size,
.live_size = k->size,
};
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
.csum.lo = crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
.csum = crc->crc128.csum,
};
default:
BUG();
}
#undef common_fields
}
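
Note the size - 1 encoding: the packed fields store one less than the size, so an n-bit field covers 1..2^n sectors (a zero-size extent can't exist), and unpacking adds the 1 back. Minimal sketch with an illustrative 7-bit field (the real widths live in bcachefs_format.h):

#include <assert.h>

struct crc32_packed_example {
	unsigned _compressed_size:7;	/* assumed width, for illustration only */
};

int main(void)
{
	struct crc32_packed_example p = { ._compressed_size = 128 - 1 };

	/* unpack adds the 1 back, mirroring bch2_extent_crc_unpack(): */
	unsigned compressed_size = p._compressed_size + 1;

	assert(compressed_size == 128);
	return 0;
}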
/* Extent entry iteration: */
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
@ -226,7 +284,7 @@ __extent_crc_type(const union bch_extent_crc *crc)
/* Iterate over crcs only: */
#define extent_crc_next(_e, _p) \
#define __extent_crc_next(_e, _p) \
({ \
typeof(&(_e).v->start[0]) _entry = _p; \
\
@ -237,24 +295,40 @@ __extent_crc_type(const union bch_extent_crc *crc)
entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
})
#define extent_for_each_crc(_e, _crc) \
for ((_crc) = extent_crc_next(_e, (_e).v->start); \
#define __extent_for_each_crc(_e, _crc) \
for ((_crc) = __extent_crc_next(_e, (_e).v->start); \
(_crc); \
(_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
(_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
#define extent_crc_next(_e, _crc, _iter) \
({ \
extent_for_each_entry_from(_e, _iter, _iter) \
if (extent_entry_is_crc(_iter)) { \
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
break; \
} \
\
(_iter) < extent_entry_last(_e); \
})
#define extent_for_each_crc(_e, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_iter) = (_e).v->start; \
extent_crc_next(_e, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
/* Iterate over pointers, with crcs: */
#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
#define extent_ptr_crc_next(_e, _ptr, _crc) \
({ \
__label__ out; \
typeof(&(_e).v->start[0]) _entry; \
\
extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
if (extent_entry_is_crc(_entry)) { \
(_crc) = entry_to_crc(_entry); \
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
} else { \
_ptr = entry_to_ptr(_entry); \
if (_filter) \
goto out; \
} \
\
@ -263,34 +337,25 @@ out: \
_ptr; \
})
#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
for ((_crc) = NULL, \
(_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
(_ptr)++)
#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \
(_ptr)++)
/* Iterate over pointers only, and from a given position: */
#define extent_ptr_next_filter(_e, _ptr, _filter) \
#define extent_ptr_next(_e, _ptr) \
({ \
typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
struct bch_extent_crc_unpacked _crc; \
\
extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
extent_ptr_crc_next(_e, _ptr, _crc); \
})
#define extent_ptr_next(_e, _ptr) \
extent_ptr_next_filter(_e, _ptr, true)
#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
for ((_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
(_ptr)++)
#define extent_for_each_ptr(_e, _ptr) \
extent_for_each_ptr_filter(_e, _ptr, true)
for ((_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_next(_e, _ptr)); \
(_ptr)++)
#define extent_ptr_prev(_e, _ptr) \
({ \
@ -315,8 +380,8 @@ out: \
(_ptr); \
(_ptr) = extent_ptr_prev(_e, _ptr))
void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
unsigned, unsigned, struct bch_csum, unsigned);
void bch2_extent_crc_append(struct bkey_i_extent *,
struct bch_extent_crc_unpacked);
static inline void __extent_entry_push(struct bkey_i_extent *e)
{
@ -336,226 +401,26 @@ static inline void extent_ptr_append(struct bkey_i_extent *e,
__extent_entry_push(e);
}
static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
const union bch_extent_crc *crc)
static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
{
EBUG_ON(!k->size);
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_extent_crc128) {
._compressed_size = k->size - 1,
._uncompressed_size = k->size - 1,
};
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
._compressed_size = crc->crc32._compressed_size,
._uncompressed_size = crc->crc32._uncompressed_size,
.offset = crc->crc32.offset,
.csum_type = crc->crc32.csum_type,
.compression_type = crc->crc32.compression_type,
.csum.lo = crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
._compressed_size = crc->crc64._compressed_size,
._uncompressed_size = crc->crc64._uncompressed_size,
.offset = crc->crc64.offset,
.nonce = crc->crc64.nonce,
.csum_type = crc->crc64.csum_type,
.compression_type = crc->crc64.compression_type,
.csum.lo = crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return crc->crc128;
default:
BUG();
}
}
#define crc_compressed_size(_k, _crc) \
({ \
unsigned _size = 0; \
\
switch (extent_crc_type(_crc)) { \
case BCH_EXTENT_CRC_NONE: \
_size = ((const struct bkey *) (_k))->size; \
break; \
case BCH_EXTENT_CRC32: \
_size = ((struct bch_extent_crc32 *) _crc) \
->_compressed_size + 1; \
break; \
case BCH_EXTENT_CRC64: \
_size = ((struct bch_extent_crc64 *) _crc) \
->_compressed_size + 1; \
break; \
case BCH_EXTENT_CRC128: \
_size = ((struct bch_extent_crc128 *) _crc) \
->_compressed_size + 1; \
break; \
} \
_size; \
})
#define crc_uncompressed_size(_k, _crc) \
({ \
unsigned _size = 0; \
\
switch (extent_crc_type(_crc)) { \
case BCH_EXTENT_CRC_NONE: \
_size = ((const struct bkey *) (_k))->size; \
break; \
case BCH_EXTENT_CRC32: \
_size = ((struct bch_extent_crc32 *) _crc) \
->_uncompressed_size + 1; \
break; \
case BCH_EXTENT_CRC64: \
_size = ((struct bch_extent_crc64 *) _crc) \
->_uncompressed_size + 1; \
break; \
case BCH_EXTENT_CRC128: \
_size = ((struct bch_extent_crc128 *) _crc) \
->_uncompressed_size + 1; \
break; \
} \
_size; \
})
static inline unsigned crc_offset(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return 0;
case BCH_EXTENT_CRC32:
return crc->crc32.offset;
case BCH_EXTENT_CRC64:
return crc->crc64.offset;
case BCH_EXTENT_CRC128:
return crc->crc128.offset;
default:
BUG();
}
}
static inline unsigned crc_nonce(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
case BCH_EXTENT_CRC32:
return 0;
case BCH_EXTENT_CRC64:
return crc->crc64.nonce;
case BCH_EXTENT_CRC128:
return crc->crc128.nonce;
default:
BUG();
}
}
static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return 0;
case BCH_EXTENT_CRC32:
return crc->crc32.csum_type;
case BCH_EXTENT_CRC64:
return crc->crc64.csum_type;
case BCH_EXTENT_CRC128:
return crc->crc128.csum_type;
default:
BUG();
}
}
static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return 0;
case BCH_EXTENT_CRC32:
return crc->crc32.compression_type;
case BCH_EXTENT_CRC64:
return crc->crc64.compression_type;
case BCH_EXTENT_CRC128:
return crc->crc128.compression_type;
default:
BUG();
}
}
static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_csum) { 0 };
case BCH_EXTENT_CRC32:
return (struct bch_csum) { .lo = crc->crc32.csum };
case BCH_EXTENT_CRC64:
return (struct bch_csum) {
.lo = crc->crc64.csum_lo,
.hi = crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return crc->crc128.csum;
default:
BUG();
}
}
static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
unsigned ret = 0;
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (!ptr->cached &&
crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
crc_compressed_size(e.k, crc) < k.k->size)
ret = max_t(unsigned, ret,
crc_compressed_size(e.k, crc));
}
extent_for_each_ptr(e, ptr)
ret.devs[ret.nr++] = ptr->dev;
return ret;
}
static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
{
const union bch_extent_crc *crc;
extent_for_each_crc(e, crc)
if (bch2_csum_type_is_encryption(crc_csum_type(crc)))
return crc_offset(crc) + crc_nonce(crc);
return 0;
}
void bch2_extent_narrow_crcs(struct bkey_s_extent);
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
struct bch_extent_ptr *
bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent,
struct bch_extent_ptr);
struct bch_extent_ptr *
bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent,
struct bkey_s_c_extent);
bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);

@ -0,0 +1,27 @@
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H
#include "bcachefs_format.h"
struct bch_extent_crc_unpacked {
u8 csum_type;
u8 compression_type;
u16 compressed_size;
u16 uncompressed_size;
u16 offset;
u16 live_size;
u16 nonce;
struct bch_csum csum;
};
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */

@ -80,7 +80,7 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
EBUG_ON(i >= size);
if (eytzinger1_left_child(i) < size) {
i = eytzinger1_left_child(i);
i = eytzinger1_left_child(i) + 1;
i <<= __fls(size) - __fls(i);
i -= 1;
@ -163,38 +163,6 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
#if 0
void eytzinger0_test(void)
{
unsigned i, j, size;
for (size = 2;
size < 65536000;
size++) {
if (!(size % 4096))
printk(KERN_INFO "tree size %u\n", size);
assert(eytzinger1_prev(0, size) == eytzinger1_last(size));
assert(eytzinger1_next(0, size) == eytzinger1_first(size));
assert(eytzinger1_prev(eytzinger1_first(size), size) == 0);
assert(eytzinger1_next(eytzinger1_last(size), size) == 0);
eytzinger1_for_each(j, size) {
assert(from_inorder(i, size) == j);
assert(to_inorder(j, size) == i);
if (j != eytzinger1_last(size)) {
unsigned next = eytzinger1_next(j, size);
assert(eytzinger1_prev(next, size) == j);
}
}
}
}
#endif
/* Zero based indexing version: */
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
@ -214,27 +182,29 @@ static inline unsigned eytzinger0_right_child(unsigned i)
return eytzinger0_child(i, 1);
}
#if 0
static inline unsigned eytzinger0_first(unsigned size)
{
return eytzinger1_first(size + 1) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
return eytzinger1_last(size + 1) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
return eytzinger1_next(i + 1, size + 1) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
return eytzinger1_prev(i + 1, size + 1) - 1;
}
#endif
static inline unsigned eytzinger0_extra(unsigned size)
{
return (size + 1 - rounddown_pow_of_two(size)) << 1;
return eytzinger1_extra(size + 1);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
@ -259,10 +229,41 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}
#define eytzinger0_for_each(_i, _size) \
for ((_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search)
{
unsigned i, n = 0;
if (!nr)
return -1;
do {
i = n;
n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
} while (n < nr);
if (n & 1) {
/* @i was greater than @search, return previous node: */
if (i == eytzinger0_first(nr))
return -1;
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, void *search)
eytzinger_cmp_fn cmp, const void *search)
{
size_t i = 0;
int res;
@ -271,17 +272,6 @@ static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
(res = cmp(search, base + i * size, size)))
i = eytzinger0_child(i, res > 0);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bool found1 = i < nr, found2 = false;
size_t j;
for (j = 0; j < nr; j++)
if (!cmp(base + j * size, search, size))
found2 = true;
BUG_ON(found1 != found2);
}
return i;
}

@ -26,9 +26,67 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
struct bio_set *bch2_writepage_bioset;
struct bio_set *bch2_dio_read_bioset;
struct bio_set *bch2_dio_write_bioset;
struct i_sectors_hook {
struct extent_insert_hook hook;
s64 sectors;
struct bch_inode_info *inode;
};
struct bchfs_write_op {
struct bch_inode_info *inode;
s64 sectors_added;
bool is_dio;
bool unalloc;
u64 new_i_size;
/* must be last: */
struct bch_write_op op;
};
static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
struct bch_inode_info *inode,
bool is_dio)
{
op->inode = inode;
op->sectors_added = 0;
op->is_dio = is_dio;
op->unalloc = false;
op->new_i_size = U64_MAX;
}
struct bch_writepage_io {
struct closure cl;
/* must be last: */
struct bchfs_write_op op;
};
struct dio_write {
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
long written;
long error;
loff_t offset;
struct disk_reservation res;
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
struct task_struct *task;
/* must be last: */
struct bchfs_write_op iop;
};
struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
struct bch_read_bio rbio;
};
/* pagecache_block must be held */
static int write_invalidate_inode_pages_range(struct address_space *mapping,
@ -101,7 +159,7 @@ static inline void i_size_dirty_get(struct bch_inode_info *inode)
/* i_sectors accounting: */
static enum extent_insert_hook_ret
static enum btree_insert_ret
i_sectors_hook_fn(struct extent_insert_hook *hook,
struct bpos committed_pos,
struct bpos next_pos,
@ -119,7 +177,7 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
h->sectors += sectors * sign;
return BTREE_HOOK_DO_INSERT;
return BTREE_INSERT_OK;
}
static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
@ -208,7 +266,7 @@ struct bchfs_extent_trans_hook {
bool need_inode_update;
};
static enum extent_insert_hook_ret
static enum btree_insert_ret
bchfs_extent_update_hook(struct extent_insert_hook *hook,
struct bpos committed_pos,
struct bpos next_pos,
@ -224,6 +282,10 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
bool do_pack = false;
if (h->op->unalloc &&
!bch2_extent_is_fully_allocated(k))
return BTREE_INSERT_ENOSPC;
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
/* XXX: inode->i_size locking */
@ -232,7 +294,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_HOOK_RESTART_TRANS;
return BTREE_INSERT_NEED_TRAVERSE;
}
h->inode_u.bi_size = offset;
@ -247,7 +309,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (sectors) {
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_HOOK_RESTART_TRANS;
return BTREE_INSERT_NEED_TRAVERSE;
}
h->inode_u.bi_sectors += sectors;
@ -267,7 +329,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (do_pack)
bch2_inode_pack(&h->inode_p, &h->inode_u);
return BTREE_HOOK_DO_INSERT;
return BTREE_INSERT_OK;
}
static int bchfs_write_index_update(struct bch_write_op *wop)
@ -352,12 +414,16 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(&extent_iter, k));
}
BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
BUG_ON(!ret != !k->k.size);
err:
if (ret == -EINTR)
continue;
if (ret)
break;
BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
@ -748,8 +814,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
if (!bkey_extent_is_allocation(k.k) ||
bkey_extent_is_compressed(k))
if (!bch2_extent_is_fully_allocated(k))
bch2_mark_pages_unalloc(bio);
if (pick.ca) {
@ -759,7 +824,8 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
trace_read_split(&rbio->bio);
}
bch2_read_extent(c, rbio, k, &pick, flags);
bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
&pick, flags);
} else {
zero_fill_bio(bio);
@ -963,22 +1029,20 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
alloc_io:
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
bch2_writepage_bioset),
&c->writepage_bioset),
struct bch_writepage_io, op.op.wbio.bio);
closure_init(&w->io->cl, NULL);
w->io->op.inode = inode;
w->io->op.sectors_added = 0;
w->io->op.is_dio = false;
bch2_fswrite_op_init(&w->io->op, inode, false);
bch2_write_op_init(&w->io->op.op, c,
(struct disk_reservation) {
.nr_replicas = c->opts.data_replicas,
},
c->fastest_devs,
inode->ei_last_dirtied,
writepoint_hashed(inode->ei_last_dirtied),
POS(inum, 0),
&inode->ei_journal_seq,
BCH_WRITE_THROTTLE);
0);
w->io->op.op.index_update_fn = bchfs_write_index_update;
}
@ -1409,7 +1473,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
bch2_dio_read_bioset);
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@ -1541,20 +1605,19 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
return;
}
dio->iop.inode = inode;
dio->iop.sectors_added = 0;
dio->iop.is_dio = true;
dio->iop.new_i_size = U64_MAX;
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
dio->c->fastest_devs,
(unsigned long) dio->task,
writepoint_hashed((unsigned long) dio->task),
POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
&inode->ei_journal_seq,
flags|BCH_WRITE_THROTTLE);
flags);
dio->iop.op.index_update_fn = bchfs_write_index_update;
if (!dio->iop.unalloc) {
dio->res.sectors -= bio_sectors(bio);
dio->iop.op.res.sectors = bio_sectors(bio);
}
task_io_account_write(bio->bi_iter.bi_size);
@ -1589,6 +1652,31 @@ static void bch2_dio_write_loop_async(struct closure *cl)
}
}
static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
u64 size)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
int ret = 0;
end.offset += size;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
BTREE_ITER_WITH_HOLES, k) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bch2_extent_is_fully_allocated(k)) {
ret = -ENOSPC;
break;
}
}
bch2_btree_iter_unlock(&iter);
return ret;
}
static int bch2_direct_IO_write(struct bch_fs *c,
struct kiocb *req, struct file *file,
struct bch_inode_info *inode,
@ -1610,8 +1698,9 @@ static int bch2_direct_IO_write(struct bch_fs *c,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
bch2_dio_write_bioset);
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
closure_init(&dio->cl, NULL);
dio->req = req;
dio->c = c;
dio->written = 0;
@ -1620,7 +1709,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
dio->iovec = NULL;
dio->iter = *iter;
dio->task = current;
closure_init(&dio->cl, NULL);
bch2_fswrite_op_init(&dio->iop, inode, true);
if (offset + iter->count > inode->v.i_size)
sync = true;
@ -1635,11 +1724,17 @@ static int bch2_direct_IO_write(struct bch_fs *c,
*/
ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
iter->count >> 9)) {
closure_debug_destroy(&dio->cl);
bio_put(bio);
return ret;
}
dio->iop.unalloc = true;
}
inode_dio_begin(&inode->v);
__pagecache_block_get(&mapping->add_lock);
@ -2318,7 +2413,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
if (reservation.v.nr_replicas < replicas ||
bkey_extent_is_compressed(k)) {
bch2_extent_is_compressed(k)) {
ret = bch2_disk_reservation_get(c, &disk_res,
sectors, 0);
if (ret)
@ -2564,4 +2659,24 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
void bch2_fs_fsio_exit(struct bch_fs *c)
{
bioset_exit(&c->dio_write_bioset);
bioset_exit(&c->dio_read_bioset);
bioset_exit(&c->writepage_bioset);
}
int bch2_fs_fsio_init(struct bch_fs *c)
{
if (bioset_init(&c->writepage_bioset,
4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
bioset_init(&c->dio_read_bioset,
4, offsetof(struct dio_read, rbio.bio)) ||
bioset_init(&c->dio_write_bioset,
4, offsetof(struct dio_write, iop.op.wbio.bio)))
return -ENOMEM;
return 0;
}
#endif /* NO_BCACHEFS_FS */

View File

@ -1,7 +1,11 @@
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H
#ifndef NO_BCACHEFS_FS
#include "buckets.h"
#include "io_types.h"
#include <linux/uio.h>
int bch2_set_page_dirty(struct page *);
@ -35,60 +39,11 @@ int bch2_releasepage(struct page *, gfp_t);
int bch2_migrate_page(struct address_space *, struct page *,
struct page *, enum migrate_mode);
struct i_sectors_hook {
struct extent_insert_hook hook;
s64 sectors;
struct bch_inode_info *inode;
};
struct bchfs_write_op {
struct bch_inode_info *inode;
s64 sectors_added;
bool is_dio;
u64 new_i_size;
/* must be last: */
struct bch_write_op op;
};
struct bch_writepage_io {
struct closure cl;
/* must be last: */
struct bchfs_write_op op;
};
extern struct bio_set *bch2_writepage_bioset;
struct dio_write {
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
long written;
long error;
loff_t offset;
struct disk_reservation res;
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
struct task_struct *task;
/* must be last: */
struct bchfs_write_op iop;
};
extern struct bio_set *bch2_dio_write_bioset;
struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
struct bch_read_bio rbio;
};
extern struct bio_set *bch2_dio_read_bioset;
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
#else
static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
#endif
#endif /* _BCACHEFS_FS_IO_H */

View File

@ -654,17 +654,17 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
if (bkey_extent_is_data(&k->k)) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
int ret;
extent_for_each_ptr_crc(e, ptr, crc) {
int flags2 = 0;
u64 offset = ptr->offset;
if (crc_compression_type(crc))
if (crc.compression_type)
flags2 |= FIEMAP_EXTENT_ENCODED;
else
offset += crc_offset(crc);
offset += crc.offset;
if ((offset & (PAGE_SECTORS - 1)) ||
(e.k->size & (PAGE_SECTORS - 1)))
@ -1336,12 +1336,6 @@ MODULE_ALIAS_FS("bcachefs");
void bch2_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
if (bch2_dio_write_bioset)
bioset_free(bch2_dio_write_bioset);
if (bch2_dio_read_bioset)
bioset_free(bch2_dio_read_bioset);
if (bch2_writepage_bioset)
bioset_free(bch2_writepage_bioset);
if (bch2_inode_cache)
kmem_cache_destroy(bch2_inode_cache);
}
@ -1354,20 +1348,6 @@ int __init bch2_vfs_init(void)
if (!bch2_inode_cache)
goto err;
bch2_writepage_bioset =
bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
if (!bch2_writepage_bioset)
goto err;
bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
if (!bch2_dio_read_bioset)
goto err;
bch2_dio_write_bioset =
bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
if (!bch2_dio_write_bioset)
goto err;
ret = register_filesystem(&bcache_fs_type);
if (ret)
goto err;

File diff suppressed because it is too large

View File

@ -2,6 +2,8 @@
#define _BCACHEFS_IO_H
#include <linux/hash.h>
#include "alloc.h"
#include "checksum.h"
#include "io_types.h"
#define to_wbio(_bio) \
@ -12,6 +14,9 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
@ -20,14 +25,15 @@ enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
BCH_WRITE_THROTTLE = (1 << 4),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
BCH_WRITE_DATA_ENCODED = (1 << 3),
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
BCH_WRITE_DONE = (1 << 7),
BCH_WRITE_LOOPED = (1 << 8),
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
BCH_WRITE_DONE = (1 << 8),
BCH_WRITE_LOOPED = (1 << 9),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -36,11 +42,60 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
struct disk_reservation,
struct bch_devs_mask *,
unsigned long,
struct bpos, u64 *, unsigned);
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
? op->c->copygc_wq
: op->c->wq;
}
int bch2_write_index_default(struct bch_write_op *);
static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
op->csum_type = bch2_data_checksum_type(c);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
op->open_buckets_nr = 0;
op->devs_have.nr = 0;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->devs = NULL;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
op->journal_seq = 0;
op->index_update_fn = bch2_write_index_default;
}
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct disk_reservation res,
struct bch_devs_mask *devs,
struct write_point_specifier write_point,
struct bpos pos,
u64 *journal_seq, unsigned flags)
{
__bch2_write_op_init(op, c);
op->flags = flags;
op->nr_replicas = res.nr_replicas;
op->pos = pos;
op->res = res;
op->devs = devs;
op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
}
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@ -51,14 +106,13 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio;
}
void bch2_wake_delayed_writes(unsigned long data);
struct bch_devs_mask;
struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bkey_s_c k, struct extent_pick_ptr *, unsigned);
struct bkey_s_c_extent e, struct extent_pick_ptr *,
unsigned);
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
@ -66,21 +120,22 @@ enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 3,
BCH_READ_MUST_CLONE = 1 << 4,
BCH_READ_IN_RETRY = 1 << 5,
BCH_READ_MUST_BOUNCE = 1 << 4,
BCH_READ_MUST_CLONE = 1 << 5,
BCH_READ_IN_RETRY = 1 << 6,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c k,
struct bkey_s_c_extent e,
struct extent_pick_ptr *pick,
unsigned flags)
{
rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags);
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,

View File

@ -1,20 +1,16 @@
#ifndef _BCACHEFS_IO_TYPES_H
#define _BCACHEFS_IO_TYPES_H
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
#include "super_types.h"
#include <linux/llist.h>
#include <linux/workqueue.h>
struct extent_pick_ptr {
struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct bch_dev *ca;
};
struct bch_read_bio {
struct bch_fs *c;
@ -44,26 +40,22 @@ struct bch_read_bio {
struct {
u8 bounce:1,
split:1,
process_context:1,
retry:2;
narrow_crcs:1,
retry:2,
context:2;
};
u8 _state;
};
struct bch_devs_list devs_have;
struct extent_pick_ptr pick;
/* start pos of data we read (may not be pos of data we want) */
struct bpos pos;
struct bversion version;
struct promote_op *promote;
/*
* If we have to retry the read (IO error, checksum failure, read stale
* data (raced with allocator)), we retry the portion of the parent bio
* that failed (i.e. this bio's portion, bvec_iter).
*
* But we need to stash the inode somewhere:
*/
u64 inode;
struct work_struct work;
struct bio bio;
@ -98,36 +90,33 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
unsigned written; /* sectors */
short error;
u16 flags;
u16 written; /* sectors */
s8 error;
unsigned csum_type:4;
unsigned compression_type:4;
unsigned nr_replicas:4;
unsigned nr_replicas_required:4;
unsigned alloc_reserve:4;
unsigned nonce:14;
u8 open_buckets_nr;
struct bch_devs_list devs_have;
u16 target;
u16 nonce;
struct bpos pos;
struct bversion version;
/* For BCH_WRITE_DATA_COMPRESSED: */
struct bch_extent_crc128 crc;
unsigned size;
/* For BCH_WRITE_DATA_ENCODED: */
struct bch_extent_crc_unpacked crc;
struct bch_devs_mask *devs;
unsigned long write_point;
struct write_point_specifier write_point;
struct disk_reservation res;
union {
u8 open_buckets[16];
struct {
struct bch_write_op *next;
unsigned long expires;
};
};
/*
* If caller wants to flush but hasn't passed us a journal_seq ptr, we

View File

@ -464,7 +464,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j,
if (invalid) {
bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
bkey_i_to_s_c(k));
mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf);
mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
type, invalid, buf);
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@ -1568,35 +1569,31 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
spin_unlock(&j->lock);
while (ja->nr < nr) {
/* must happen under journal lock, to avoid racing with gc: */
long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC);
if (b < 0) {
if (!closure_wait(&c->freelist_wait, &cl)) {
spin_unlock(&j->lock);
struct open_bucket *ob;
size_t bucket;
int ob_idx;
ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
if (ob_idx < 0) {
if (!closure_wait(&c->freelist_wait, &cl))
closure_sync(&cl);
spin_lock(&j->lock);
}
continue;
}
bch2_mark_metadata_bucket(ca, &ca->buckets[b],
BUCKET_JOURNAL, false);
bch2_mark_alloc_bucket(ca, &ca->buckets[b], false);
ob = c->open_buckets + ob_idx;
bucket = sector_to_bucket(ca, ob->ptr.offset);
memmove(ja->buckets + ja->last_idx + 1,
ja->buckets + ja->last_idx,
(ja->nr - ja->last_idx) * sizeof(u64));
memmove(ja->bucket_seq + ja->last_idx + 1,
ja->bucket_seq + ja->last_idx,
(ja->nr - ja->last_idx) * sizeof(u64));
memmove(journal_buckets->buckets + ja->last_idx + 1,
journal_buckets->buckets + ja->last_idx,
(ja->nr - ja->last_idx) * sizeof(u64));
spin_lock(&j->lock);
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
ja->buckets[ja->last_idx] = b;
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
ja->buckets[ja->last_idx] = bucket;
ja->bucket_seq[ja->last_idx] = 0;
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
if (ja->last_idx < ja->nr) {
if (ja->cur_idx >= ja->last_idx)
@ -1604,10 +1601,15 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
ja->last_idx++;
}
ja->nr++;
}
spin_unlock(&j->lock);
bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket],
BUCKET_JOURNAL,
gc_phase(GC_PHASE_SB), 0);
bch2_open_bucket_put(c, ob);
}
BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
bch2_write_super(c);
@ -1623,6 +1625,8 @@ err:
if (!ret)
bch2_dev_allocator_add(c, ca);
closure_sync(&cl);
return ret;
}

View File

@ -7,8 +7,7 @@ int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys,
size_t nr_inline_u64s)
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
{
l->top_p = l->keys_p = inline_keys;
}
@ -17,7 +16,7 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
memset(l, 0, sizeof(*l));
bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)

View File

@ -13,31 +13,16 @@
#include "move.h"
#include "super-io.h"
static int issue_migration_move(struct bch_dev *ca,
struct moving_context *ctxt,
struct bch_devs_mask *devs,
struct bkey_s_c k)
static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
{
struct bch_fs *c = ca->fs;
struct disk_reservation res;
struct bch_dev *ca = arg;
const struct bch_extent_ptr *ptr;
int ret;
if (bch2_disk_reservation_get(c, &res, k.k->size, 0))
return -ENOSPC;
extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
extent_for_each_ptr(e, ptr)
if (ptr->dev == ca->dev_idx)
goto found;
return true;
BUG();
found:
/* XXX: we need to be doing something with the disk reservation */
ret = bch2_data_move(c, ctxt, devs, k, ptr);
if (ret)
bch2_disk_reservation_put(c, &res);
return ret;
return false;
}
#define MAX_DATA_OFF_ITER 10
@ -58,10 +43,11 @@ found:
int bch2_move_data_off_device(struct bch_dev *ca)
{
struct moving_context ctxt;
struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
unsigned pass = 0;
u64 seen_key_count;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
@ -69,12 +55,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
return 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
__set_bit(ca->dev_idx, ctxt.avoid.d);
/*
* In theory, only one pass should be necessary as we've
* quiesced all writes before calling this.
@ -91,69 +71,43 @@ int bch2_move_data_off_device(struct bch_dev *ca)
* Thus this scans the tree one more time than strictly necessary,
* but that can be viewed as a verification pass.
*/
do {
struct btree_iter iter;
struct bkey_s_c k;
seen_key_count = 0;
atomic_set(&ctxt.error_count, 0);
atomic_set(&ctxt.error_flags, 0);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
while (!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch2_extent_has_device(bkey_s_c_to_extent(k),
ca->dev_idx))
goto next;
ret = issue_migration_move(ca, &ctxt, NULL, k);
if (ret == -ENOMEM) {
bch2_btree_iter_unlock(&iter);
/*
* memory allocation failure, wait for some IO
* to finish
*/
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
ret = bch2_move_data(c, NULL,
SECTORS_IN_FLIGHT_PER_DEVICE,
NULL,
writepoint_hashed((unsigned long) current),
0,
ca->dev_idx,
migrate_pred, ca,
&keys_moved,
&sectors_moved);
if (ret) {
bch_err(c, "error migrating data: %i", ret);
return ret;
}
if (ret == -ENOSPC)
break;
BUG_ON(ret);
} while (keys_moved && pass++ < MAX_DATA_OFF_ITER);
seen_key_count++;
if (keys_moved) {
bch_err(c, "unable to migrate all data in %d iterations",
MAX_DATA_OFF_ITER);
return -1;
}
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
if (!bkey_extent_is_data(k.k))
continue;
next:
if (bkey_extent_is_data(k.k)) {
ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
BCH_DATA_USER);
if (ret)
if (ret) {
bch_err(c, "error migrating data %i from check_mark_super()", ret);
break;
}
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
if (ret)
goto err;
} while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
if (seen_key_count) {
pr_err("Unable to migrate all data in %d iterations.",
MAX_DATA_OFF_ITER);
ret = -1;
goto err;
}
err:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
@ -167,14 +121,11 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
struct btree_iter iter;
struct closure cl;
struct btree *b;
int ret;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
closure_init_stack(&cl);
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);

View File

@ -9,41 +9,38 @@
#include "keylist.h"
#include <linux/ioprio.h>
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
struct bkey_s_extent e,
struct bch_extent_ptr ptr)
{
struct bch_extent_ptr *ptr2;
struct bch_dev *ca = c->devs[ptr.dev];
struct moving_io {
struct list_head list;
struct closure cl;
bool read_completed;
unsigned sectors;
extent_for_each_ptr(e, ptr2)
if (ptr2->dev == ptr.dev &&
ptr2->gen == ptr.gen &&
PTR_BUCKET_NR(ca, ptr2) ==
PTR_BUCKET_NR(ca, &ptr))
return ptr2;
struct bch_read_bio rbio;
return NULL;
}
struct migrate_write write;
/* Must be last since it is variable size */
struct bio_vec bi_inline_vecs[0];
};
static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m,
struct bkey_s_extent e)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_ptr *ret;
struct moving_context {
/* Closure for waiting on all reads and writes to complete */
struct closure cl;
if (m->move)
ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
else
extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
break;
/* Key and sector moves issued, updated from submission context */
u64 keys_moved;
u64 sectors_moved;
atomic64_t sectors_raced;
return ret;
}
struct list_head reads;
atomic_t sectors_in_flight;
wait_queue_head_t wait;
};
static int bch2_migrate_index_update(struct bch_write_op *op)
{
@ -59,71 +56,78 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
BTREE_ITER_INTENT);
while (1) {
struct bkey_s_extent insert =
bkey_i_to_s_extent(bch2_keylist_front(keys));
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
struct bkey_i_extent *insert, *new =
bkey_i_to_extent(bch2_keylist_front(keys));
BKEY_PADDED(k) _new, _insert;
struct bch_extent_ptr *ptr;
struct bkey_s_extent e;
BKEY_PADDED(k) new;
struct bch_extent_crc_unpacked crc;
bool did_work = false;
if (!k.k) {
if (btree_iter_err(k)) {
ret = bch2_btree_iter_unlock(&iter);
break;
}
if (!bkey_extent_is_data(k.k))
if (bversion_cmp(k.k->version, new->k.version) ||
!bkey_extent_is_data(k.k) ||
!bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
m->ptr, m->offset))
goto nomatch;
bkey_reassemble(&new.k, k);
bch2_cut_front(iter.pos, &new.k);
bch2_cut_back(insert.k->p, &new.k.k);
e = bkey_i_to_s_extent(&new.k);
bkey_reassemble(&_insert.k, k);
insert = bkey_i_to_extent(&_insert.k);
/* hack - promotes can race: */
if (m->promote)
extent_for_each_ptr(insert, ptr)
if (bch2_extent_has_device(e.c, ptr->dev))
goto nomatch;
bkey_copy(&_new.k, bch2_keylist_front(keys));
new = bkey_i_to_extent(&_new.k);
ptr = bch2_migrate_matching_ptr(m, e);
if (ptr) {
int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c);
unsigned insert_flags =
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL;
bch2_cut_front(iter.pos, &insert->k_i);
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
/* copygc uses btree node reserve: */
if (m->move)
insert_flags |= BTREE_INSERT_USE_RESERVE;
if (m->move_dev >= 0 &&
(ptr = (struct bch_extent_ptr *)
bch2_extent_has_device(extent_i_to_s_c(insert),
m->move_dev)))
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
if (m->move) {
nr_new_dirty -= !ptr->cached;
__bch2_extent_drop_ptr(e, ptr);
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
/*
* raced with another move op? extent already
* has a pointer to the device we just wrote
* data to
*/
continue;
}
BUG_ON(nr_new_dirty < 0);
bch2_extent_crc_append(insert, crc);
extent_ptr_append(insert, *ptr);
did_work = true;
}
memcpy_u64s(extent_entry_last(e),
insert.v,
bkey_val_u64s(insert.k));
e.k->u64s += bkey_val_u64s(insert.k);
if (!did_work)
goto nomatch;
bch2_extent_narrow_crcs(e);
bch2_extent_drop_redundant_crcs(e);
bch2_extent_normalize(c, e.s);
bch2_extent_mark_replicas_cached(c, e, nr_new_dirty);
bch2_extent_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, extent_i_to_s(insert).s);
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
ret = bch2_btree_insert_at(c, &op->res,
NULL, op_journal_seq(op),
insert_flags,
BTREE_INSERT_ENTRY(&iter, &new.k));
if (ret && ret != -EINTR)
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
m->btree_insert_flags,
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
if (ret == -EINTR)
ret = 0;
if (ret)
break;
} else {
nomatch:
bch2_btree_iter_advance_pos(&iter);
}
next:
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
bch2_keylist_pop_front(keys);
if (bch2_keylist_empty(keys))
@ -131,96 +135,83 @@ nomatch:
}
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
continue;
nomatch:
if (m->ctxt)
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->sectors_raced);
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
bch2_btree_iter_advance_pos(&iter);
goto next;
}
out:
bch2_btree_iter_unlock(&iter);
return ret;
}
void bch2_migrate_write_init(struct bch_fs *c,
struct migrate_write *m,
struct bch_devs_mask *devs,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr,
unsigned flags)
void bch2_migrate_write_init(struct migrate_write *m,
struct bch_read_bio *rbio)
{
bkey_reassemble(&m->key, k);
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
m->promote = false;
m->move = move_ptr != NULL;
if (move_ptr)
m->move_ptr = *move_ptr;
m->ptr = rbio->pick.ptr;
m->offset = rbio->pos.offset - rbio->pick.crc.offset;
m->op.devs_have = rbio->devs_have;
m->op.pos = rbio->pos;
m->op.version = rbio->version;
m->op.crc = rbio->pick.crc;
if (bkey_extent_is_cached(k.k) ||
(move_ptr && move_ptr->cached))
flags |= BCH_WRITE_CACHED;
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
m->op.csum_type = m->op.crc.csum_type;
}
bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 },
devs, (unsigned long) current,
bkey_start_pos(k.k), NULL,
flags|BCH_WRITE_ONLY_SPECIFIED_DEVS);
if (m->move_dev >= 0)
bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
if (m->move)
if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k));
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
m->op.nr_replicas = 1;
m->op.nr_replicas_required = 1;
m->op.index_update_fn = bch2_migrate_index_update;
}
static void migrate_bio_init(struct moving_io *io, struct bio *bio,
unsigned sectors)
static void move_free(struct closure *cl)
{
bio_init(bio, io->bi_inline_vecs,
DIV_ROUND_UP(sectors, PAGE_SECTORS));
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
bio->bi_iter.bi_size = sectors << 9;
bio->bi_private = &io->cl;
bch2_bio_map(bio, NULL);
}
static void moving_io_free(struct moving_io *io)
{
struct moving_context *ctxt = io->ctxt;
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
struct bio_vec *bv;
int i;
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
if (bv->bv_page)
__free_page(bv->bv_page);
atomic_sub(io->sectors, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
kfree(io);
}
static void moving_error(struct moving_context *ctxt, unsigned flag)
{
atomic_inc(&ctxt->error_count);
//atomic_or(flag, &ctxt->error_flags);
}
static void moving_write_done(struct closure *cl)
static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (io->write.op.error)
moving_error(io->ctxt, MOVING_FLAG_WRITE);
//if (io->replace.failures)
// trace_copy_collision(q, &io->key.k);
moving_io_free(io);
if (likely(!io->rbio.bio.bi_error)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
static void write_moving(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bch_write_op *op = &io->write.op;
closure_call(&op->cl, bch2_write, NULL, &io->cl);
closure_return_with_destructor(&io->cl, moving_write_done);
closure_return_with_destructor(cl, move_free);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@ -231,16 +222,10 @@ static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
return io && io->read_completed ? io : NULL;
}
static void read_moving_endio(struct bio *bio)
static void move_read_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->ctxt;
trace_move_read_done(&io->write.key.k);
if (bio->bi_error)
moving_error(io->ctxt, MOVING_FLAG_READ);
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
io->read_completed = true;
if (next_pending_write(ctxt))
@ -249,58 +234,81 @@ static void read_moving_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
int bch2_data_move(struct bch_fs *c,
static int bch2_move_extent(struct bch_fs *c,
struct moving_context *ctxt,
struct bch_devs_mask *devs,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr)
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
unsigned sectors = k.k->size, pages;
bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick);
bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL);
/* write path might have to decompress data: */
extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc)
sectors = max_t(unsigned, sectors, crc.uncompressed_size);
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
io = kzalloc(sizeof(struct moving_io) +
sizeof(struct bio_vec) * pages, GFP_KERNEL);
if (!io)
return -ENOMEM;
goto err;
io->ctxt = ctxt;
io->write.ctxt = ctxt;
io->sectors = k.k->size;
migrate_bio_init(io, &io->rbio.bio, k.k->size);
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
bch2_bio_map(&io->write.op.wbio.bio, NULL);
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) {
kfree(io);
goto err;
}
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = read_moving_endio;
io->rbio.bio.bi_end_io = move_read_endio;
if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
kfree(io);
return -ENOMEM;
}
migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0);
trace_move_read(&io->write.key.k);
__bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
io->write.op.devs = devs;
io->write.op.write_point = wp;
ctxt->keys_moved++;
ctxt->sectors_moved += k.k->size;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
atomic_add(k.k->size, &ctxt->sectors_in_flight);
trace_move_extent(k.k);
atomic_add(io->sectors, &ctxt->sectors_in_flight);
list_add_tail(&io->list, &ctxt->reads);
/*
* dropped by read_moving_endio() - guards against use after free of
* dropped by move_read_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&io->ctxt->cl);
bch2_read_extent(c, &io->rbio, k, &pick, 0);
closure_get(&ctxt->cl);
bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k),
&pick, BCH_READ_NODECODE);
return 0;
err:
trace_move_alloc_fail(k.k);
return -ENOMEM;
}
static void do_pending_writes(struct moving_context *ctxt)
@ -309,14 +317,7 @@ static void do_pending_writes(struct moving_context *ctxt)
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
if (io->rbio.bio.bi_error) {
moving_io_free(io);
continue;
}
trace_move_write(&io->write.key.k);
closure_call(&io->cl, write_moving, NULL, &ctxt->cl);
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
}
}
@ -330,18 +331,7 @@ do { \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
int bch2_move_ctxt_wait(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->sectors_in_flight) <
ctxt->max_sectors_in_flight);
return ctxt->rate
? bch2_ratelimit_wait_freezable_stoppable(ctxt->rate)
: 0;
}
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
@ -350,7 +340,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
}
void bch2_move_ctxt_exit(struct moving_context *ctxt)
static void bch2_move_ctxt_exit(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
closure_sync(&ctxt->cl);
@ -359,16 +349,92 @@ void bch2_move_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
}
void bch2_move_ctxt_init(struct moving_context *ctxt,
struct bch_ratelimit *rate,
unsigned max_sectors_in_flight)
static void bch2_move_ctxt_init(struct moving_context *ctxt)
{
memset(ctxt, 0, sizeof(*ctxt));
closure_init_stack(&ctxt->cl);
ctxt->rate = rate;
ctxt->max_sectors_in_flight = max_sectors_in_flight;
INIT_LIST_HEAD(&ctxt->reads);
init_waitqueue_head(&ctxt->wait);
}
int bch2_move_data(struct bch_fs *c,
struct bch_ratelimit *rate,
unsigned sectors_in_flight,
struct bch_devs_mask *devs,
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
move_pred_fn pred, void *arg,
u64 *keys_moved,
u64 *sectors_moved)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
if (rate)
bch2_ratelimit_reset(rate);
while (!kthread || !(ret = kthread_should_stop())) {
if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) {
bch2_btree_iter_unlock(&iter);
move_ctxt_wait_event(&ctxt,
atomic_read(&ctxt.sectors_in_flight) <
sectors_in_flight);
}
if (rate &&
bch2_ratelimit_delay(rate) &&
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = btree_iter_err(k);
if (ret)
break;
if (!bkey_extent_is_data(k.k) ||
!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&iter);
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
move_device, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
if (rate)
bch2_ratelimit_increment(rate, k.k->size);
next:
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved);
*keys_moved = ctxt.keys_moved;
*sectors_moved = ctxt.sectors_moved;
return ret;
}

View File

@ -4,77 +4,31 @@
#include "buckets.h"
#include "io_types.h"
enum moving_flag_bitnos {
MOVING_FLAG_BITNO_READ = 0,
MOVING_FLAG_BITNO_WRITE,
};
#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
struct bch_read_bio;
struct moving_context;
struct migrate_write {
BKEY_PADDED(key);
bool promote;
bool move;
struct bch_extent_ptr move_ptr;
struct moving_context *ctxt;
/* what we read: */
struct bch_extent_ptr ptr;
u64 offset;
int move_dev;
int btree_insert_flags;
struct bch_write_op op;
};
void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
struct bch_devs_mask *, struct bkey_s_c,
const struct bch_extent_ptr *, unsigned);
void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
struct moving_context {
/* Closure for waiting on all reads and writes to complete */
struct closure cl;
typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
/* Number and types of errors reported */
atomic_t error_count;
atomic_t error_flags;
/* Key and sector moves issued, updated from submission context */
u64 keys_moved;
u64 sectors_moved;
/* Rate-limiter counting submitted reads */
struct bch_ratelimit *rate;
/* Try to avoid reading the following device */
struct bch_devs_mask avoid;
struct list_head reads;
/* Configuration */
unsigned max_sectors_in_flight;
atomic_t sectors_in_flight;
wait_queue_head_t wait;
};
struct moving_io {
struct list_head list;
struct rb_node node;
struct closure cl;
struct moving_context *ctxt;
struct migrate_write write;
bool read_completed;
struct bch_read_bio rbio;
/* Must be last since it is variable size */
struct bio_vec bi_inline_vecs[0];
};
int bch2_data_move(struct bch_fs *, struct moving_context *,
struct bch_devs_mask *, struct bkey_s_c,
const struct bch_extent_ptr *);
int bch2_move_ctxt_wait(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
void bch2_move_ctxt_exit(struct moving_context *);
void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
unsigned);
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
unsigned, struct bch_devs_mask *,
struct write_point_specifier,
int, int, move_pred_fn, void *,
u64 *, u64 *);
#endif /* _BCACHEFS_MOVE_H */

View File

@ -6,6 +6,7 @@
#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
@ -23,137 +24,63 @@
#include <linux/sort.h>
#include <linux/wait.h>
/* Moving GC - IO loop */
/*
* We can't use the entire copygc reserve in one iteration of copygc: we may
* need the buckets we're freeing up to go back into the copygc reserve to make
* forward progress, but if the copygc reserve is full they'll be available for
* any allocation - and it's possible that in a given iteration, we free up most
* of the buckets we're going to free before we allocate most of the buckets
* we're going to allocate.
*
* If we only use half of the reserve per iteration, then in steady state we'll
* always have room in the reserve for the buckets we're going to need in the
* next iteration:
*/
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
/*
* Max sectors to move per iteration: Have to take into account internal
* fragmentation from the multiple write points for each generation:
*/
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
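To put rough numbers on this (purely illustrative, not taken from this commit): with a bucket size of 1024 sectors (512 KiB) and a RESERVE_MOVINGGC freelist sized at 128 buckets, COPYGC_BUCKETS_PER_ITER comes to 128 / 2 = 64 buckets and COPYGC_SECTORS_PER_ITER to 64 * 1024 = 65536 sectors, i.e. at most about 32 MiB is moved per copygc iteration, leaving the other half of the reserve as headroom for the next iteration.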
static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
const struct bucket_heap_entry *l = _l;
const struct bucket_heap_entry *r = _r;
if (l->bucket < r->bucket)
return -1;
if (l->bucket > r->bucket)
return 1;
return 0;
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}
static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
struct bkey_s_c k)
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
bucket_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr;
const struct copygc_heap_entry *l = _l;
const struct copygc_heap_entry *r = _r;
if (bkey_extent_is_data(k.k) &&
(ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
ca->dev_idx))) {
struct bucket_heap_entry search = {
.bucket = PTR_BUCKET_NR(ca, ptr)
};
return (l->offset > r->offset) - (l->offset < r->offset);
}
size_t i = eytzinger0_find(h->data, h->used,
static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
{
struct bch_dev *ca = arg;
copygc_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr =
bch2_extent_has_device(e, ca->dev_idx);
if (ptr) {
struct copygc_heap_entry search = { .offset = ptr->offset };
size_t i = eytzinger0_find_le(h->data, h->used,
sizeof(h->data[0]),
bucket_idx_cmp, &search);
bucket_offset_cmp, &search);
if (i < h->used)
return ptr;
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
ptr->gen == h->data[i].mark.gen);
}
return NULL;
}
static int issue_moving_gc_move(struct bch_dev *ca,
struct moving_context *ctxt,
struct bkey_s_c k)
{
struct bch_fs *c = ca->fs;
const struct bch_extent_ptr *ptr;
int ret;
ptr = moving_pred(ca, k);
if (!ptr) /* We raced - bucket's been reused */
return 0;
ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
if (!ret)
trace_gc_copy(k.k);
else
trace_moving_gc_alloc_fail(c, k.k->size);
return ret;
}
static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
u64 sectors_to_move)
{
struct bch_fs *c = ca->fs;
bucket_heap *h = &ca->copygc_heap;
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
u64 sectors_not_moved = 0;
size_t buckets_not_moved = 0;
struct bucket_heap_entry *i;
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
while (1) {
if (kthread_should_stop())
goto out;
if (bch2_move_ctxt_wait(&ctxt))
goto out;
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
if (btree_iter_err(k))
goto out;
if (!moving_pred(ca, k))
goto next;
if (issue_moving_gc_move(ca, &ctxt, k)) {
bch2_btree_iter_unlock(&iter);
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
next:
bch2_btree_iter_advance_pos(&iter);
//bch2_btree_iter_cond_resched(&iter);
/* unlock before calling moving_context_wait() */
bch2_btree_iter_unlock(&iter);
cond_resched();
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
buckets_to_move);
/* don't check this if we bailed out early: */
for (i = h->data; i < h->data + h->used; i++) {
struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
}
if (sectors_not_moved)
bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);
return;
out:
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
buckets_to_move);
return false;
}
static bool have_copygc_reserve(struct bch_dev *ca)
@ -168,38 +95,17 @@ static bool have_copygc_reserve(struct bch_dev *ca)
return ret;
}
static inline int sectors_used_cmp(bucket_heap *heap,
struct bucket_heap_entry l,
struct bucket_heap_entry r)
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
{
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}
static void bch2_moving_gc(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
copygc_heap *h = &ca->copygc_heap;
struct copygc_heap_entry e, *i;
struct bucket *g;
u64 sectors_to_move = 0;
size_t buckets_to_move, buckets_unused = 0;
struct bucket_heap_entry e, *i;
int reserve_sectors;
u64 keys_moved, sectors_moved;
u64 sectors_to_move = 0, sectors_not_moved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
int ret;
if (!have_copygc_reserve(ca)) {
struct closure cl;
closure_init_stack(&cl);
while (1) {
closure_wait(&c->freelist_wait, &cl);
if (have_copygc_reserve(ca))
break;
closure_sync(&cl);
}
closure_wake_up(&c->freelist_wait);
}
reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
trace_moving_gc_start(ca);
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
/*
* Find buckets with lowest sector counts, skipping completely
@ -213,48 +119,73 @@ static void bch2_moving_gc(struct bch_dev *ca)
* them:
*/
down_read(&c->gc_lock);
ca->copygc_heap.used = 0;
h->used = 0;
for_each_bucket(g, ca) {
struct bucket_mark m = READ_ONCE(g->mark);
struct bucket_heap_entry e = { g - ca->buckets, m };
if (bucket_unused(m)) {
buckets_unused++;
continue;
}
struct copygc_heap_entry e;
if (m.owned_by_allocator ||
m.data_type != BUCKET_DATA)
m.data_type != BUCKET_DATA ||
!bucket_sectors_used(m) ||
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
if (bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
e = (struct copygc_heap_entry) {
.offset = bucket_to_sector(ca, g - ca->buckets),
.mark = m
};
heap_add_or_replace(h, e, -sectors_used_cmp);
}
up_read(&c->gc_lock);
for (i = ca->copygc_heap.data;
i < ca->copygc_heap.data + ca->copygc_heap.used;
i++)
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += bucket_sectors_used(i->mark);
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
sectors_to_move -= bucket_sectors_used(e.mark);
}
buckets_to_move = ca->copygc_heap.used;
buckets_to_move = h->used;
eytzinger0_sort(ca->copygc_heap.data,
ca->copygc_heap.used,
sizeof(ca->copygc_heap.data[0]),
bucket_idx_cmp, NULL);
if (!buckets_to_move)
return;
read_moving(ca, buckets_to_move, sectors_to_move);
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, NULL);
ret = bch2_move_data(c, &ca->copygc_pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE,
&ca->self,
writepoint_ptr(&ca->copygc_write_point),
BTREE_INSERT_USE_RESERVE,
ca->dev_idx,
copygc_pred, ca,
&keys_moved,
&sectors_moved);
for (i = h->data; i < h->data + h->used; i++) {
size_t bucket = sector_to_bucket(ca, i->offset);
struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark);
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
}
static int bch2_moving_gc_thread(void *arg)
if (sectors_not_moved && !ret)
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);
trace_copygc(ca,
sectors_moved, sectors_not_moved,
buckets_to_move, buckets_not_moved);
}
static int bch2_copygc_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
@ -273,7 +204,7 @@ static int bch2_moving_gc_thread(void *arg)
* don't start copygc until less than half the gc reserve is
* available:
*/
available = dev_buckets_available(ca);
available = dev_buckets_available(c, ca);
want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
c->opts.gc_reserve_percent, 200);
if (available > want) {
@ -283,46 +214,46 @@ static int bch2_moving_gc_thread(void *arg)
continue;
}
bch2_moving_gc(ca);
bch2_copygc(c, ca);
}
return 0;
}
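For a sense of scale (hypothetical numbers, not from this commit): with ca->mi.nbuckets - ca->mi.first_bucket = 100000 and gc_reserve_percent = 8, want = 100000 * 8 / 200 = 4000 buckets, so the copygc thread keeps sleeping while more than 4000 buckets are still available and only starts moving data once fewer than half of the 8% GC reserve (8000 buckets) remains free.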
void bch2_moving_gc_stop(struct bch_dev *ca)
void bch2_copygc_stop(struct bch_dev *ca)
{
ca->moving_gc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
ca->copygc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&ca->copygc_pd.rate);
if (ca->moving_gc_read)
kthread_stop(ca->moving_gc_read);
ca->moving_gc_read = NULL;
if (ca->copygc_thread)
kthread_stop(ca->copygc_thread);
ca->copygc_thread = NULL;
}
int bch2_moving_gc_start(struct bch_dev *ca)
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
{
struct task_struct *t;
BUG_ON(ca->moving_gc_read);
BUG_ON(ca->copygc_thread);
if (ca->fs->opts.nochanges)
if (c->opts.nochanges)
return 0;
if (bch2_fs_init_fault("moving_gc_start"))
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read");
t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
if (IS_ERR(t))
return PTR_ERR(t);
ca->moving_gc_read = t;
wake_up_process(ca->moving_gc_read);
ca->copygc_thread = t;
wake_up_process(ca->copygc_thread);
return 0;
}
void bch2_dev_moving_gc_init(struct bch_dev *ca)
void bch2_dev_copygc_init(struct bch_dev *ca)
{
bch2_pd_controller_init(&ca->moving_gc_pd);
ca->moving_gc_pd.d_term = 0;
bch2_pd_controller_init(&ca->copygc_pd);
ca->copygc_pd.d_term = 0;
}

View File

@ -1,30 +1,8 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
/*
* We can't use the entire copygc reserve in one iteration of copygc: we may
* need the buckets we're freeing up to go back into the copygc reserve to make
* forward progress, but if the copygc reserve is full they'll be available for
* any allocation - and it's possible that in a given iteration, we free up most
* of the buckets we're going to free before we allocate most of the buckets
* we're going to allocate.
*
* If we only use half of the reserve per iteration, then in steady state we'll
* always have room in the reserve for the buckets we're going to need in the
* next iteration:
*/
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
/*
* Max sectors to move per iteration: Have to take into account internal
* fragmentation from the multiple write points for each generation:
*/
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
void bch2_moving_gc_stop(struct bch_dev *);
int bch2_moving_gc_start(struct bch_dev *);
void bch2_dev_moving_gc_init(struct bch_dev *);
void bch2_copygc_stop(struct bch_dev *);
int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
void bch2_dev_copygc_init(struct bch_dev *);
#endif /* _BCACHEFS_MOVINGGC_H */

View File

@ -425,6 +425,11 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (err)
return err;
if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
bch2_sb_get_crypt(sb) &&
BCH_SB_INITIALIZED(sb))
return "Incompatible extent nonces";
sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
return NULL;

View File

@ -20,6 +20,7 @@
#include "debug.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
@ -209,7 +210,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_tiering_stop(c);
for_each_member_device(ca, c, i)
bch2_moving_gc_stop(ca);
bch2_copygc_stop(ca);
bch2_gc_thread_stop(c);
@ -258,12 +259,8 @@ void bch2_fs_read_only(struct bch_fs *c)
*/
percpu_ref_kill(&c->writes);
del_timer(&c->foreground_write_wakeup);
cancel_delayed_work(&c->pd_controllers_update);
c->foreground_write_pd.rate.rate = UINT_MAX;
bch2_wake_delayed_writes((unsigned long) c);
/*
* If we're not doing an emergency shutdown, we want to wait on
* outstanding writes to complete so they don't see spurious errors due
@ -348,9 +345,9 @@ const char *bch2_fs_read_write(struct bch_fs *c)
if (bch2_gc_thread_start(c))
goto err;
err = "error starting moving GC thread";
err = "error starting copygc thread";
for_each_rw_member(ca, c, i)
if (bch2_moving_gc_start(ca)) {
if (bch2_copygc_start(c, ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
@ -375,6 +372,7 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
@ -411,7 +409,6 @@ static void bch2_fs_exit(struct bch_fs *c)
{
unsigned i;
del_timer_sync(&c->foreground_write_wakeup);
cancel_delayed_work_sync(&c->pd_controllers_update);
cancel_work_sync(&c->read_only_work);
@ -535,8 +532,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->tiering_enabled = 1;
c->tiering_percent = 10;
c->foreground_target_percent = 20;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
c->journal.blocked_time = &c->journal_blocked_time;
@ -600,7 +595,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_check_set_has_compressed_data(c, c->opts.compression))
bch2_check_set_has_compressed_data(c, c->opts.compression) ||
bch2_fs_fsio_init(c))
goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
@ -1105,8 +1101,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
spin_lock_init(&ca->freelist_lock);
bch2_dev_moving_gc_init(ca);
bch2_dev_copygc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@ -1224,10 +1222,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
lg_local_lock(&c->usage_lock);
if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
bch2_mark_dev_metadata(c, ca);
lg_local_unlock(&c->usage_lock);
bch2_mark_dev_superblock(c, ca, 0);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@ -1324,7 +1319,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
bch2_moving_gc_stop(ca);
bch2_copygc_stop(ca);
/*
* This stops new data writes (e.g. to existing open data
@ -1347,8 +1342,8 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
if (bch2_dev_allocator_start(ca))
return "error starting allocator thread";
if (bch2_moving_gc_start(ca))
return "error starting moving GC thread";
if (bch2_copygc_start(c, ca))
return "error starting copygc thread";
if (bch2_tiering_start(c))
return "error starting tiering thread";

View File

@ -35,6 +35,30 @@ static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
}
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
unsigned dev)
{
unsigned i;
for (i = 0; i < devs.nr; i++)
if (devs.devs[i] == dev)
return true;
return false;
}
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
unsigned dev)
{
unsigned i;
for (i = 0; i < devs->nr; i++)
if (devs->devs[i] == dev) {
array_remove_item(devs->devs, devs->nr, i);
return;
}
}
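(Purely as an illustration of the calling convention of the two helpers above, and not part of the patch; they operate on the small fixed-size bch_devs_list added in super_types.h further down, a u8 count plus an inline array of device indices. The device numbers here are made up.)

	struct bch_devs_list devs = { .nr = 2, .devs = { 3, 7 } };

	if (bch2_dev_list_has_dev(devs, 3))
		bch2_dev_list_drop_dev(&devs, 3);	/* devs now holds only device 7 */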
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{

View File

@ -13,4 +13,33 @@ struct bch_devs_mask {
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
};
struct bch_devs_list {
u8 nr;
u8 devs[BCH_REPLICAS_MAX];
};
struct bch_member_cpu {
u64 nbuckets; /* device size */
u16 first_bucket; /* index of first bucket used */
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
u8 replacement;
u8 discard;
u8 data_allowed;
u8 valid;
};
struct bch_replicas_cpu_entry {
u8 data_type;
u8 devs[BCH_SB_MEMBERS_MAX / 8];
};
struct bch_replicas_cpu {
struct rcu_head rcu;
unsigned nr;
unsigned entry_size;
struct bch_replicas_cpu_entry entries[];
};
#endif /* _BCACHEFS_SUPER_TYPES_H */
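(bch_replicas_cpu_entry packs one bit per member device into devs[], eight devices per byte, and bch_replicas_cpu's entry_size lets the flexible array be sized for however many devices the filesystem actually has. A hedged sketch of the addressing scheme follows; the helper names are hypothetical and not part of the patch.)

	static inline bool replicas_entry_test_dev(struct bch_replicas_cpu_entry *e,
						   unsigned dev)
	{
		return e->devs[dev >> 3] & (1U << (dev & 7));
	}

	static inline void replicas_entry_set_dev(struct bch_replicas_cpu_entry *e,
						  unsigned dev)
	{
		e->devs[dev >> 3] |= 1U << (dev & 7);
	}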

View File

@ -161,8 +161,11 @@ read_attribute(meta_buckets);
read_attribute(alloc_buckets);
read_attribute(has_data);
read_attribute(alloc_debug);
write_attribute(wake_allocator);
read_attribute(read_realloc_races);
read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
@ -170,7 +173,6 @@ rw_attribute(journal_reclaim_delay_ms);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
rw_attribute(foreground_write_ratelimit_enabled);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
@ -179,12 +181,9 @@ rw_attribute(tiering_enabled);
rw_attribute(tiering_percent);
sysfs_pd_controller_attribute(tiering);
sysfs_pd_controller_attribute(foreground_write);
rw_attribute(pd_controllers_update_seconds);
rw_attribute(foreground_target_percent);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
@ -272,18 +271,18 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
if (k.k->type == BCH_EXTENT) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
if (crc.compression_type == BCH_COMPRESSION_NONE) {
nr_uncompressed_extents++;
uncompressed_sectors += e.k->size;
} else {
nr_compressed_extents++;
compressed_sectors_compressed +=
crc_compressed_size(e.k, crc);
crc.compressed_size;
compressed_sectors_uncompressed +=
crc_uncompressed_size(e.k, crc);
crc.uncompressed_size;
}
/* only looking at the first ptr */
@ -323,17 +322,17 @@ SHOW(bch2_fs)
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
sysfs_print(extent_migrate_done,
atomic_long_read(&c->extent_migrate_done));
sysfs_print(extent_migrate_raced,
atomic_long_read(&c->extent_migrate_raced));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
sysfs_printf(foreground_write_ratelimit_enabled, "%i",
c->foreground_write_ratelimit_enabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_print(foreground_target_percent, c->foreground_target_percent);
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
sysfs_print(tiering_percent, c->tiering_percent);
@ -371,9 +370,6 @@ STORE(__bch2_fs)
sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
sysfs_strtoul(foreground_write_ratelimit_enabled,
c->foreground_write_ratelimit_enabled);
if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
@ -389,8 +385,8 @@ STORE(__bch2_fs)
?: (ssize_t) size;
for_each_member_device(ca, c, i)
if (ca->moving_gc_read)
wake_up_process(ca->moving_gc_read);
if (ca->copygc_thread)
wake_up_process(ca->copygc_thread);
return ret;
}
@ -402,11 +398,8 @@ STORE(__bch2_fs)
return ret;
}
sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
sysfs_strtoul(tiering_percent, c->tiering_percent);
sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
@ -466,7 +459,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_foreground_target_percent,
&sysfs_tiering_percent,
&sysfs_compression_stats,
@ -494,17 +486,17 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_pins,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,
&sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_prune_cache,
&sysfs_foreground_write_ratelimit_enabled,
&sysfs_copy_gc_enabled,
&sysfs_tiering_enabled,
sysfs_pd_controller_files(tiering),
sysfs_pd_controller_files(foreground_write),
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -710,17 +702,23 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
"alloc: %llu/%llu\n"
"meta: %llu/%llu\n"
"dirty: %llu/%llu\n"
"available: %llu/%llu\n"
"buckets:\n"
" capacity: %llu\n"
" alloc: %llu\n"
" meta: %llu\n"
" dirty: %llu\n"
" available: %llu\n"
"sectors:\n"
" meta: %llu\n"
" dirty: %llu\n"
" cached: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
@ -728,10 +726,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket,
__dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_alloc,
stats.buckets[S_META],
stats.buckets[S_DIRTY],
__dev_buckets_available(ca, stats),
stats.sectors[S_META],
stats.sectors[S_DIRTY],
stats.sectors_cached,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty");
@ -769,7 +771,7 @@ SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
char *out = buf, *end = buf + PAGE_SIZE;
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@ -788,8 +790,8 @@ SHOW(bch2_dev)
sysfs_print(cached_buckets, stats.buckets_cached);
sysfs_print(meta_buckets, stats.buckets[S_META]);
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));
sysfs_print(available_buckets, __dev_buckets_available(ca, stats));
sysfs_print(free_buckets, __dev_buckets_free(ca, stats));
if (attr == &sysfs_has_data) {
out += bch2_scnprint_flag_list(out, end - out,
@ -799,7 +801,7 @@ SHOW(bch2_dev)
return out - buf;
}
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_cache_replacement_policy) {
out += bch2_scnprint_string_list(out, end - out,
@ -843,7 +845,7 @@ STORE(bch2_dev)
struct bch_fs *c = ca->fs;
struct bch_member *mi;
sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@ -899,6 +901,9 @@ STORE(bch2_dev)
bch2_tiering_start(c);
}
if (attr == &sysfs_wake_allocator)
bch2_wake_allocator(ca);
return size;
}
SYSFS_OPS(bch2_dev);
@ -942,6 +947,7 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
&sysfs_wake_allocator,
sysfs_pd_controller_files(copy_gc),
NULL

View File

@ -15,20 +15,10 @@
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
struct tiering_state {
struct bch_tier *tier;
unsigned sectors;
unsigned stripe_size;
unsigned dev_idx;
struct bch_dev *ca;
};
static bool tiering_pred(struct bch_fs *c,
struct bch_tier *tier,
struct bkey_s_c k)
static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
{
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
struct bch_tier *tier = arg;
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
const struct bch_extent_ptr *ptr;
unsigned replicas = 0;
@ -44,93 +34,21 @@ static bool tiering_pred(struct bch_fs *c,
return replicas < c->opts.data_replicas;
}
return false;
}
static int issue_tiering_move(struct bch_fs *c,
struct bch_tier *tier,
struct moving_context *ctxt,
struct bkey_s_c k)
{
int ret;
ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL);
if (!ret)
trace_tiering_copy(k.k);
else
trace_tiering_alloc_fail(c, k.k->size);
return ret;
}
/**
* tiering_next_cache - issue a move to write an extent to the next cache
* device in round robin order
*/
static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
{
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
unsigned nr_devices = dev_mask_nr(&tier->devs);
int ret;
if (!nr_devices)
return 0;
trace_tiering_start(c);
bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);
while (!kthread_should_stop() &&
!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!btree_iter_err(k)) {
if (!tiering_pred(c, tier, k))
goto next;
ret = issue_tiering_move(c, tier, &ctxt, k);
if (ret) {
bch2_btree_iter_unlock(&iter);
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
next:
bch2_btree_iter_advance_pos(&iter);
//bch2_btree_iter_cond_resched(&iter);
/* unlock before calling moving_context_wait() */
bch2_btree_iter_unlock(&iter);
cond_resched();
}
bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
return ctxt.sectors_moved;
}
static int bch2_tiering_thread(void *arg)
{
struct bch_tier *tier = arg;
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev *ca;
u64 tier_capacity, available_sectors;
u64 tier_capacity, available_sectors, keys_moved, sectors_moved;
unsigned long last;
unsigned i;
unsigned i, nr_devices;
set_freezable();
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
dev_mask_nr(&tier->devs)))
(nr_devices = dev_mask_nr(&tier->devs))))
break;
while (1) {
@ -151,7 +69,7 @@ static int bch2_tiering_thread(void *arg)
ca->mi.first_bucket);
available_sectors +=
bucket_to_sector(ca,
dev_buckets_available(ca));
dev_buckets_available(c, ca));
}
rcu_read_unlock();
}
@ -167,7 +85,15 @@ static int bch2_tiering_thread(void *arg)
return 0;
}
read_tiering(c, tier);
bch2_move_data(c, &tier->pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
&tier->devs,
writepoint_ptr(&tier->wp),
0,
-1,
tiering_pred, tier,
&keys_moved,
&sectors_moved);
}
return 0;
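(The rework above drops tier.c's open-coded btree walk, read_tiering() and issue_tiering_move(), in favour of the generic bch2_move_data() call; judging from that call, the generic path now owns the iteration, rate limiting and in-flight accounting, and tiering only supplies a write point plus a predicate. The minimal shape of such a predicate, matching the tiering_pred() signature shown above, is sketched below for illustration only.)

	/* illustrative only: a predicate that accepts every extent */
	static bool move_everything_pred(void *arg, struct bkey_s_c_extent e)
	{
		return true;
	}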

View File

@ -291,13 +291,15 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
while (1) {
u64 delay = bch2_ratelimit_delay(d);
if (delay)
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
if (kthread && kthread_should_stop())
return 1;
if (!delay)
@ -434,8 +436,11 @@ size_t bch2_rand_range(size_t max)
{
size_t rand;
if (!max)
return 0;
do {
get_random_bytes(&rand, sizeof(rand));
rand = get_random_long();
rand &= roundup_pow_of_two(max) - 1;
} while (rand >= max);
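(bch2_rand_range() now guards against max == 0 and draws from get_random_long() with rejection sampling: the value is masked down to the next power of two and re-drawn until it falls below max, which avoids the modulo bias of taking the raw value modulo max. A userspace sketch of the same idea, illustrative only, with random() standing in for get_random_long():)

	#include <stdlib.h>
	#include <stddef.h>

	static size_t rand_range_sketch(size_t max)
	{
		size_t mask, r;

		if (!max)
			return 0;

		/* smallest (power of two - 1) mask covering max - 1 */
		for (mask = 1; mask < max; mask <<= 1)
			;
		mask -= 1;

		do {
			r = (size_t) random() & mask;	/* only 31 bits; fine for a sketch */
		} while (r >= max);

		return r;
	}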
@ -642,3 +647,129 @@ void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
return vpmalloc(size, gfp_mask);
}
#if 0
void eytzinger1_test(void)
{
unsigned inorder, eytz, size;
pr_info("1 based eytzinger test:");
for (size = 2;
size < 65536;
size++) {
unsigned extra = eytzinger1_extra(size);
if (!(size % 4096))
pr_info("tree size %u", size);
BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
inorder = 1;
eytzinger1_for_each(eytz, size) {
BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
BUG_ON(eytz != eytzinger1_last(size) &&
eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
inorder++;
}
}
}
void eytzinger0_test(void)
{
unsigned inorder, eytz, size;
pr_info("0 based eytzinger test:");
for (size = 1;
size < 65536;
size++) {
unsigned extra = eytzinger0_extra(size);
if (!(size % 4096))
pr_info("tree size %u", size);
BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
inorder = 0;
eytzinger0_for_each(eytz, size) {
BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
BUG_ON(eytz != eytzinger0_last(size) &&
eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
inorder++;
}
}
}
static inline int cmp_u16(const void *_l, const void *_r, size_t size)
{
const u16 *l = _l, *r = _r;
return (*l > *r) - (*l < *r);
}
static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
{
int i, c1 = -1, c2 = -1;
ssize_t r;
r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r >= 0)
c1 = test_array[r];
for (i = 0; i < nr; i++)
if (test_array[i] <= search && test_array[i] > c2)
c2 = test_array[i];
if (c1 != c2) {
eytzinger0_for_each(i, nr)
pr_info("[%3u] = %12u", i, test_array[i]);
pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
search, r, c1, c2);
}
}
void eytzinger0_find_test(void)
{
unsigned i, nr, allocated = 1 << 12;
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
for (nr = 1; nr < allocated; nr++) {
pr_info("testing %u elems", nr);
get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
/* verify array is sorted correctly: */
eytzinger0_for_each(i, nr)
BUG_ON(i != eytzinger0_last(nr) &&
test_array[i] > test_array[eytzinger0_next(i, nr)]);
for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);
for (i = 0; i < nr; i++) {
eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
eytzinger0_find_test_val(test_array, nr, test_array[i]);
eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
}
}
kfree(test_array);
}
#endif
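(The tests above are compiled out with #if 0; they exercise the eytzinger layout, where a sorted array is stored in BFS order so a binary search walks i to 2i or 2i + 1 and scans memory front to back rather than jumping around. The index arithmetic being verified, written out as a hedged sketch with illustrative helper names:)

	/* 1-based eytzinger layout: node i's children and parent */
	static inline unsigned eytzinger1_child_sketch(unsigned i, unsigned child)
	{
		return (i << 1) + child;	/* child is 0 (left) or 1 (right) */
	}

	static inline unsigned eytzinger1_parent_sketch(unsigned i)
	{
		return i >> 1;
	}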

View File

@ -789,4 +789,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos)))
#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
__array_insert_item(_array, _nr, _pos); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)
#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
#endif /* _BCACHEFS_UTIL_H */
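(These are plain memmove-based helpers for inserting into and removing from a fixed-capacity array tracked by a separate element count; bch2_dev_list_drop_dev() above is one user. An illustrative use, assuming the macros are in scope via util.h and with made-up values:)

	static void array_item_example(void)
	{
		u8 devs[8] = { 1, 3, 7 };
		unsigned nr = 3;

		array_insert_item(devs, nr, 1, 2);	/* { 1, 2, 3, 7 }, nr == 4 */
		array_remove_item(devs, nr, 2);		/* { 1, 2, 7 },    nr == 3 */
	}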