mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-02-22 00:00:03 +03:00)

Update bcachefs sources to e57b5958cf bcachefs: fix for building in userspace

parent f2feceddae
commit ea83a3985d
@@ -1 +1 @@
192d759a491f50d92c89c2e842639d2307c815a5
e57b5958cf4e8530d26f7c36a6e1427fb284cc70

@@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c,
	if (ret)
		die("error reserving space in new filesystem: %s", strerror(-ret));

	bch2_write_op_init(&op, c, res, NULL, 0,
	bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
			   POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
	closure_call(&op.cl, bch2_write, NULL, &cl);
	closure_sync(&cl);

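The writepoint_hashed() specifier replacing the bare 0 above is introduced in the alloc.h hunk further down. For quick reference, a minimal sketch of the relevant helper, copied from that hunk; the caller shown in the comment is hypothetical, and the note about avoiding collisions with pointer specifiers is an inference from the bit trick, not stated in the commit:

/* From the alloc.h hunk below: write points are now named by an opaque
 * specifier rather than a raw index. */
struct write_point_specifier {
	unsigned long v;
};

/* Hashed specifiers set the low bit, presumably so they can never collide
 * with writepoint_ptr(), which stores a write_point address directly. */
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
	return (struct write_point_specifier) { .v = v | 1 };
}

/* Hypothetical caller mirroring the change above:
 * bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0), ...);
 */
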
@@ -98,23 +98,6 @@ DECLARE_EVENT_CLASS(bio,
		  (unsigned long long)__entry->sector, __entry->nr_sector)
);

DECLARE_EVENT_CLASS(page_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size),

	TP_STRUCT__entry(
		__array(char, uuid, 16 )
		__field(u64, size )
	),

	TP_fast_assign(
		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
		__entry->size = size;
	),

	TP_printk("%pU size %llu", __entry->uuid, __entry->size)
);

/* io.c: */

DEFINE_EVENT(bio, read_split,
@@ -137,34 +120,6 @@ DEFINE_EVENT(bio, promote,
	TP_ARGS(bio)
);

TRACE_EVENT(write_throttle,
	TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay),
	TP_ARGS(c, inode, bio, delay),

	TP_STRUCT__entry(
		__array(char, uuid, 16 )
		__field(u64, inode )
		__field(sector_t, sector )
		__field(unsigned int, nr_sector )
		__array(char, rwbs, 6 )
		__field(u64, delay )
	),

	TP_fast_assign(
		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
		__entry->inode = inode;
		__entry->sector = bio->bi_iter.bi_sector;
		__entry->nr_sector = bio->bi_iter.bi_size >> 9;
		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
		__entry->delay = delay;
	),

	TP_printk("%pU inode %llu %s %llu + %u delay %llu",
		  __entry->uuid, __entry->inode,
		  __entry->rwbs, (unsigned long long)__entry->sector,
		  __entry->nr_sector, __entry->delay)
);

/* Journal */

DEFINE_EVENT(bch_fs, journal_full,
@@ -439,16 +394,6 @@ TRACE_EVENT(alloc_batch,
		  __entry->uuid, __entry->free, __entry->total)
);

DEFINE_EVENT(bch_dev, prio_write_start,
	TP_PROTO(struct bch_dev *ca),
	TP_ARGS(ca)
);

DEFINE_EVENT(bch_dev, prio_write_end,
	TP_PROTO(struct bch_dev *ca),
	TP_ARGS(ca)
);

TRACE_EVENT(invalidate,
	TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
	TP_ARGS(ca, offset, sectors),
@@ -502,153 +447,31 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
	TP_ARGS(ca, reserve)
);

TRACE_EVENT(freelist_empty_fail,
	TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve,
		 struct closure *cl),
	TP_ARGS(c, reserve, cl),

	TP_STRUCT__entry(
		__array(char, uuid, 16 )
		__field(enum alloc_reserve, reserve )
		__field(struct closure *, cl )
	),

	TP_fast_assign(
		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
		__entry->reserve = reserve;
		__entry->cl = cl;
	),

	TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve,
		  __entry->cl)
);

DECLARE_EVENT_CLASS(open_bucket_alloc,
	TP_PROTO(struct bch_fs *c, struct closure *cl),
	TP_ARGS(c, cl),

	TP_STRUCT__entry(
		__array(char, uuid, 16 )
		__field(struct closure *, cl )
	),

	TP_fast_assign(
		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
		__entry->cl = cl;
	),

	TP_printk("%pU cl %p",
		  __entry->uuid, __entry->cl)
);

DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc,
	TP_PROTO(struct bch_fs *c, struct closure *cl),
	TP_ARGS(c, cl)
);

DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail,
	TP_PROTO(struct bch_fs *c, struct closure *cl),
	TP_ARGS(c, cl)
DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
	TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
	TP_ARGS(ca, reserve)
);

/* Moving IO */

DECLARE_EVENT_CLASS(moving_io,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k),

	TP_STRUCT__entry(
		__field(__u32, inode )
		__field(__u64, offset )
		__field(__u32, sectors )
	),

	TP_fast_assign(
		__entry->inode = k->p.inode;
		__entry->offset = k->p.offset;
		__entry->sectors = k->size;
	),

	TP_printk("%u:%llu sectors %u",
		  __entry->inode, __entry->offset, __entry->sectors)
);

DEFINE_EVENT(moving_io, move_read,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

DEFINE_EVENT(moving_io, move_read_done,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

DEFINE_EVENT(moving_io, move_write,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

DEFINE_EVENT(moving_io, copy_collision,
	TP_PROTO(struct bkey *k),
	TP_ARGS(k)
);

/* Copy GC */

DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size)
);

DEFINE_EVENT(bch_dev, moving_gc_start,
	TP_PROTO(struct bch_dev *ca),
	TP_ARGS(ca)
);

TRACE_EVENT(moving_gc_end,
	TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved,
		 u64 buckets_moved),
	TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),

	TP_STRUCT__entry(
		__array(char, uuid, 16 )
		__field(u64, sectors_moved )
		__field(u64, keys_moved )
		__field(u64, buckets_moved )
	),

	TP_fast_assign(
		memcpy(__entry->uuid, ca->uuid.b, 16);
		__entry->sectors_moved = sectors_moved;
		__entry->keys_moved = keys_moved;
		__entry->buckets_moved = buckets_moved;
	),

	TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu",
		  __entry->uuid, __entry->sectors_moved, __entry->keys_moved,
		  __entry->buckets_moved)
);

DEFINE_EVENT(bkey, gc_copy,
DEFINE_EVENT(bkey, move_extent,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

/* Tiering */

DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail,
	TP_PROTO(struct bch_fs *c, u64 size),
	TP_ARGS(c, size)
DEFINE_EVENT(bkey, move_alloc_fail,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

DEFINE_EVENT(bch_fs, tiering_start,
	TP_PROTO(struct bch_fs *c),
	TP_ARGS(c)
DEFINE_EVENT(bkey, move_race,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
);

TRACE_EVENT(tiering_end,
TRACE_EVENT(move_data,
	TP_PROTO(struct bch_fs *c, u64 sectors_moved,
		 u64 keys_moved),
		 u64 keys_moved),
	TP_ARGS(c, sectors_moved, keys_moved),

	TP_STRUCT__entry(
@@ -667,9 +490,34 @@ TRACE_EVENT(tiering_end,
		  __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
);

DEFINE_EVENT(bkey, tiering_copy,
	TP_PROTO(const struct bkey *k),
	TP_ARGS(k)
TRACE_EVENT(copygc,
	TP_PROTO(struct bch_dev *ca,
		 u64 sectors_moved, u64 sectors_not_moved,
		 u64 buckets_moved, u64 buckets_not_moved),
	TP_ARGS(ca,
		sectors_moved, sectors_not_moved,
		buckets_moved, buckets_not_moved),

	TP_STRUCT__entry(
		__array(char, uuid, 16 )
		__field(u64, sectors_moved )
		__field(u64, sectors_not_moved )
		__field(u64, buckets_moved )
		__field(u64, buckets_not_moved )
	),

	TP_fast_assign(
		memcpy(__entry->uuid, ca->uuid.b, 16);
		__entry->sectors_moved = sectors_moved;
		__entry->sectors_not_moved = sectors_not_moved;
		__entry->buckets_moved = buckets_moved;
		__entry->buckets_not_moved = buckets_moved;
	),

	TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
		  __entry->uuid,
		  __entry->sectors_moved, __entry->sectors_not_moved,
		  __entry->buckets_moved, __entry->buckets_not_moved)
);

#endif /* _TRACE_BCACHE_H */

File diff suppressed because it is too large
@@ -8,7 +8,7 @@ struct bkey;
struct bucket;
struct bch_dev;
struct bch_fs;
struct dev_group;
struct bch_devs_list;

struct dev_alloc_list {
	unsigned nr;
@@ -24,33 +24,61 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);

long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
enum bucket_alloc_ret {
	ALLOC_SUCCESS = 0,
	OPEN_BUCKETS_EMPTY = -1,
	FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
	NO_DEVICES = -3, /* -EROFS */
};

void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
		      struct closure *);

void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);

static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	if (atomic_dec_and_test(&ob->pin))
		__bch2_open_bucket_put(c, ob);
}

static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
	unsigned i;

	for (i = 0; i < *nr; i++)
		bch2_open_bucket_put(c, c->open_buckets + refs[i]);

	*nr = 0;
}

static inline void bch2_open_bucket_get(struct bch_fs *c,
					struct write_point *wp,
					u8 *nr, u8 *refs)
{
	unsigned i;

	for (i = 0; i < wp->nr_ptrs_can_use; i++) {
		struct open_bucket *ob = wp->ptrs[i];

		atomic_inc(&ob->pin);
		refs[(*nr)++] = ob - c->open_buckets;
	}
}

struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
			enum bch_data_type,
			struct bch_devs_mask *,
			unsigned long,
			struct write_point_specifier,
			struct bch_devs_list *,
			unsigned, unsigned,
			enum alloc_reserve,
			unsigned,
			struct closure *);

void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
				    unsigned, struct open_bucket *, unsigned);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
				    struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);

struct open_bucket *bch2_alloc_sectors(struct bch_fs *,
			enum bch_data_type,
			struct bch_devs_mask *,
			unsigned long,
			struct bkey_i_extent *,
			unsigned, unsigned,
			enum alloc_reserve,
			unsigned,
			struct closure *);

static inline void bch2_wake_allocator(struct bch_dev *ca)
{
	struct task_struct *p;
@@ -61,10 +89,20 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
	rcu_read_unlock();
}

#define open_bucket_for_each_ptr(_ob, _ptr) \
	for ((_ptr) = (_ob)->ptrs; \
	     (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
	     (_ptr)++)
#define writepoint_for_each_ptr(_wp, _ob, _i) \
	for ((_i) = 0; \
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
	     (_i)++)

static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
	return (struct write_point_specifier) { .v = v | 1 };
}

static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
	return (struct write_point_specifier) { .v = (unsigned long) wp };
}

void bch2_recalc_capacity(struct bch_fs *);

@@ -74,6 +112,13 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->type = type;
}

void bch2_fs_allocator_init(struct bch_fs *);

extern const struct bkey_ops bch2_bkey_alloc_ops;

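The put/get helpers above replace the old single open_bucket reference on the write path. As a brief illustration of how a caller pins and later releases the buckets a write point hands out, here is a sketch using only the helpers defined in this hunk; the btree_ob_ref container mirrors a definition that appears later in this diff, and the caller itself is hypothetical:

struct btree_ob_ref {	/* mirrors the definition later in this diff */
	u8 nr;
	u8 refs[BCH_REPLICAS_MAX];
};

static void example_pin_and_release(struct bch_fs *c, struct write_point *wp)
{
	struct btree_ob_ref ob = { .nr = 0 };

	/* take a pin on each open bucket this write point will write to,
	 * recording their indices into c->open_buckets: */
	bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);

	/* ... issue the write ... */

	/* drop the pins; each bucket is freed once its pin count drops to zero: */
	bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
}
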
@@ -47,19 +47,14 @@ enum alloc_reserve {
#define OPEN_BUCKETS_COUNT 256
#define WRITE_POINT_COUNT 32

struct open_bucket_ptr {
	struct bch_extent_ptr ptr;
	unsigned sectors_free;
};

struct open_bucket {
	spinlock_t lock;
	atomic_t pin;
	u8 freelist;
	u8 new_ob;
	u8 nr_ptrs;

	struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2];
	bool valid;
	bool on_partial_list;
	unsigned sectors_free;
	struct bch_extent_ptr ptr;
};

struct write_point {
@@ -69,13 +64,23 @@ struct write_point {
	unsigned long write_point;
	enum bch_data_type type;

	u8 nr_ptrs;
	/*
	 * number of pointers in @ob we can't use, because we already had
	 * pointers to those devices:
	 */
	u8 nr_ptrs_can_use;
	/* calculated based on how many pointers we're actually going to use: */
	unsigned sectors_free;

	struct open_bucket *ob;
	struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
	u64 next_alloc[BCH_SB_MEMBERS_MAX];
};

struct write_point_specifier {
	unsigned long v;
};

struct alloc_heap_entry {
	size_t bucket;
	unsigned long key;

@ -251,9 +251,6 @@ do { \
|
||||
BCH_DEBUG_PARAM(debug_check_bkeys, \
|
||||
"Run bkey_debugcheck (primarily checking GC/allocation "\
|
||||
"information) when iterating over keys") \
|
||||
BCH_DEBUG_PARAM(version_stress_test, \
|
||||
"Assigns random version numbers to newly written " \
|
||||
"extents, to test overlapping extent cases") \
|
||||
BCH_DEBUG_PARAM(verify_btree_ondisk, \
|
||||
"Reread btree nodes at various points to verify the " \
|
||||
"mergesort in the read path against modifications " \
|
||||
@ -310,8 +307,9 @@ struct crypto_blkcipher;
|
||||
struct crypto_ahash;
|
||||
|
||||
enum gc_phase {
|
||||
GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
|
||||
GC_PHASE_SB = BTREE_ID_NR + 1,
|
||||
GC_PHASE_PENDING_DELETE,
|
||||
GC_PHASE_ALLOC,
|
||||
GC_PHASE_DONE
|
||||
};
|
||||
|
||||
@ -321,30 +319,6 @@ struct gc_pos {
|
||||
unsigned level;
|
||||
};
|
||||
|
||||
struct bch_member_cpu {
|
||||
u64 nbuckets; /* device size */
|
||||
u16 first_bucket; /* index of first bucket used */
|
||||
u16 bucket_size; /* sectors */
|
||||
u8 state;
|
||||
u8 tier;
|
||||
u8 replacement;
|
||||
u8 discard;
|
||||
u8 data_allowed;
|
||||
u8 valid;
|
||||
};
|
||||
|
||||
struct bch_replicas_cpu_entry {
|
||||
u8 data_type;
|
||||
u8 devs[BCH_SB_MEMBERS_MAX / 8];
|
||||
};
|
||||
|
||||
struct bch_replicas_cpu {
|
||||
struct rcu_head rcu;
|
||||
unsigned nr;
|
||||
unsigned entry_size;
|
||||
struct bch_replicas_cpu_entry entries[];
|
||||
};
|
||||
|
||||
struct io_count {
|
||||
u64 sectors[2][BCH_DATA_NR];
|
||||
};
|
||||
@ -372,7 +346,7 @@ struct bch_dev {
|
||||
|
||||
struct bch_devs_mask self;
|
||||
|
||||
/* biosets used in cloned bios for replicas and moving_gc */
|
||||
/* biosets used in cloned bios for writing multiple replicas */
|
||||
struct bio_set replica_set;
|
||||
|
||||
struct task_struct *alloc_thread;
|
||||
@ -392,7 +366,7 @@ struct bch_dev {
|
||||
unsigned nr_invalidated;
|
||||
bool alloc_thread_started;
|
||||
|
||||
struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT];
|
||||
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
|
||||
unsigned open_buckets_partial_nr;
|
||||
|
||||
size_t fifo_last_bucket;
|
||||
@ -422,18 +396,20 @@ struct bch_dev {
|
||||
bool allocator_invalidating_data;
|
||||
|
||||
alloc_heap alloc_heap;
|
||||
bucket_heap copygc_heap;
|
||||
|
||||
/* Moving GC: */
|
||||
struct task_struct *moving_gc_read;
|
||||
|
||||
struct bch_pd_controller moving_gc_pd;
|
||||
/* Copying GC: */
|
||||
struct task_struct *copygc_thread;
|
||||
copygc_heap copygc_heap;
|
||||
struct bch_pd_controller copygc_pd;
|
||||
struct write_point copygc_write_point;
|
||||
|
||||
struct journal_device journal;
|
||||
|
||||
struct work_struct io_error_work;
|
||||
|
||||
/* The rest of this all shows up in sysfs */
|
||||
atomic_t latency[2];
|
||||
|
||||
struct io_count __percpu *io_done;
|
||||
};
|
||||
|
||||
@ -473,6 +449,7 @@ struct bch_tier {
|
||||
struct bch_pd_controller pd;
|
||||
|
||||
struct bch_devs_mask devs;
|
||||
struct write_point wp;
|
||||
};
|
||||
|
||||
enum bch_fs_state {
|
||||
@ -557,10 +534,7 @@ struct bch_fs {
|
||||
* when allocating btree reserves fail halfway through) - instead, we
|
||||
* can stick them here:
|
||||
*/
|
||||
struct btree_alloc {
|
||||
struct open_bucket *ob;
|
||||
BKEY_PADDED(k);
|
||||
} btree_reserve_cache[BTREE_NODE_RESERVE * 2];
|
||||
struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
|
||||
unsigned btree_reserve_cache_nr;
|
||||
struct mutex btree_reserve_cache_lock;
|
||||
|
||||
@ -573,15 +547,9 @@ struct bch_fs {
|
||||
struct workqueue_struct *copygc_wq;
|
||||
|
||||
/* ALLOCATION */
|
||||
struct rw_semaphore alloc_gc_lock;
|
||||
struct bch_pd_controller foreground_write_pd;
|
||||
struct delayed_work pd_controllers_update;
|
||||
unsigned pd_controllers_update_seconds;
|
||||
spinlock_t foreground_write_pd_lock;
|
||||
struct bch_write_op *write_wait_head;
|
||||
struct bch_write_op *write_wait_tail;
|
||||
|
||||
struct timer_list foreground_write_wakeup;
|
||||
|
||||
/*
|
||||
* These contain all r/w devices - i.e. devices we can currently
|
||||
@ -622,8 +590,8 @@ struct bch_fs {
|
||||
|
||||
struct io_clock io_clock[2];
|
||||
|
||||
/* SECTOR ALLOCATOR */
|
||||
spinlock_t open_buckets_lock;
|
||||
/* ALLOCATOR */
|
||||
spinlock_t freelist_lock;
|
||||
u8 open_buckets_freelist;
|
||||
u8 open_buckets_nr_free;
|
||||
struct closure_waitlist open_buckets_wait;
|
||||
@ -635,15 +603,6 @@ struct bch_fs {
|
||||
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
|
||||
struct mutex write_points_hash_lock;
|
||||
|
||||
/*
|
||||
* This write point is used for migrating data off a device
|
||||
* and can point to any other device.
|
||||
* We can't use the normal write points because those will
|
||||
* gang up n replicas, and for migration we want only one new
|
||||
* replica.
|
||||
*/
|
||||
struct write_point migration_write_point;
|
||||
|
||||
/* GARBAGE COLLECTION */
|
||||
struct task_struct *gc_thread;
|
||||
atomic_t kick_gc;
|
||||
@ -688,6 +647,11 @@ struct bch_fs {
|
||||
|
||||
atomic64_t key_version;
|
||||
|
||||
/* VFS IO PATH - fs-io.c */
|
||||
struct bio_set writepage_bioset;
|
||||
struct bio_set dio_write_bioset;
|
||||
struct bio_set dio_read_bioset;
|
||||
|
||||
struct bio_list btree_write_error_list;
|
||||
struct work_struct btree_write_error_work;
|
||||
spinlock_t btree_write_error_lock;
|
||||
@ -728,19 +692,14 @@ struct bch_fs {
|
||||
|
||||
/* The rest of this all shows up in sysfs */
|
||||
atomic_long_t read_realloc_races;
|
||||
atomic_long_t extent_migrate_done;
|
||||
atomic_long_t extent_migrate_raced;
|
||||
|
||||
unsigned btree_gc_periodic:1;
|
||||
unsigned foreground_write_ratelimit_enabled:1;
|
||||
unsigned copy_gc_enabled:1;
|
||||
unsigned tiering_enabled:1;
|
||||
unsigned tiering_percent;
|
||||
|
||||
/*
|
||||
* foreground writes will be throttled when the number of free
|
||||
* buckets is below this percentage
|
||||
*/
|
||||
unsigned foreground_target_percent;
|
||||
|
||||
#define BCH_DEBUG_PARAM(name, description) bool name;
|
||||
BCH_DEBUG_PARAMS_ALL()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
@@ -344,11 +344,13 @@ struct bch_csum {

enum bch_csum_type {
	BCH_CSUM_NONE = 0,
	BCH_CSUM_CRC32C = 1,
	BCH_CSUM_CRC64 = 2,
	BCH_CSUM_CRC32C_NONZERO = 1,
	BCH_CSUM_CRC64_NONZERO = 2,
	BCH_CSUM_CHACHA20_POLY1305_80 = 3,
	BCH_CSUM_CHACHA20_POLY1305_128 = 4,
	BCH_CSUM_NR = 5,
	BCH_CSUM_CRC32C = 5,
	BCH_CSUM_CRC64 = 6,
	BCH_CSUM_NR = 7,
};

static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
@@ -550,7 +552,7 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
/* Maximum possible size of an entire extent value: */
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
	(BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX)
	(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))

/* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@@ -734,11 +736,13 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC);
/*
 * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
 *	      BCH_MEMBER_DATA_ALLOWED
 * Version 9: incompatible extent nonce change
 */

#define BCH_SB_VERSION_MIN 7
#define BCH_SB_VERSION_EXTENT_MAX 8
#define BCH_SB_VERSION_MAX 8
#define BCH_SB_VERSION_EXTENT_NONCE_V1 9
#define BCH_SB_VERSION_MAX 9

#define BCH_SB_SECTOR 8
#define BCH_SB_LABEL_SIZE 32

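The body of bch2_csum_type_is_encryption() is cut off by the hunk boundary above. A plausible sketch, consistent with the renumbered enum in this hunk (only the ChaCha20/Poly1305 types are authenticated-encryption types) but not shown by the commit itself:

static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
	switch (type) {
	case BCH_CSUM_CHACHA20_POLY1305_80:
	case BCH_CSUM_CHACHA20_POLY1305_128:
		/* the two AEAD checksum types imply encryption */
		return 1;
	default:
		return 0;
	}
}
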
@@ -4,6 +4,14 @@
#include "bset.h"
#include "util.h"

#undef EBUG_ON

#ifdef DEBUG_BKEYS
#define EBUG_ON(cond) BUG_ON(cond)
#else
#define EBUG_ON(cond)
#endif

const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;

struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,

@@ -146,6 +146,17 @@
 * first key in that range of bytes again.
 */

extern bool bch2_expensive_debug_checks;

static inline bool btree_keys_expensive_checks(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
	return false;
#endif
}

struct btree_node_iter;
struct btree_node_iter_set;

@@ -188,7 +199,7 @@ bkey_unpack_key_format_checked(const struct btree *b,
	compiled_unpack_fn unpack_fn = b->aux_data;
	unpack_fn(&dst, src);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
	if (btree_keys_expensive_checks(b)) {
		struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);

		/*
@@ -260,17 +271,6 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
#define for_each_bset(_b, _t) \
	for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)

extern bool bch2_expensive_debug_checks;

static inline bool btree_keys_expensive_checks(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
	return false;
#endif
}

static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
	return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;

@ -24,6 +24,7 @@
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <trace/events/bcachefs.h>
|
||||
|
||||
@ -111,19 +112,35 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
|
||||
/*
|
||||
* For runtime mark and sweep:
|
||||
*/
|
||||
static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k, unsigned flags)
|
||||
static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k, unsigned flags)
|
||||
{
|
||||
struct gc_pos pos = { 0 };
|
||||
struct bch_fs_usage *stats;
|
||||
u8 ret = 0;
|
||||
|
||||
preempt_disable();
|
||||
stats = this_cpu_ptr(c->usage_percpu);
|
||||
switch (type) {
|
||||
case BKEY_TYPE_BTREE:
|
||||
bch2_gc_mark_key(c, k, c->opts.btree_node_size, true, flags);
|
||||
return 0;
|
||||
bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
|
||||
0, flags|
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
break;
|
||||
case BKEY_TYPE_EXTENTS:
|
||||
bch2_gc_mark_key(c, k, k.k->size, false, flags);
|
||||
return bch2_btree_key_recalc_oldest_gen(c, k);
|
||||
bch2_mark_key(c, k, k.k->size, false, pos, stats,
|
||||
0, flags|
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
ret = bch2_btree_key_recalc_oldest_gen(c, k);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
@ -182,7 +199,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
|
||||
max_t(u64, k.k->version.lo,
|
||||
atomic64_read(&c->key_version)));
|
||||
|
||||
bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
|
||||
bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
|
||||
fsck_err:
|
||||
return ret;
|
||||
}
|
||||
@ -200,7 +217,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
|
||||
btree_node_is_extents(b),
|
||||
&unpacked) {
|
||||
bch2_bkey_debugcheck(c, b, k);
|
||||
stale = max(stale, bch2_btree_mark_key(c, type, k, 0));
|
||||
stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
|
||||
}
|
||||
|
||||
return stale;
|
||||
@ -267,123 +284,79 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
|
||||
mutex_lock(&c->btree_root_lock);
|
||||
|
||||
b = c->btree_roots[btree_id].b;
|
||||
bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
|
||||
bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
|
||||
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
|
||||
|
||||
mutex_unlock(&c->btree_root_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bch2_mark_allocator_buckets(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
struct open_bucket *ob;
|
||||
const struct open_bucket_ptr *ptr;
|
||||
size_t i, j, iter;
|
||||
unsigned ci;
|
||||
|
||||
down_write(&c->alloc_gc_lock);
|
||||
|
||||
for_each_member_device(ca, c, ci) {
|
||||
spin_lock(&ca->freelist_lock);
|
||||
|
||||
fifo_for_each_entry(i, &ca->free_inc, iter)
|
||||
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
|
||||
|
||||
for (j = 0; j < RESERVE_NR; j++)
|
||||
fifo_for_each_entry(i, &ca->free[j], iter)
|
||||
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
|
||||
|
||||
for (ptr = ca->open_buckets_partial;
|
||||
ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr;
|
||||
ptr++)
|
||||
bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
|
||||
|
||||
spin_unlock(&ca->freelist_lock);
|
||||
}
|
||||
|
||||
for (ob = c->open_buckets;
|
||||
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
|
||||
ob++) {
|
||||
spin_lock(&ob->lock);
|
||||
open_bucket_for_each_ptr(ob, ptr) {
|
||||
ca = c->devs[ptr->ptr.dev];
|
||||
bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
|
||||
}
|
||||
spin_unlock(&ob->lock);
|
||||
}
|
||||
|
||||
up_write(&c->alloc_gc_lock);
|
||||
}
|
||||
|
||||
static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
|
||||
enum bucket_data_type type)
|
||||
static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
|
||||
u64 start, u64 end,
|
||||
enum bucket_data_type type,
|
||||
unsigned flags)
|
||||
{
|
||||
u64 b = sector_to_bucket(ca, start);
|
||||
|
||||
do {
|
||||
bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true);
|
||||
bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type,
|
||||
gc_phase(GC_PHASE_SB), flags);
|
||||
b++;
|
||||
} while (b < sector_to_bucket(ca, end));
|
||||
}
|
||||
|
||||
static void bch2_dev_mark_superblocks(struct bch_dev *ca)
|
||||
void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < layout->nr_superblocks; i++) {
|
||||
if (layout->sb_offset[i] == BCH_SB_SECTOR)
|
||||
mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
|
||||
BUCKET_SB);
|
||||
|
||||
mark_metadata_sectors(ca,
|
||||
layout->sb_offset[i],
|
||||
layout->sb_offset[i] +
|
||||
(1 << layout->sb_max_size_bits),
|
||||
BUCKET_SB);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark non btree metadata - prios, journal
|
||||
*/
|
||||
void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
unsigned i;
|
||||
u64 b;
|
||||
|
||||
lockdep_assert_held(&c->sb_lock);
|
||||
|
||||
bch2_dev_mark_superblocks(ca);
|
||||
for (i = 0; i < layout->nr_superblocks; i++) {
|
||||
if (layout->sb_offset[i] == BCH_SB_SECTOR)
|
||||
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
|
||||
BUCKET_SB, flags);
|
||||
|
||||
mark_metadata_sectors(c, ca,
|
||||
layout->sb_offset[i],
|
||||
layout->sb_offset[i] +
|
||||
(1 << layout->sb_max_size_bits),
|
||||
BUCKET_SB, flags);
|
||||
}
|
||||
|
||||
spin_lock(&c->journal.lock);
|
||||
|
||||
for (i = 0; i < ca->journal.nr; i++) {
|
||||
b = ca->journal.buckets[i];
|
||||
bch2_mark_metadata_bucket(ca, ca->buckets + b,
|
||||
BUCKET_JOURNAL, true);
|
||||
bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
|
||||
BUCKET_JOURNAL,
|
||||
gc_phase(GC_PHASE_SB), flags);
|
||||
}
|
||||
|
||||
spin_unlock(&c->journal.lock);
|
||||
}
|
||||
|
||||
static void bch2_mark_metadata(struct bch_fs *c)
|
||||
static void bch2_mark_superblocks(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA));
|
||||
gc_pos_set(c, gc_phase(GC_PHASE_SB));
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
bch2_mark_dev_metadata(c, ca);
|
||||
bch2_mark_dev_superblock(c, ca,
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
}
|
||||
|
||||
/* Also see bch2_pending_btree_node_free_insert_done() */
|
||||
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
|
||||
{
|
||||
struct gc_pos pos = { 0 };
|
||||
struct bch_fs_usage stats = { 0 };
|
||||
struct btree_update *as;
|
||||
struct pending_btree_node_free *d;
|
||||
@ -393,10 +366,11 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
|
||||
|
||||
for_each_pending_btree_node_free(c, as, d)
|
||||
if (d->index_update_done)
|
||||
__bch2_mark_key(c, bkey_i_to_s_c(&d->key),
|
||||
c->opts.btree_node_size, true,
|
||||
&stats, 0,
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
|
||||
bch2_mark_key(c, bkey_i_to_s_c(&d->key),
|
||||
c->opts.btree_node_size, true, pos,
|
||||
&stats, 0,
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
/*
|
||||
* Don't apply stats - pending deletes aren't tracked in
|
||||
* bch_alloc_stats:
|
||||
@ -405,6 +379,51 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
}
|
||||
|
||||
static void bch2_mark_allocator_buckets(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
struct open_bucket *ob;
|
||||
size_t i, j, iter;
|
||||
unsigned ci;
|
||||
|
||||
spin_lock(&c->freelist_lock);
|
||||
gc_pos_set(c, gc_pos_alloc(c, NULL));
|
||||
|
||||
for_each_member_device(ca, c, ci) {
|
||||
fifo_for_each_entry(i, &ca->free_inc, iter)
|
||||
bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
|
||||
gc_pos_alloc(c, NULL),
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
|
||||
|
||||
|
||||
for (j = 0; j < RESERVE_NR; j++)
|
||||
fifo_for_each_entry(i, &ca->free[j], iter)
|
||||
bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
|
||||
gc_pos_alloc(c, NULL),
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
}
|
||||
|
||||
spin_unlock(&c->freelist_lock);
|
||||
|
||||
for (ob = c->open_buckets;
|
||||
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
|
||||
ob++) {
|
||||
spin_lock(&ob->lock);
|
||||
if (ob->valid) {
|
||||
gc_pos_set(c, gc_pos_alloc(c, ob));
|
||||
ca = c->devs[ob->ptr.dev];
|
||||
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
|
||||
gc_pos_alloc(c, ob),
|
||||
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
|
||||
BCH_BUCKET_MARK_GC_LOCK_HELD);
|
||||
}
|
||||
spin_unlock(&ob->lock);
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_gc_start(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
@ -495,9 +514,6 @@ void bch2_gc(struct bch_fs *c)
|
||||
|
||||
bch2_gc_start(c);
|
||||
|
||||
/* Walk allocator's references: */
|
||||
bch2_mark_allocator_buckets(c);
|
||||
|
||||
/* Walk btree: */
|
||||
while (c->gc_pos.phase < (int) BTREE_ID_NR) {
|
||||
int ret = c->btree_roots[c->gc_pos.phase].b
|
||||
@ -513,8 +529,9 @@ void bch2_gc(struct bch_fs *c)
|
||||
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
|
||||
}
|
||||
|
||||
bch2_mark_metadata(c);
|
||||
bch2_mark_superblocks(c);
|
||||
bch2_mark_pending_btree_node_frees(c);
|
||||
bch2_mark_allocator_buckets(c);
|
||||
|
||||
for_each_member_device(ca, c, i)
|
||||
atomic_long_set(&ca->saturated_count, 0);
|
||||
@ -570,7 +587,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
|
||||
struct bkey_format new_format;
|
||||
|
||||
memset(new_nodes, 0, sizeof(new_nodes));
|
||||
bch2_keylist_init(&keylist, NULL, 0);
|
||||
bch2_keylist_init(&keylist, NULL);
|
||||
|
||||
/* Count keys that are not deleted */
|
||||
for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
|
||||
@ -1023,8 +1040,6 @@ again:
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_mark_metadata(c);
|
||||
|
||||
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
|
||||
if (iter++ > 2) {
|
||||
bch_info(c, "Unable to fix bucket gens, looping");
|
||||
@ -1043,6 +1058,8 @@ again:
|
||||
if (c->sb.encryption_type)
|
||||
atomic64_add(1 << 16, &c->key_version);
|
||||
|
||||
bch2_mark_superblocks(c);
|
||||
|
||||
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
|
||||
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
|
||||
|
||||
|
@ -13,7 +13,7 @@ int bch2_initial_gc(struct bch_fs *, struct list_head *);
|
||||
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
|
||||
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
|
||||
struct bkey_s_c);
|
||||
void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
|
||||
|
||||
/*
|
||||
* For concurrent mark and sweep (with other index updates), we define a total
|
||||
@ -88,6 +88,14 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
|
||||
};
|
||||
}
|
||||
|
||||
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = GC_PHASE_ALLOC,
|
||||
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
|
||||
};
|
||||
}
|
||||
|
||||
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
|
||||
{
|
||||
unsigned seq;
|
||||
|
@ -146,9 +146,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
|
||||
BUG_ON(iter->data->k > iter->data->end);
|
||||
|
||||
if (iter->data->k == iter->data->end)
|
||||
memmove(&iter->data[0],
|
||||
&iter->data[1],
|
||||
sizeof(iter->data[0]) * --iter->used);
|
||||
array_remove_item(iter->data, iter->used, 0);
|
||||
else
|
||||
sort_iter_sift(iter, cmp);
|
||||
}
|
||||
@ -1307,6 +1305,8 @@ static void btree_node_read_endio(struct bio *bio)
|
||||
struct btree_read_bio *rb =
|
||||
container_of(bio, struct btree_read_bio, bio);
|
||||
|
||||
bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
|
||||
|
||||
INIT_WORK(&rb->work, btree_node_read_work);
|
||||
schedule_work(&rb->work);
|
||||
}
|
||||
@ -1471,6 +1471,8 @@ static void btree_node_write_endio(struct bio *bio)
|
||||
struct bch_fs *c = wbio->c;
|
||||
struct bch_dev *ca = wbio->ca;
|
||||
|
||||
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
|
||||
|
||||
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
|
||||
bch2_meta_write_fault("btree"))
|
||||
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
|
||||
|
@ -10,6 +10,7 @@ struct btree_iter;
|
||||
|
||||
struct btree_read_bio {
|
||||
struct bch_fs *c;
|
||||
unsigned submit_time_us;
|
||||
u64 start_time;
|
||||
struct extent_pick_ptr pick;
|
||||
struct work_struct work;
|
||||
|
@ -91,7 +91,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
|
||||
{
|
||||
int lock_type = btree_node_locked_type(iter, level);
|
||||
|
||||
EBUG_ON(iter->flags & BTREE_ITER_UPTODATE);
|
||||
EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE);
|
||||
|
||||
if (lock_type != BTREE_NODE_UNLOCKED)
|
||||
six_unlock_type(&iter->nodes[level]->lock, lock_type);
|
||||
|
@ -55,6 +55,16 @@ struct btree_write {
|
||||
struct closure_waitlist wait;
|
||||
};
|
||||
|
||||
struct btree_ob_ref {
|
||||
u8 nr;
|
||||
u8 refs[BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
struct btree_alloc {
|
||||
struct btree_ob_ref ob;
|
||||
BKEY_PADDED(k);
|
||||
};
|
||||
|
||||
struct btree {
|
||||
/* Hottest entries first */
|
||||
struct rhash_head hash;
|
||||
@ -118,7 +128,7 @@ struct btree {
|
||||
*/
|
||||
struct btree_update *will_make_reachable;
|
||||
|
||||
struct open_bucket *ob;
|
||||
struct btree_ob_ref ob;
|
||||
|
||||
/* lru list */
|
||||
struct list_head list;
|
||||
@ -317,18 +327,6 @@ struct btree_root {
|
||||
struct btree_iter;
|
||||
struct btree_node_iter;
|
||||
|
||||
enum extent_insert_hook_ret {
|
||||
BTREE_HOOK_DO_INSERT,
|
||||
BTREE_HOOK_NO_INSERT,
|
||||
BTREE_HOOK_RESTART_TRANS,
|
||||
};
|
||||
|
||||
struct extent_insert_hook {
|
||||
enum extent_insert_hook_ret
|
||||
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
|
||||
struct bkey_s_c, const struct bkey_i *);
|
||||
};
|
||||
|
||||
enum btree_insert_ret {
|
||||
BTREE_INSERT_OK,
|
||||
/* extent spanned multiple leaf nodes: have to traverse to next node: */
|
||||
@ -342,6 +340,12 @@ enum btree_insert_ret {
|
||||
BTREE_INSERT_NEED_GC_LOCK,
|
||||
};
|
||||
|
||||
struct extent_insert_hook {
|
||||
enum btree_insert_ret
|
||||
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
|
||||
struct bkey_s_c, const struct bkey_i *);
|
||||
};
|
||||
|
||||
enum btree_gc_coalesce_fail_reason {
|
||||
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
|
||||
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
|
||||
|
@ -211,7 +211,7 @@ found:
|
||||
-c->opts.btree_node_size, true, b
|
||||
? gc_pos_btree_node(b)
|
||||
: gc_pos_btree_root(as->btree_id),
|
||||
&tmp, 0);
|
||||
&tmp, 0, 0);
|
||||
/*
|
||||
* Don't apply tmp - pending deletes aren't tracked in
|
||||
* bch_alloc_stats:
|
||||
@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
|
||||
BUG_ON(btree_node_dirty(b));
|
||||
BUG_ON(btree_node_need_write(b));
|
||||
BUG_ON(b == btree_node_root(c, b));
|
||||
BUG_ON(b->ob);
|
||||
BUG_ON(b->ob.nr);
|
||||
BUG_ON(!list_empty(&b->write_blocked));
|
||||
BUG_ON(b->will_make_reachable);
|
||||
|
||||
@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
|
||||
|
||||
void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct open_bucket *ob = b->ob;
|
||||
struct btree_ob_ref ob = b->ob;
|
||||
|
||||
btree_update_drop_new_node(c, b);
|
||||
|
||||
b->ob = NULL;
|
||||
b->ob.nr = 0;
|
||||
|
||||
clear_btree_node_dirty(b);
|
||||
|
||||
__btree_node_free(c, b, NULL);
|
||||
|
||||
bch2_open_bucket_put(c, ob);
|
||||
bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
|
||||
}
|
||||
|
||||
void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
|
||||
@ -287,7 +287,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
|
||||
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
|
||||
-c->opts.btree_node_size, true,
|
||||
gc_phase(GC_PHASE_PENDING_DELETE),
|
||||
&stats, 0);
|
||||
&stats, 0, 0);
|
||||
/*
|
||||
* Don't apply stats - pending deletes aren't tracked in
|
||||
* bch_alloc_stats:
|
||||
@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
|
||||
|
||||
void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
bch2_open_bucket_put(c, b->ob);
|
||||
b->ob = NULL;
|
||||
bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
|
||||
}
|
||||
|
||||
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
||||
@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
||||
struct closure *cl,
|
||||
unsigned flags)
|
||||
{
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct open_bucket *ob;
|
||||
struct write_point *wp;
|
||||
struct btree *b;
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct bkey_i_extent *e;
|
||||
struct btree_ob_ref ob;
|
||||
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
|
||||
unsigned nr_reserve;
|
||||
enum alloc_reserve alloc_reserve;
|
||||
|
||||
@ -335,31 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
||||
mutex_unlock(&c->btree_reserve_cache_lock);
|
||||
|
||||
retry:
|
||||
/* alloc_sectors is weird, I suppose */
|
||||
bkey_extent_init(&tmp.k);
|
||||
tmp.k.k.size = c->opts.btree_node_size,
|
||||
wp = bch2_alloc_sectors_start(c, NULL,
|
||||
writepoint_ptr(&c->btree_write_point),
|
||||
&devs_have,
|
||||
res->nr_replicas,
|
||||
c->opts.metadata_replicas_required,
|
||||
alloc_reserve, 0, cl);
|
||||
if (IS_ERR(wp))
|
||||
return ERR_CAST(wp);
|
||||
|
||||
ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0,
|
||||
bkey_i_to_extent(&tmp.k),
|
||||
res->nr_replicas,
|
||||
c->opts.metadata_replicas_required,
|
||||
alloc_reserve, 0, cl);
|
||||
if (IS_ERR(ob))
|
||||
return ERR_CAST(ob);
|
||||
if (wp->sectors_free < c->opts.btree_node_size) {
|
||||
struct open_bucket *ob;
|
||||
unsigned i;
|
||||
|
||||
if (tmp.k.k.size < c->opts.btree_node_size) {
|
||||
bch2_open_bucket_put(c, ob);
|
||||
writepoint_for_each_ptr(wp, ob, i)
|
||||
if (ob->sectors_free < c->opts.btree_node_size)
|
||||
ob->sectors_free = 0;
|
||||
|
||||
bch2_alloc_sectors_done(c, wp);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
e = bkey_extent_init(&tmp.k);
|
||||
bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
|
||||
|
||||
ob.nr = 0;
|
||||
bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
|
||||
bch2_alloc_sectors_done(c, wp);
|
||||
mem_alloc:
|
||||
b = bch2_btree_node_mem_alloc(c);
|
||||
|
||||
/* we hold cannibalize_lock: */
|
||||
BUG_ON(IS_ERR(b));
|
||||
BUG_ON(b->ob);
|
||||
BUG_ON(b->ob.nr);
|
||||
|
||||
bkey_copy(&b->key, &tmp.k);
|
||||
b->key.k.size = 0;
|
||||
b->ob = ob;
|
||||
|
||||
return b;
|
||||
@ -466,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
|
||||
&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
|
||||
|
||||
a->ob = b->ob;
|
||||
b->ob = NULL;
|
||||
b->ob.nr = 0;
|
||||
bkey_copy(&a->k, &b->key);
|
||||
} else {
|
||||
bch2_open_bucket_put(c, b->ob);
|
||||
b->ob = NULL;
|
||||
bch2_btree_open_bucket_put(c, b);
|
||||
}
|
||||
|
||||
__btree_node_free(c, b, NULL);
|
||||
@ -857,10 +868,7 @@ static void __btree_interior_update_drop_new_node(struct btree *b)
|
||||
|
||||
BUG();
|
||||
found:
|
||||
as->nr_new_nodes--;
|
||||
memmove(&as->new_nodes[i],
|
||||
&as->new_nodes[i + 1],
|
||||
sizeof(struct btree *) * (as->nr_new_nodes - i));
|
||||
array_remove_item(as->new_nodes, as->nr_new_nodes, i);
|
||||
b->will_make_reachable = NULL;
|
||||
}
|
||||
|
||||
@ -1000,8 +1008,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
|
||||
as->reserve = reserve;
|
||||
INIT_LIST_HEAD(&as->write_blocked_list);
|
||||
|
||||
bch2_keylist_init(&as->parent_keys, as->inline_keys,
|
||||
ARRAY_SIZE(as->inline_keys));
|
||||
bch2_keylist_init(&as->parent_keys, as->inline_keys);
|
||||
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
list_add(&as->list, &c->btree_interior_update_list);
|
||||
@ -1037,7 +1044,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
|
||||
bch2_mark_key(c, bkey_i_to_s_c(&b->key),
|
||||
c->opts.btree_node_size, true,
|
||||
gc_pos_btree_root(b->btree_id),
|
||||
&stats, 0);
|
||||
&stats, 0, 0);
|
||||
|
||||
if (old)
|
||||
bch2_btree_node_free_index(as, NULL,
|
||||
@ -1121,7 +1128,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
|
||||
if (bkey_extent_is_data(&insert->k))
|
||||
bch2_mark_key(c, bkey_i_to_s_c(insert),
|
||||
c->opts.btree_node_size, true,
|
||||
gc_pos_btree_node(b), &stats, 0);
|
||||
gc_pos_btree_node(b), &stats, 0, 0);
|
||||
|
||||
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
|
||||
!btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
|
||||
@ -1479,6 +1486,13 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
|
||||
struct closure cl;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* We already have a disk reservation and open buckets pinned; this
|
||||
* allocation must not block:
|
||||
*/
|
||||
if (iter->btree_id == BTREE_ID_EXTENTS)
|
||||
btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
/* Hack, because gc and splitting nodes doesn't mix yet: */
|
||||
@ -1519,6 +1533,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
|
||||
bch2_btree_iter_set_locks_want(iter, 1);
|
||||
out:
|
||||
up_read(&c->gc_lock);
|
||||
closure_sync(&cl);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1904,7 +1919,7 @@ retry:
|
||||
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
|
||||
c->opts.btree_node_size, true,
|
||||
gc_pos_btree_root(b->btree_id),
|
||||
&stats, 0);
|
||||
&stats, 0, 0);
|
||||
bch2_btree_node_free_index(as, NULL,
|
||||
bkey_i_to_s_c(&b->key),
|
||||
&stats);
|
||||
@ -1928,6 +1943,7 @@ out:
|
||||
}
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
up_read(&c->gc_lock);
|
||||
closure_sync(&cl);
|
||||
return ret;
|
||||
err:
|
||||
if (as)
|
||||
@ -1965,13 +1981,13 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
|
||||
BTREE_INSERT_USE_RESERVE|
|
||||
BTREE_INSERT_USE_ALLOC_RESERVE,
|
||||
&cl);
|
||||
closure_sync(&cl);
|
||||
|
||||
if (!IS_ERR(as))
|
||||
break;
|
||||
|
||||
if (PTR_ERR(as) == -ENOSPC)
|
||||
return PTR_ERR(as);
|
||||
|
||||
closure_sync(&cl);
|
||||
}
|
||||
|
||||
b = __btree_root_alloc(as, 0);
|
||||
|
@ -355,6 +355,11 @@ retry:
|
||||
|
||||
multi_lock_write(c, trans);
|
||||
|
||||
if (race_fault()) {
|
||||
ret = -EINTR;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
u64s = 0;
|
||||
trans_for_each_entry(trans, i) {
|
||||
/* Multiple inserts might go to same leaf: */
|
||||
|
@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
|
||||
stats.online_reserved);
|
||||
}
|
||||
|
||||
static void bch2_dev_stats_verify(struct bch_dev *ca)
|
||||
{
|
||||
struct bch_dev_usage stats =
|
||||
__bch2_dev_usage_read(ca);
|
||||
u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
|
||||
|
||||
BUG_ON(stats.buckets[S_META] > n);
|
||||
BUG_ON(stats.buckets[S_DIRTY] > n);
|
||||
BUG_ON(stats.buckets_cached > n);
|
||||
BUG_ON(stats.buckets_alloc > n);
|
||||
BUG_ON(stats.buckets_unavailable > n);
|
||||
}
|
||||
|
||||
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
|
||||
{
|
||||
if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
|
||||
u64 used = __bch2_fs_sectors_used(c);
|
||||
u64 cached = 0;
|
||||
u64 avail = atomic64_read(&c->sectors_available);
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
|
||||
|
||||
if (used + avail + cached > c->capacity)
|
||||
panic("used %llu avail %llu cached %llu capacity %llu\n",
|
||||
used, avail, cached, c->capacity);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void bch2_fs_stats_verify(struct bch_fs *c) {}
|
||||
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
|
||||
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
|
||||
|
||||
#endif
|
||||
|
||||
@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
|
||||
return bch2_usage_read_raw(ca->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return bch2_usage_read_cached(ca->fs,
|
||||
ca->usage_cached,
|
||||
ca->usage_percpu);
|
||||
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_fs_usage
|
||||
@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m)
|
||||
!m.dirty_sectors && !!m.cached_sectors;
|
||||
}
|
||||
|
||||
static inline int is_unavailable_bucket(struct bucket_mark m)
|
||||
{
|
||||
return !is_available_bucket(m);
|
||||
}
|
||||
|
||||
static inline enum s_alloc bucket_type(struct bucket_mark m)
|
||||
{
|
||||
return is_meta_bucket(m) ? S_META : S_DIRTY;
|
||||
@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c,
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
}
|
||||
|
||||
static void bch2_dev_usage_update(struct bch_dev *ca,
|
||||
struct bucket_mark old, struct bucket_mark new)
|
||||
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket *g, struct bucket_mark old,
|
||||
struct bucket_mark new)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct bch_dev_usage *dev_usage;
|
||||
|
||||
BUG_ON((g - ca->buckets) < ca->mi.first_bucket ||
|
||||
(g - ca->buckets) >= ca->mi.nbuckets);
|
||||
|
||||
bch2_fs_inconsistent_on(old.data_type && new.data_type &&
|
||||
old.data_type != new.data_type, c,
|
||||
"different types of metadata in same bucket: %u, %u",
|
||||
@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
|
||||
preempt_disable();
|
||||
dev_usage = this_cpu_ptr(ca->usage_percpu);
|
||||
|
||||
dev_usage->sectors_cached +=
|
||||
(int) new.cached_sectors - (int) old.cached_sectors;
|
||||
dev_usage->buckets[S_META] +=
|
||||
is_meta_bucket(new) - is_meta_bucket(old);
|
||||
dev_usage->buckets[S_DIRTY] +=
|
||||
is_dirty_bucket(new) - is_dirty_bucket(old);
|
||||
dev_usage->buckets_cached +=
|
||||
is_cached_bucket(new) - is_cached_bucket(old);
|
||||
dev_usage->buckets_alloc +=
|
||||
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
|
||||
dev_usage->buckets_unavailable +=
|
||||
is_unavailable_bucket(new) - is_unavailable_bucket(old);
|
||||
|
||||
dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
|
||||
dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
|
||||
|
||||
dev_usage->buckets_alloc +=
|
||||
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
|
||||
|
||||
dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
|
||||
dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
|
||||
dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
|
||||
dev_usage->sectors_cached +=
|
||||
(int) new.cached_sectors - (int) old.cached_sectors;
|
||||
preempt_enable();
|
||||
|
||||
if (!is_available_bucket(old) && is_available_bucket(new))
|
||||
bch2_wake_allocator(ca);
|
||||
|
||||
bch2_dev_stats_verify(ca);
|
||||
}
|
||||
|
||||
#define bucket_data_cmpxchg(ca, g, new, expr) \
|
||||
#define bucket_data_cmpxchg(c, ca, g, new, expr) \
|
||||
({ \
|
||||
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
|
||||
\
|
||||
bch2_dev_usage_update(ca, _old, new); \
|
||||
bch2_dev_usage_update(c, ca, g, _old, new); \
|
||||
_old; \
|
||||
})
|
||||
|
||||
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
struct bucket_mark *old)
|
||||
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket *g, struct bucket_mark *old)
|
||||
{
|
||||
struct bucket_mark new;
|
||||
|
||||
*old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
lg_local_lock(&c->usage_lock);
|
||||
*old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
if (!is_available_bucket(new))
|
||||
return false;
|
||||
|
||||
@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
new.dirty_sectors = 0;
|
||||
new.gen++;
|
||||
}));
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
|
||||
if (!old->owned_by_allocator && old->cached_sectors)
|
||||
trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
|
||||
@ -319,11 +364,13 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
|
||||
bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket *g)
|
||||
{
|
||||
struct bucket_mark new, old;
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
lg_local_lock(&c->usage_lock);
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
if (new.touched_this_mount ||
|
||||
!is_available_bucket(new))
|
||||
return false;
|
||||
@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
|
||||
new.owned_by_allocator = 1;
|
||||
new.touched_this_mount = 1;
|
||||
}));
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
|
||||
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket *g, bool owned_by_allocator,
|
||||
struct gc_pos pos, unsigned flags)
|
||||
{
|
||||
struct bucket_mark old, new;
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
new.touched_this_mount = 1;
|
||||
new.owned_by_allocator = 0;
|
||||
new.data_type = 0;
|
||||
new.cached_sectors = 0;
|
||||
new.dirty_sectors = 0;
|
||||
}));
|
||||
lg_local_lock(&c->usage_lock);
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos)) {
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
BUG_ON(bucket_became_unavailable(ca->fs, old, new));
|
||||
}
|
||||
|
||||
void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
bool owned_by_allocator)
|
||||
{
|
||||
struct bucket_mark old, new;
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
new.touched_this_mount = 1;
|
||||
new.owned_by_allocator = owned_by_allocator;
|
||||
}));
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
|
||||
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
|
||||
ca->fs->gc_pos.phase == GC_PHASE_DONE);
|
||||
c->gc_pos.phase == GC_PHASE_DONE);
|
||||
}
|
||||
|
||||
#define saturated_add(ca, dst, src, max) \
|
||||
@ -377,41 +419,49 @@ do { \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
|
||||
enum bucket_data_type type,
|
||||
bool may_make_unavailable)
|
||||
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket *g, enum bucket_data_type type,
|
||||
struct gc_pos pos, unsigned flags)
|
||||
{
|
||||
struct bucket_mark old, new;
|
||||
|
||||
BUG_ON(!type);
|
||||
|
||||
old = bucket_data_cmpxchg(ca, g, new, ({
|
||||
lg_local_lock(&c->usage_lock);
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos)) {
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
|
||||
GC_MAX_SECTORS_USED);
|
||||
new.data_type = type;
|
||||
new.touched_this_mount = 1;
|
||||
}));
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
|
||||
if (old.data_type != type &&
|
||||
(old.data_type ||
|
||||
old.cached_sectors ||
|
||||
old.dirty_sectors))
|
||||
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
|
||||
bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
|
||||
g - ca->buckets, old.data_type, new.data_type);
|
||||
|
||||
BUG_ON(!may_make_unavailable &&
|
||||
bucket_became_unavailable(ca->fs, old, new));
|
||||
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
|
||||
bucket_became_unavailable(c, old, new));
|
||||
}
|
||||
|
||||
/* Reverting this until the copygc + compression issue is fixed: */
|
||||
|
||||
static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
|
||||
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
|
||||
{
|
||||
if (!sectors)
|
||||
return 0;
|
||||
|
||||
return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
|
||||
crc_uncompressed_size(NULL, crc)));
|
||||
return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
|
||||
crc.uncompressed_size));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -420,12 +470,12 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
|
||||
* that with the gc pos seqlock held.
|
||||
*/
|
||||
static void bch2_mark_pointer(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
const union bch_extent_crc *crc,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
s64 sectors, enum s_alloc type,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
struct bkey_s_c_extent e,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
s64 sectors, enum s_alloc type,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
struct bucket_mark old, new;
|
||||
unsigned saturated;
|
||||
@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
? BUCKET_BTREE : BUCKET_DATA;
|
||||
u64 v;
|
||||
|
||||
if (crc_compression_type(crc)) {
|
||||
if (crc.compression_type) {
|
||||
unsigned old_sectors, new_sectors;
|
||||
|
||||
if (sectors > 0) {
|
||||
@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
old.counter,
|
||||
new.counter)) != old.counter);
|
||||
|
||||
bch2_dev_usage_update(ca, old, new);
|
||||
bch2_dev_usage_update(c, ca, g, old, new);
|
||||
|
||||
if (old.data_type != data_type &&
|
||||
(old.data_type ||
|
||||
old.cached_sectors ||
|
||||
old.dirty_sectors))
|
||||
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
|
||||
bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
|
||||
g - ca->buckets, old.data_type, new.data_type);
|
||||
|
||||
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
|
||||
@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
}
|
||||
}
|
||||
|
||||
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
|
||||
s64 sectors, bool metadata,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
const union bch_extent_crc *crc;
|
||||
enum s_alloc type = metadata ? S_META : S_DIRTY;
|
||||
unsigned replicas = 0;
|
||||
|
||||
BUG_ON(metadata && bkey_extent_is_cached(e.k));
|
||||
BUG_ON(!sectors);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
bch2_mark_pointer(c, e, crc, ptr, sectors, type,
|
||||
stats, journal_seq, flags);
|
||||
replicas += !ptr->cached;
|
||||
}
|
||||
|
||||
BUG_ON(replicas >= BCH_REPLICAS_MAX);
|
||||
|
||||
if (replicas)
|
||||
stats->s[replicas - 1].data[type] += sectors;
|
||||
}
|
||||
|
||||
void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
|
||||
s64 sectors, bool metadata,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
switch (k.k->type) {
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED:
|
||||
bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
|
||||
stats, journal_seq, flags);
|
||||
break;
|
||||
case BCH_RESERVATION: {
|
||||
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
|
||||
|
||||
if (r.v->nr_replicas)
|
||||
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
|
||||
s64 sectors, bool metadata, unsigned flags)
|
||||
{
|
||||
struct bch_fs_usage stats = { 0 };
|
||||
|
||||
__bch2_mark_key(c, k, sectors, metadata, &stats, 0,
|
||||
flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
|
||||
|
||||
preempt_disable();
|
||||
bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
|
||||
s64 sectors, bool metadata, struct gc_pos gc_pos,
|
||||
struct bch_fs_usage *stats, u64 journal_seq)
|
||||
s64 sectors, bool metadata,
|
||||
struct gc_pos pos,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
unsigned flags = gc_will_visit(c, gc_pos)
|
||||
? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
|
||||
/*
|
||||
* synchronization w.r.t. GC:
|
||||
*
|
||||
@ -614,69 +605,104 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
|
||||
* To know whether we should mark a given reference (GC either isn't
|
||||
* running, or has already marked references at this position) we
|
||||
* construct a total order for everything GC walks. Then, we can simply
|
||||
* compare the position of the reference we're marking - @gc_pos - with
|
||||
* compare the position of the reference we're marking - @pos - with
|
||||
* GC's current position. If GC is going to mark this reference, GC's
|
||||
* current position will be less than @gc_pos; if GC's current position
|
||||
* is greater than @gc_pos GC has either already walked this position,
|
||||
* or isn't running.
|
||||
* current position will be less than @pos; if GC's current position is
|
||||
* greater than @pos GC has either already walked this position, or
|
||||
* isn't running.
|
||||
*
|
||||
* To avoid racing with GC's position changing, we have to deal with
|
||||
* - GC's position being set to GC_POS_MIN when GC starts:
|
||||
* usage_lock guards against this
|
||||
* - GC's position overtaking @gc_pos: we guard against this with
|
||||
* - GC's position overtaking @pos: we guard against this with
|
||||
* whatever lock protects the data structure the reference lives in
|
||||
* (e.g. the btree node lock, or the relevant allocator lock).
|
||||
*/
|
||||
|
||||
lg_local_lock(&c->usage_lock);
|
||||
__bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
|
||||
bch2_fs_stats_verify(c);
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED: {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
enum s_alloc type = metadata ? S_META : S_DIRTY;
|
||||
unsigned replicas = 0;
|
||||
|
||||
BUG_ON(metadata && bkey_extent_is_cached(e.k));
|
||||
BUG_ON(!sectors);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
bch2_mark_pointer(c, e, ptr, crc, sectors, type,
|
||||
stats, journal_seq, flags);
|
||||
replicas += !ptr->cached;
|
||||
}
|
||||
|
||||
BUG_ON(replicas >= BCH_REPLICAS_MAX);
|
||||
|
||||
if (replicas)
|
||||
stats->s[replicas - 1].data[type] += sectors;
|
||||
break;
|
||||
}
|
||||
case BCH_RESERVATION: {
|
||||
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
|
||||
|
||||
if (r.v->nr_replicas)
|
||||
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
|
||||
break;
|
||||
}
|
||||
}
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
}
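To make the synchronization comment above concrete, here is a tiny self-contained model of the position comparison it describes. The field layout is assumed for illustration; the real struct gc_pos and gc_will_visit() live in the btree/GC code.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical, simplified stand-in for struct gc_pos: */
struct sketch_gc_pos {
	unsigned phase;		/* which structure GC is currently walking */
	uint64_t pos;		/* position within that structure */
};

static int sketch_gc_pos_cmp(struct sketch_gc_pos l, struct sketch_gc_pos r)
{
	if (l.phase != r.phase)
		return l.phase < r.phase ? -1 : 1;
	if (l.pos != r.pos)
		return l.pos < r.pos ? -1 : 1;
	return 0;
}

/*
 * GC will still visit (and therefore account for) a reference at @pos iff
 * its current position is strictly before @pos in the total order.
 */
static bool sketch_gc_will_visit(struct sketch_gc_pos gc_cur,
				 struct sketch_gc_pos pos)
{
	return sketch_gc_pos_cmp(gc_cur, pos) < 0;
}

When that comparison says GC will get there anyway, the patch sets BCH_BUCKET_MARK_GC_WILL_VISIT so the work can be left to GC; the usage_lock taken above is what keeps GC from resetting its position to the minimum between the comparison and the marking itself.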
|
||||
|
||||
/* Disk reservations: */
|
||||
|
||||
static u64 __recalc_sectors_available(struct bch_fs *c)
|
||||
{
|
||||
return c->capacity - bch2_fs_sectors_used(c);
|
||||
u64 avail;
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
|
||||
|
||||
avail = c->capacity - bch2_fs_sectors_used(c);
|
||||
|
||||
avail <<= RESERVE_FACTOR;
|
||||
avail /= (1 << RESERVE_FACTOR) + 1;
|
||||
return avail;
|
||||
}
|
||||
|
||||
/* Used by gc when it's starting: */
|
||||
void bch2_recalc_sectors_available(struct bch_fs *c)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
lg_global_lock(&c->usage_lock);
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
|
||||
|
||||
atomic64_set(&c->sectors_available,
|
||||
__recalc_sectors_available(c));
|
||||
|
||||
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
|
||||
lg_global_unlock(&c->usage_lock);
|
||||
}
|
||||
|
||||
void bch2_disk_reservation_put(struct bch_fs *c,
|
||||
struct disk_reservation *res)
|
||||
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
|
||||
{
|
||||
if (res->sectors) {
|
||||
lg_local_lock(&c->usage_lock);
|
||||
this_cpu_sub(c->usage_percpu->online_reserved,
|
||||
res->sectors);
|
||||
lg_local_lock(&c->usage_lock);
|
||||
this_cpu_sub(c->usage_percpu->online_reserved,
|
||||
res->sectors);
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
bch2_fs_stats_verify(c);
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
|
||||
res->sectors = 0;
|
||||
}
|
||||
res->sectors = 0;
|
||||
}
|
||||
|
||||
#define SECTORS_CACHE 1024
|
||||
|
||||
int bch2_disk_reservation_add(struct bch_fs *c,
|
||||
struct disk_reservation *res,
|
||||
unsigned sectors, int flags)
|
||||
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
|
||||
unsigned sectors, int flags)
|
||||
{
|
||||
struct bch_fs_usage *stats;
|
||||
u64 old, new, v;
|
||||
u64 old, v, get;
|
||||
s64 sectors_available;
|
||||
int ret;
|
||||
|
||||
@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c,
|
||||
lg_local_lock(&c->usage_lock);
|
||||
stats = this_cpu_ptr(c->usage_percpu);
|
||||
|
||||
if (sectors >= stats->available_cache)
|
||||
if (sectors <= stats->available_cache)
|
||||
goto out;
|
||||
|
||||
v = atomic64_read(&c->sectors_available);
|
||||
do {
|
||||
old = v;
|
||||
if (old < sectors) {
|
||||
get = min((u64) sectors + SECTORS_CACHE, old);
|
||||
|
||||
if (get < sectors) {
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
goto recalculate;
|
||||
}
|
||||
|
||||
new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
|
||||
} while ((v = atomic64_cmpxchg(&c->sectors_available,
|
||||
old, new)) != old);
|
||||
old, old - get)) != old);
|
||||
|
||||
stats->available_cache += get;
|
||||
|
||||
stats->available_cache += old - new;
|
||||
out:
|
||||
stats->available_cache -= sectors;
|
||||
stats->online_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
|
||||
bch2_disk_reservations_verify(c, flags);
|
||||
bch2_fs_stats_verify(c);
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
return 0;
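The fast path above pulls a batch out of the global sectors_available counter and parks the surplus in the per-cpu available_cache, so most reservations touch only per-cpu state. A rough standalone sketch of the same idea with C11 atomics follows; the SECTORS_CACHE-style margin and all names are illustrative, not the kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_CACHE 1024

static bool sketch_reserve(_Atomic uint64_t *global, uint64_t *local_cache,
			   uint64_t want)
{
	uint64_t old = atomic_load(global), get;

	if (want <= *local_cache)
		goto out;

	do {
		/* grab what we need plus a refill margin, capped at what's left */
		get = old < want + SKETCH_CACHE ? old : want + SKETCH_CACHE;
		if (get < want)
			return false;	/* caller falls back to a slow path */
	} while (!atomic_compare_exchange_weak(global, &old, old - get));

	*local_cache += get;
out:
	*local_cache -= want;
	return true;
}

The slow path in the hunk below plays the role of the false return here: recompute the global figure under the heavier lock and retry.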
|
||||
@ -738,6 +766,8 @@ recalculate:
|
||||
stats->online_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
ret = 0;
|
||||
|
||||
bch2_disk_reservations_verify(c, flags);
|
||||
} else {
|
||||
atomic64_set(&c->sectors_available, sectors_available);
|
||||
ret = -ENOSPC;
|
||||
|
@ -95,37 +95,39 @@ static inline bool bucket_unused(struct bucket_mark mark)
|
||||
/* Per device stats: */
|
||||
|
||||
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
static inline u64 __dev_buckets_available(struct bch_dev *ca,
|
||||
struct bch_dev_usage stats)
|
||||
{
|
||||
return max_t(s64, 0,
|
||||
ca->mi.nbuckets - ca->mi.first_bucket -
|
||||
stats.buckets[S_META] -
|
||||
stats.buckets[S_DIRTY] -
|
||||
stats.buckets_alloc);
|
||||
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
|
||||
|
||||
if (WARN_ONCE(stats.buckets_unavailable > total,
|
||||
"buckets_unavailable overflow\n"))
|
||||
return 0;
|
||||
|
||||
return total - stats.buckets_unavailable;
|
||||
}
|
||||
|
||||
/*
|
||||
* Number of reclaimable buckets - only for use by the allocator thread:
|
||||
*/
|
||||
static inline u64 dev_buckets_available(struct bch_dev *ca)
|
||||
static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
|
||||
return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
|
||||
}
|
||||
|
||||
static inline u64 __dev_buckets_free(struct bch_dev *ca,
|
||||
struct bch_dev_usage stats)
|
||||
struct bch_dev_usage stats)
|
||||
{
|
||||
return __dev_buckets_available(ca, stats) +
|
||||
fifo_used(&ca->free[RESERVE_NONE]) +
|
||||
fifo_used(&ca->free_inc);
|
||||
}
|
||||
|
||||
static inline u64 dev_buckets_free(struct bch_dev *ca)
|
||||
static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return __dev_buckets_free(ca, bch2_dev_usage_read(ca));
|
||||
return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
|
||||
}
|
||||
|
||||
/* Cache set stats: */
|
||||
@ -133,7 +135,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca)
|
||||
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
|
||||
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
|
||||
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
|
||||
struct disk_reservation *, struct gc_pos);
|
||||
struct disk_reservation *, struct gc_pos);
|
||||
|
||||
struct fs_usage_sum {
|
||||
u64 data;
|
||||
@ -155,11 +157,18 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
|
||||
return sum;
|
||||
}
|
||||
|
||||
#define RESERVE_FACTOR 6
|
||||
|
||||
static u64 reserve_factor(u64 r)
|
||||
{
|
||||
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
|
||||
}
|
||||
|
||||
static inline u64 __bch2_fs_sectors_used(struct bch_fs *c)
|
||||
{
|
||||
struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c));
|
||||
|
||||
return sum.data + sum.reserved + (sum.reserved >> 7);
|
||||
return sum.data + reserve_factor(sum.reserved);
|
||||
}
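The two RESERVE_FACTOR conversions introduced here are easy to sanity-check numerically: reserve_factor() charges reserved space at roughly 65/64 of its nominal size, while __recalc_sectors_available() hands out only 64/65 of the remaining headroom. A small standalone check, with the constant and helper re-declared locally for illustration:

#include <stdint.h>
#include <stdio.h>

#define SK_RESERVE_FACTOR 6

static uint64_t sk_round_up(uint64_t v, uint64_t m)	/* m is a power of two */
{
	return (v + m - 1) & ~(m - 1);
}

static uint64_t sk_reserve_factor(uint64_t r)
{
	return r + (sk_round_up(r, 1ULL << SK_RESERVE_FACTOR) >> SK_RESERVE_FACTOR);
}

int main(void)
{
	uint64_t reserved = 1000, headroom = 1000000;

	/* charging side: 1000 reserved sectors are accounted as 1016 */
	printf("%llu\n", (unsigned long long)sk_reserve_factor(reserved));

	/* availability side (__recalc_sectors_available): 1000000 -> 984615 */
	printf("%llu\n", (unsigned long long)
	       ((headroom << SK_RESERVE_FACTOR) / ((1 << SK_RESERVE_FACTOR) + 1)));
	return 0;
}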
|
||||
|
||||
static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
|
||||
@ -184,30 +193,35 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
|
||||
|
||||
void bch2_bucket_seq_cleanup(struct bch_fs *);
|
||||
|
||||
bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
|
||||
struct bucket_mark *);
|
||||
bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
|
||||
void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
|
||||
void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
|
||||
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
|
||||
enum bucket_data_type, bool);
|
||||
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
|
||||
struct bucket *, struct bucket_mark *);
|
||||
bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *,
|
||||
struct bucket *);
|
||||
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
|
||||
struct bucket *, bool,
|
||||
struct gc_pos, unsigned);
|
||||
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
|
||||
struct bucket *, enum bucket_data_type,
|
||||
struct gc_pos, unsigned);
|
||||
|
||||
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
|
||||
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1)
|
||||
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2)
|
||||
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
|
||||
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
|
||||
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
|
||||
|
||||
void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
|
||||
struct bch_fs_usage *, u64, unsigned);
|
||||
|
||||
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c,
|
||||
s64, bool, unsigned);
|
||||
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
|
||||
struct gc_pos, struct bch_fs_usage *, u64);
|
||||
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
|
||||
struct bch_fs_usage *, u64, unsigned);
|
||||
|
||||
void bch2_recalc_sectors_available(struct bch_fs *);
|
||||
|
||||
void bch2_disk_reservation_put(struct bch_fs *,
|
||||
struct disk_reservation *);
|
||||
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
|
||||
|
||||
static inline void bch2_disk_reservation_put(struct bch_fs *c,
|
||||
struct disk_reservation *res)
|
||||
{
|
||||
if (res->sectors)
|
||||
__bch2_disk_reservation_put(c, res);
|
||||
}
|
||||
|
||||
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
|
||||
#define BCH_DISK_RESERVATION_METADATA (1 << 1)
|
||||
|
@ -59,6 +59,7 @@ struct bch_dev_usage {
|
||||
u64 buckets[S_ALLOC_NR];
|
||||
u64 buckets_cached;
|
||||
u64 buckets_alloc;
|
||||
u64 buckets_unavailable;
|
||||
|
||||
/* _compressed_ sectors: */
|
||||
u64 sectors[S_ALLOC_NR];
|
||||
@ -79,13 +80,6 @@ struct bch_fs_usage {
|
||||
u64 available_cache;
|
||||
};
|
||||
|
||||
struct bucket_heap_entry {
|
||||
size_t bucket;
|
||||
struct bucket_mark mark;
|
||||
};
|
||||
|
||||
typedef HEAP(struct bucket_heap_entry) bucket_heap;
|
||||
|
||||
/*
|
||||
* A reservation for space on disk:
|
||||
*/
|
||||
@ -95,4 +89,11 @@ struct disk_reservation {
|
||||
unsigned nr_replicas;
|
||||
};
|
||||
|
||||
struct copygc_heap_entry {
|
||||
u64 offset;
|
||||
struct bucket_mark mark;
|
||||
};
|
||||
|
||||
typedef HEAP(struct copygc_heap_entry) copygc_heap;
|
||||
|
||||
#endif /* _BUCKETS_TYPES_H */
|
||||
|
@ -141,10 +141,14 @@ static u64 bch2_checksum_init(unsigned type)
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
return U32_MAX;
|
||||
case BCH_CSUM_CRC64:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
return U64_MAX;
|
||||
case BCH_CSUM_CRC32C:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC64:
|
||||
return 0;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
@ -155,10 +159,14 @@ static u64 bch2_checksum_final(unsigned type, u64 crc)
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
return crc ^ U32_MAX;
|
||||
case BCH_CSUM_CRC64:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
return crc ^ U64_MAX;
|
||||
case BCH_CSUM_CRC32C:
|
||||
return crc;
|
||||
case BCH_CSUM_CRC64:
|
||||
return crc;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
@ -169,8 +177,10 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
return crc32c(crc, data, len);
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC64:
|
||||
return bch2_crc64_update(crc, data, len);
|
||||
default:
|
||||
@ -243,6 +253,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64: {
|
||||
u64 crc = bch2_checksum_init(type);
|
||||
@ -250,7 +262,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
|
||||
crc = bch2_checksum_update(type, crc, data, len);
|
||||
crc = bch2_checksum_final(type, crc);
|
||||
|
||||
return (struct bch_csum) { .lo = crc };
|
||||
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
|
||||
}
|
||||
|
||||
case BCH_CSUM_CHACHA20_POLY1305_80:
|
||||
@ -281,28 +293,36 @@ void bch2_encrypt(struct bch_fs *c, unsigned type,
|
||||
do_encrypt(c->chacha20, nonce, data, len);
|
||||
}
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio,
|
||||
struct bvec_iter *iter)
|
||||
{
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return (struct bch_csum) { 0 };
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64: {
|
||||
u64 crc = bch2_checksum_init(type);
|
||||
|
||||
bio_for_each_contig_segment(bv, bio, iter) {
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
||||
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
|
||||
crc = bch2_checksum_update(type,
|
||||
crc, p, bv.bv_len);
|
||||
kunmap_atomic(p);
|
||||
}
|
||||
|
||||
#else
|
||||
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
|
||||
crc = bch2_checksum_update(type, crc,
|
||||
page_address(bv.bv_page) + bv.bv_offset,
|
||||
bv.bv_len);
|
||||
#endif
|
||||
crc = bch2_checksum_final(type, crc);
|
||||
return (struct bch_csum) { .lo = crc };
|
||||
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
|
||||
}
|
||||
|
||||
case BCH_CSUM_CHACHA20_POLY1305_80:
|
||||
@ -313,13 +333,19 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
|
||||
gen_poly_key(c, desc, nonce);
|
||||
|
||||
bio_for_each_contig_segment(bv, bio, iter) {
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
||||
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
|
||||
|
||||
crypto_shash_update(desc, p, bv.bv_len);
|
||||
kunmap_atomic(p);
|
||||
}
|
||||
|
||||
#else
|
||||
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
|
||||
crypto_shash_update(desc,
|
||||
page_address(bv.bv_page) + bv.bv_offset,
|
||||
bv.bv_len);
|
||||
#endif
|
||||
crypto_shash_final(desc, digest);
|
||||
|
||||
memcpy(&ret, digest, bch_crc_bytes[type]);
|
||||
@ -330,6 +356,14 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
|
||||
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
|
||||
}
|
||||
|
||||
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
{
|
||||
@ -343,12 +377,12 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
|
||||
bio_for_each_contig_segment(bv, bio, iter) {
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
if (sg == sgl + ARRAY_SIZE(sgl)) {
|
||||
sg_mark_end(sg - 1);
|
||||
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
|
||||
le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
|
||||
nonce = nonce_add(nonce, bytes);
|
||||
bytes = 0;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
@ -357,13 +391,115 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
|
||||
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
|
||||
bytes += bv.bv_len;
|
||||
|
||||
}
|
||||
|
||||
sg_mark_end(sg - 1);
|
||||
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
}
|
||||
|
||||
static inline bool bch2_checksum_mergeable(unsigned type)
|
||||
{
|
||||
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static struct bch_csum bch2_checksum_merge(unsigned type,
|
||||
struct bch_csum a,
|
||||
struct bch_csum b, size_t b_len)
|
||||
{
|
||||
BUG_ON(!bch2_checksum_mergeable(type));
|
||||
|
||||
while (b_len) {
|
||||
unsigned b = min(b_len, PAGE_SIZE);
|
||||
|
||||
a.lo = bch2_checksum_update(type, a.lo,
|
||||
page_address(ZERO_PAGE(0)), b);
|
||||
b_len -= b;
|
||||
}
|
||||
|
||||
a.lo ^= b.lo;
|
||||
a.hi ^= b.hi;
|
||||
return a;
|
||||
}
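bch2_checksum_merge() relies on the plain CRC variants being linear, which is exactly what the checksum_init/checksum_final hunks earlier in this patch arrange (zero seed, no final inversion; the old behaviour moves to the _NONZERO types). Under that assumption crc(A||B) == crc(A || zeroes(len B)) ^ crc(B), which is what appending zero blocks and XORing computes. A standalone demonstration with a bitwise CRC-32C, for illustration only and not the kernel implementation:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* reflected CRC-32C, zero seed, no final XOR */
static uint32_t crc32c_raw(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char a[] = "hello ", b[] = "world";
	uint8_t zeros[sizeof(b) - 1] = { 0 };
	char ab[sizeof(a) + sizeof(b) - 2];

	memcpy(ab, a, sizeof(a) - 1);
	memcpy(ab + sizeof(a) - 1, b, sizeof(b) - 1);

	uint32_t whole  = crc32c_raw(0, ab, sizeof(ab));
	uint32_t merged = crc32c_raw(crc32c_raw(0, a, sizeof(a) - 1),
				     zeros, sizeof(zeros))
			^ crc32c_raw(0, b, sizeof(b) - 1);

	assert(whole == merged);	/* linearity: the merge reproduces the whole-buffer CRC */
	return 0;
}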
|
||||
|
||||
int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
||||
struct bversion version,
|
||||
struct bch_extent_crc_unpacked crc_old,
|
||||
struct bch_extent_crc_unpacked *crc_a,
|
||||
struct bch_extent_crc_unpacked *crc_b,
|
||||
unsigned len_a, unsigned len_b,
|
||||
unsigned new_csum_type)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
struct nonce nonce = extent_nonce(version, crc_old);
|
||||
struct bch_csum merged = { 0 };
|
||||
struct crc_split {
|
||||
struct bch_extent_crc_unpacked *crc;
|
||||
unsigned len;
|
||||
unsigned csum_type;
|
||||
struct bch_csum csum;
|
||||
} splits[3] = {
|
||||
{ crc_a, len_a, new_csum_type },
|
||||
{ crc_b, len_b, new_csum_type },
|
||||
{ NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
|
||||
}, *i;
|
||||
bool mergeable = crc_old.csum_type == new_csum_type &&
|
||||
bch2_checksum_mergeable(new_csum_type);
|
||||
unsigned crc_nonce = crc_old.nonce;
|
||||
|
||||
BUG_ON(len_a + len_b > bio_sectors(bio));
|
||||
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
|
||||
BUG_ON(crc_old.compression_type);
|
||||
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
|
||||
bch2_csum_type_is_encryption(new_csum_type));
|
||||
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
||||
iter.bi_size = i->len << 9;
|
||||
if (mergeable || i->crc)
|
||||
i->csum = __bch2_checksum_bio(c, i->csum_type,
|
||||
nonce, bio, &iter);
|
||||
else
|
||||
bio_advance_iter(bio, &iter, i->len << 9);
|
||||
nonce = nonce_add(nonce, i->len << 9);
|
||||
}
|
||||
|
||||
if (mergeable)
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
|
||||
merged = bch2_checksum_merge(new_csum_type, merged,
|
||||
i->csum, i->len << 9);
|
||||
else
|
||||
merged = bch2_checksum_bio(c, crc_old.csum_type,
|
||||
extent_nonce(version, crc_old), bio);
|
||||
|
||||
if (bch2_crc_cmp(merged, crc_old.csum))
|
||||
return -EIO;
|
||||
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
||||
if (i->crc)
|
||||
*i->crc = (struct bch_extent_crc_unpacked) {
|
||||
.csum_type = i->csum_type,
|
||||
.compressed_size = i->len,
|
||||
.uncompressed_size = i->len,
|
||||
.offset = 0,
|
||||
.live_size = i->len,
|
||||
.nonce = crc_nonce,
|
||||
.csum = i->csum,
|
||||
};
|
||||
|
||||
if (bch2_csum_type_is_encryption(new_csum_type))
|
||||
crc_nonce += i->len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __KERNEL__
|
||||
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
|
||||
{
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define _BCACHEFS_CHECKSUM_H
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "extents_types.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <crypto/chacha20.h>
|
||||
@ -36,7 +37,14 @@ void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
|
||||
void *data, size_t);
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
|
||||
struct nonce, struct bio *);
|
||||
struct nonce, struct bio *);
|
||||
|
||||
int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
|
||||
struct bch_extent_crc_unpacked,
|
||||
struct bch_extent_crc_unpacked *,
|
||||
struct bch_extent_crc_unpacked *,
|
||||
unsigned, unsigned, unsigned);
|
||||
|
||||
void bch2_encrypt_bio(struct bch_fs *, unsigned,
|
||||
struct nonce, struct bio *);
|
||||
|
||||
@ -49,15 +57,16 @@ int bch2_enable_encryption(struct bch_fs *, bool);
|
||||
void bch2_fs_encryption_exit(struct bch_fs *);
|
||||
int bch2_fs_encryption_init(struct bch_fs *);
|
||||
|
||||
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type)
|
||||
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
|
||||
bool data)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_OPT_NONE:
|
||||
return BCH_CSUM_NONE;
|
||||
case BCH_CSUM_OPT_CRC32C:
|
||||
return BCH_CSUM_CRC32C;
|
||||
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
|
||||
case BCH_CSUM_OPT_CRC64:
|
||||
return BCH_CSUM_CRC64;
|
||||
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
@ -70,7 +79,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
|
||||
? BCH_CSUM_CHACHA20_POLY1305_128
|
||||
: BCH_CSUM_CHACHA20_POLY1305_80;
|
||||
|
||||
return bch2_csum_opt_to_type(c->opts.data_checksum);
|
||||
return bch2_csum_opt_to_type(c->opts.data_checksum, true);
|
||||
}
|
||||
|
||||
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
|
||||
@ -78,7 +87,7 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
|
||||
if (c->sb.encryption_type)
|
||||
return BCH_CSUM_CHACHA20_POLY1305_128;
|
||||
|
||||
return bch2_csum_opt_to_type(c->opts.metadata_checksum);
|
||||
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
|
||||
}
|
||||
|
||||
static inline enum bch_compression_type
|
||||
@ -134,6 +143,21 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
|
||||
return nonce;
|
||||
}
|
||||
|
||||
static inline struct nonce extent_nonce(struct bversion version,
|
||||
struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
|
||||
struct nonce nonce = (struct nonce) {{
|
||||
[0] = cpu_to_le32(size << 22),
|
||||
[1] = cpu_to_le32(version.lo),
|
||||
[2] = cpu_to_le32(version.lo >> 32),
|
||||
[3] = cpu_to_le32(version.hi|
|
||||
(crc.compression_type << 24))^BCH_NONCE_EXTENT,
|
||||
}};
|
||||
|
||||
return nonce_add(nonce, crc.nonce << 9);
|
||||
}
|
||||
|
||||
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
|
||||
{
|
||||
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "bcachefs.h"
|
||||
#include "checksum.h"
|
||||
#include "compress.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
@ -145,11 +146,11 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace)
|
||||
}
|
||||
|
||||
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
||||
void *dst_data, struct bch_extent_crc128 crc)
|
||||
void *dst_data, struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
struct bbuf src_data = { NULL };
|
||||
size_t src_len = src->bi_iter.bi_size;
|
||||
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
|
||||
size_t dst_len = crc.uncompressed_size << 9;
|
||||
int ret;
|
||||
|
||||
src_data = bio_map_or_bounce(c, src, READ);
|
||||
@ -212,65 +213,58 @@ err:
|
||||
}
|
||||
|
||||
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
|
||||
unsigned live_data_sectors,
|
||||
struct bch_extent_crc128 crc)
|
||||
struct bch_extent_crc_unpacked *crc)
|
||||
{
|
||||
struct bbuf dst_data = { NULL };
|
||||
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
|
||||
int ret = -ENOMEM;
|
||||
struct bbuf data = { NULL };
|
||||
size_t dst_len = crc->uncompressed_size << 9;
|
||||
|
||||
BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
|
||||
/* bio must own its pages: */
|
||||
BUG_ON(!bio->bi_vcnt);
|
||||
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
|
||||
|
||||
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
|
||||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
|
||||
if (crc->uncompressed_size > c->sb.encoded_extent_max ||
|
||||
crc->compressed_size > c->sb.encoded_extent_max) {
|
||||
bch_err(c, "error rewriting existing data: extent too big");
|
||||
return -EIO;
|
||||
|
||||
dst_data = __bounce_alloc(c, dst_len, WRITE);
|
||||
|
||||
ret = __bio_uncompress(c, bio, dst_data.b, crc);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
|
||||
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
|
||||
|
||||
bv->bv_page = alloc_page(GFP_NOIO);
|
||||
if (!bv->bv_page)
|
||||
goto use_mempool;
|
||||
|
||||
bv->bv_len = PAGE_SIZE;
|
||||
bv->bv_offset = 0;
|
||||
bio->bi_vcnt++;
|
||||
}
|
||||
|
||||
bio->bi_iter.bi_size = live_data_sectors << 9;
|
||||
copy_data:
|
||||
memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9));
|
||||
err:
|
||||
bio_unmap_or_unbounce(c, dst_data);
|
||||
return ret;
|
||||
use_mempool:
|
||||
/*
|
||||
* We already allocated from mempool, we can't allocate from it again
|
||||
* without freeing the pages we already allocated or else we could
|
||||
* deadlock:
|
||||
*/
|
||||
data = __bounce_alloc(c, dst_len, WRITE);
|
||||
|
||||
bch2_bio_free_pages_pool(c, bio);
|
||||
bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
|
||||
goto copy_data;
|
||||
if (__bio_uncompress(c, bio, data.b, *crc)) {
|
||||
bch_err(c, "error rewriting existing data: decompression error");
|
||||
bio_unmap_or_unbounce(c, data);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/*
|
||||
* might have to free existing pages and retry allocation from mempool -
|
||||
* do this _after_ decompressing:
|
||||
*/
|
||||
bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
|
||||
|
||||
memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
|
||||
|
||||
crc->csum_type = 0;
|
||||
crc->compression_type = 0;
|
||||
crc->compressed_size = crc->live_size;
|
||||
crc->uncompressed_size = crc->live_size;
|
||||
crc->offset = 0;
|
||||
crc->csum = (struct bch_csum) { 0, 0 };
|
||||
|
||||
bio_unmap_or_unbounce(c, data);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
|
||||
struct bio *dst, struct bvec_iter dst_iter,
|
||||
struct bch_extent_crc128 crc)
|
||||
struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
struct bbuf dst_data = { NULL };
|
||||
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
|
||||
size_t dst_len = crc.uncompressed_size << 9;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
|
||||
crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
|
||||
if (crc.uncompressed_size > c->sb.encoded_extent_max ||
|
||||
crc.compressed_size > c->sb.encoded_extent_max)
|
||||
return -EIO;
|
||||
|
||||
dst_data = dst_len == dst_iter.bi_size
|
||||
@ -288,21 +282,25 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __bio_compress(struct bch_fs *c,
|
||||
struct bio *dst, size_t *dst_len,
|
||||
struct bio *src, size_t *src_len,
|
||||
unsigned *compression_type)
|
||||
static unsigned __bio_compress(struct bch_fs *c,
|
||||
struct bio *dst, size_t *dst_len,
|
||||
struct bio *src, size_t *src_len,
|
||||
unsigned compression_type)
|
||||
{
|
||||
struct bbuf src_data = { NULL }, dst_data = { NULL };
|
||||
unsigned pad;
|
||||
int ret = 0;
|
||||
|
||||
/* If it's only one block, don't bother trying to compress: */
|
||||
if (bio_sectors(src) <= c->opts.block_size)
|
||||
goto err;
|
||||
|
||||
dst_data = bio_map_or_bounce(c, dst, WRITE);
|
||||
src_data = bio_map_or_bounce(c, src, READ);
|
||||
|
||||
switch (*compression_type) {
|
||||
switch (compression_type) {
|
||||
case BCH_COMPRESSION_LZ4_OLD:
|
||||
*compression_type = BCH_COMPRESSION_LZ4;
|
||||
compression_type = BCH_COMPRESSION_LZ4;
|
||||
|
||||
case BCH_COMPRESSION_LZ4: {
|
||||
void *workspace;
|
||||
@ -403,19 +401,24 @@ zlib_err:
|
||||
|
||||
if (dst_data.type != BB_NONE)
|
||||
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
|
||||
|
||||
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
|
||||
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
|
||||
BUG_ON(*dst_len & (block_bytes(c) - 1));
|
||||
BUG_ON(*src_len & (block_bytes(c) - 1));
|
||||
out:
|
||||
bio_unmap_or_unbounce(c, src_data);
|
||||
bio_unmap_or_unbounce(c, dst_data);
|
||||
return ret;
|
||||
return compression_type;
|
||||
err:
|
||||
ret = -1;
|
||||
compression_type = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
void bch2_bio_compress(struct bch_fs *c,
|
||||
struct bio *dst, size_t *dst_len,
|
||||
struct bio *src, size_t *src_len,
|
||||
unsigned *compression_type)
|
||||
unsigned bch2_bio_compress(struct bch_fs *c,
|
||||
struct bio *dst, size_t *dst_len,
|
||||
struct bio *src, size_t *src_len,
|
||||
unsigned compression_type)
|
||||
{
|
||||
unsigned orig_dst = dst->bi_iter.bi_size;
|
||||
unsigned orig_src = src->bi_iter.bi_size;
|
||||
@ -423,29 +426,15 @@ void bch2_bio_compress(struct bch_fs *c,
|
||||
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
|
||||
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
|
||||
c->sb.encoded_extent_max << 9);
|
||||
|
||||
/* Don't generate a bigger output than input: */
|
||||
dst->bi_iter.bi_size =
|
||||
min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
|
||||
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
|
||||
|
||||
/* If it's only one block, don't bother trying to compress: */
|
||||
if (*compression_type != BCH_COMPRESSION_NONE &&
|
||||
bio_sectors(src) > c->opts.block_size &&
|
||||
!__bio_compress(c, dst, dst_len, src, src_len, compression_type))
|
||||
goto out;
|
||||
compression_type =
|
||||
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
|
||||
|
||||
/* If compressing failed (didn't get smaller), just copy: */
|
||||
*compression_type = BCH_COMPRESSION_NONE;
|
||||
*dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
|
||||
bio_copy_data(dst, src);
|
||||
out:
|
||||
dst->bi_iter.bi_size = orig_dst;
|
||||
src->bi_iter.bi_size = orig_src;
|
||||
|
||||
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
|
||||
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
|
||||
BUG_ON(*dst_len & (block_bytes(c) - 1));
|
||||
BUG_ON(*src_len & (block_bytes(c) - 1));
|
||||
return compression_type;
|
||||
}
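bch2_bio_compress() now returns the compression type that was actually used, with the "didn't get smaller, just copy" fallback folded in. The shape of that contract in a standalone miniature; the toy compressor and names here are made up for illustration and are not bcachefs functions:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

enum sk_compression { SK_NONE, SK_RLE };

/* Toy "compressor": succeeds only for a buffer of one repeated byte. */
static bool sk_try_compress(unsigned char *dst, size_t *dst_len,
			    const unsigned char *src, size_t src_len)
{
	for (size_t i = 1; i < src_len; i++)
		if (src[i] != src[0])
			return false;
	dst[0] = src[0];
	*dst_len = 1;
	return true;
}

/*
 * Mirror of the contract above: attempt compression, and if the result is
 * not strictly smaller, store the data verbatim and report SK_NONE so the
 * reader of the extent knows nothing needs decompressing.
 */
static enum sk_compression sk_store(unsigned char *dst, size_t *dst_len,
				    const unsigned char *src, size_t src_len)
{
	if (sk_try_compress(dst, dst_len, src, src_len) && *dst_len < src_len)
		return SK_RLE;

	memcpy(dst, src, src_len);
	*dst_len = src_len;
	return SK_NONE;
}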
|
||||
|
||||
/* doesn't write superblock: */
|
||||
|
@ -1,12 +1,14 @@
|
||||
#ifndef _BCACHEFS_COMPRESS_H
|
||||
#define _BCACHEFS_COMPRESS_H
|
||||
|
||||
#include "extents_types.h"
|
||||
|
||||
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
|
||||
unsigned, struct bch_extent_crc128);
|
||||
struct bch_extent_crc_unpacked *);
|
||||
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
|
||||
struct bvec_iter, struct bch_extent_crc128);
|
||||
void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
|
||||
struct bio *, size_t *, unsigned *);
|
||||
struct bvec_iter, struct bch_extent_crc_unpacked);
|
||||
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
|
||||
struct bio *, size_t *, unsigned);
|
||||
|
||||
int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
|
||||
void bch2_fs_compress_exit(struct bch_fs *);
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "inode.h"
|
||||
#include "journal.h"
|
||||
#include "super-io.h"
|
||||
#include "util.h"
|
||||
#include "xattr.h"
|
||||
|
||||
#include <trace/events/bcachefs.h>
|
||||
@ -155,6 +156,44 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
|
||||
return nr_ptrs;
|
||||
}
|
||||
|
||||
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
|
||||
{
|
||||
struct bkey_s_c_extent e;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
unsigned ret = 0;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED:
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (!ptr->cached &&
|
||||
crc.compression_type != BCH_COMPRESSION_NONE &&
|
||||
crc.compressed_size < crc.live_size)
|
||||
ret = max_t(unsigned, ret, crc.compressed_size);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
|
||||
struct bch_extent_ptr m, u64 offset)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (ptr->dev == m.dev &&
|
||||
ptr->gen == m.gen &&
|
||||
(s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
|
||||
(s64) m.offset - offset)
|
||||
return ptr;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Doesn't cleanup redundant crcs */
|
||||
void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
|
||||
{
|
||||
@ -186,24 +225,30 @@ found:
|
||||
bch2_extent_drop_ptr(e, ptr);
|
||||
}
|
||||
|
||||
/* returns true if equal */
|
||||
static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
|
||||
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
|
||||
struct bch_extent_crc_unpacked n)
|
||||
{
|
||||
return extent_crc_type(l) == extent_crc_type(r) &&
|
||||
!memcmp(l, r, extent_entry_bytes(to_entry(l)));
|
||||
return !u.compression_type &&
|
||||
u.csum_type &&
|
||||
u.uncompressed_size > u.live_size &&
|
||||
bch2_csum_type_is_encryption(u.csum_type) ==
|
||||
bch2_csum_type_is_encryption(n.csum_type);
|
||||
}
|
||||
|
||||
/* Increment pointers after @crc by crc's offset until the next crc entry: */
|
||||
void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
|
||||
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
|
||||
struct bch_extent_crc_unpacked n)
|
||||
{
|
||||
union bch_extent_entry *entry;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
const union bch_extent_entry *i;
|
||||
|
||||
extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
|
||||
if (!extent_entry_is_ptr(entry))
|
||||
return;
|
||||
if (!n.csum_type)
|
||||
return false;
|
||||
|
||||
entry->ptr.offset += crc_offset(crc);
|
||||
}
|
||||
extent_for_each_crc(e, crc, i)
|
||||
if (can_narrow_crc(crc, n))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -214,96 +259,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr
|
||||
* not compressed, we can modify them to point to only the data that is
|
||||
* currently live (so that readers won't have to bounce) while we've got the
|
||||
* checksum we need:
|
||||
*
|
||||
* XXX: to guard against data being corrupted while in memory, instead of
|
||||
* recomputing the checksum here, it would be better in the read path to instead
|
||||
* of computing the checksum of the entire extent:
|
||||
*
|
||||
* | extent |
|
||||
*
|
||||
* compute the checksums of the live and dead data separately
|
||||
* | dead data || live data || dead data |
|
||||
*
|
||||
* and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
|
||||
* use crc_live here (that we verified was correct earlier)
|
||||
*
|
||||
* note: doesn't work with encryption
|
||||
*/
|
||||
void bch2_extent_narrow_crcs(struct bkey_s_extent e)
|
||||
bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
|
||||
struct bch_extent_crc_unpacked n)
|
||||
{
|
||||
union bch_extent_crc *crc;
|
||||
bool have_wide = false, have_narrow = false;
|
||||
struct bch_csum csum = { 0 };
|
||||
unsigned csum_type = 0;
|
||||
struct bch_extent_crc_unpacked u;
|
||||
struct bch_extent_ptr *ptr;
|
||||
union bch_extent_entry *i;
|
||||
|
||||
extent_for_each_crc(e, crc) {
|
||||
if (crc_compression_type(crc) ||
|
||||
bch2_csum_type_is_encryption(crc_csum_type(crc)))
|
||||
continue;
|
||||
|
||||
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
|
||||
have_wide = true;
|
||||
} else {
|
||||
have_narrow = true;
|
||||
csum = crc_csum(crc);
|
||||
csum_type = crc_csum_type(crc);
|
||||
}
|
||||
}
|
||||
|
||||
if (!have_wide || !have_narrow)
|
||||
return;
|
||||
|
||||
extent_for_each_crc(e, crc) {
|
||||
if (crc_compression_type(crc))
|
||||
continue;
|
||||
|
||||
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
BUG();
|
||||
case BCH_EXTENT_CRC32:
|
||||
if (bch_crc_bytes[csum_type] > 4)
|
||||
continue;
|
||||
|
||||
bch2_extent_crc_narrow_pointers(e, crc);
|
||||
crc->crc32._compressed_size = e.k->size - 1;
|
||||
crc->crc32._uncompressed_size = e.k->size - 1;
|
||||
crc->crc32.offset = 0;
|
||||
crc->crc32.csum_type = csum_type;
|
||||
crc->crc32.csum = csum.lo;
|
||||
break;
|
||||
case BCH_EXTENT_CRC64:
|
||||
if (bch_crc_bytes[csum_type] > 10)
|
||||
continue;
|
||||
|
||||
bch2_extent_crc_narrow_pointers(e, crc);
|
||||
crc->crc64._compressed_size = e.k->size - 1;
|
||||
crc->crc64._uncompressed_size = e.k->size - 1;
|
||||
crc->crc64.offset = 0;
|
||||
crc->crc64.csum_type = csum_type;
|
||||
crc->crc64.csum_lo = csum.lo;
|
||||
crc->crc64.csum_hi = csum.hi;
|
||||
break;
|
||||
case BCH_EXTENT_CRC128:
|
||||
if (bch_crc_bytes[csum_type] > 16)
|
||||
continue;
|
||||
|
||||
bch2_extent_crc_narrow_pointers(e, crc);
|
||||
crc->crc128._compressed_size = e.k->size - 1;
|
||||
crc->crc128._uncompressed_size = e.k->size - 1;
|
||||
crc->crc128.offset = 0;
|
||||
crc->crc128.csum_type = csum_type;
|
||||
crc->crc128.csum = csum;
|
||||
/* Find a checksum entry that covers only live data: */
|
||||
if (!n.csum_type)
|
||||
extent_for_each_crc(extent_i_to_s(e), u, i)
|
||||
if (!u.compression_type &&
|
||||
u.csum_type &&
|
||||
u.live_size == u.uncompressed_size) {
|
||||
n = u;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
|
||||
return false;
|
||||
|
||||
BUG_ON(n.compression_type);
|
||||
BUG_ON(n.offset);
|
||||
BUG_ON(n.live_size != e->k.size);
|
||||
|
||||
bch2_extent_crc_append(e, n);
|
||||
restart_narrow_pointers:
|
||||
extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
|
||||
if (can_narrow_crc(u, n)) {
|
||||
ptr->offset += u.offset;
|
||||
extent_ptr_append(e, *ptr);
|
||||
__bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
|
||||
goto restart_narrow_pointers;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
|
||||
{
|
||||
union bch_extent_entry *entry = e.v->start;
|
||||
union bch_extent_crc *crc, *prev = NULL;
|
||||
struct bch_extent_crc_unpacked u, prev_u;
|
||||
|
||||
while (entry != extent_entry_last(e)) {
|
||||
union bch_extent_entry *next = extent_entry_next(entry);
|
||||
@ -313,6 +312,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
|
||||
goto next;
|
||||
|
||||
crc = entry_to_crc(entry);
|
||||
u = bch2_extent_crc_unpack(e.k, crc);
|
||||
|
||||
if (next == extent_entry_last(e)) {
|
||||
/* crc entry with no pointers after it: */
|
||||
@ -324,20 +324,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
|
||||
goto drop;
|
||||
}
|
||||
|
||||
if (prev && crc_cmp(crc, prev)) {
|
||||
if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
|
||||
/* identical to previous crc entry: */
|
||||
goto drop;
|
||||
}
|
||||
|
||||
if (!prev &&
|
||||
!crc_csum_type(crc) &&
|
||||
!crc_compression_type(crc)) {
|
||||
!u.csum_type &&
|
||||
!u.compression_type) {
|
||||
/* null crc entry: */
|
||||
bch2_extent_crc_narrow_pointers(e, crc);
|
||||
union bch_extent_entry *e2;
|
||||
|
||||
extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
|
||||
if (!extent_entry_is_ptr(e2))
|
||||
break;
|
||||
|
||||
e2->ptr.offset += u.offset;
|
||||
}
|
||||
goto drop;
|
||||
}
|
||||
|
||||
prev = crc;
|
||||
prev_u = u;
|
||||
next:
|
||||
entry = next;
|
||||
continue;
|
||||
@ -453,7 +461,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
const union bch_extent_entry *entry;
|
||||
const union bch_extent_crc *crc;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_dev *ca;
|
||||
bool first = true;
|
||||
@ -468,13 +476,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
|
||||
case BCH_EXTENT_ENTRY_crc32:
|
||||
case BCH_EXTENT_ENTRY_crc64:
|
||||
case BCH_EXTENT_ENTRY_crc128:
|
||||
crc = entry_to_crc(entry);
|
||||
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
|
||||
|
||||
p("crc: c_size %u size %u offset %u csum %u compress %u",
|
||||
crc_compressed_size(e.k, crc),
|
||||
crc_uncompressed_size(e.k, crc),
|
||||
crc_offset(crc), crc_csum_type(crc),
|
||||
crc_compression_type(crc));
|
||||
p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
|
||||
crc.compressed_size,
|
||||
crc.uncompressed_size,
|
||||
crc.offset, crc.nonce,
|
||||
crc.csum_type,
|
||||
crc.compression_type);
|
||||
break;
|
||||
case BCH_EXTENT_ENTRY_ptr:
|
||||
ptr = entry_to_ptr(entry);
|
||||
@ -499,13 +508,24 @@ out:
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
static inline bool dev_latency_better(struct bch_dev *dev1,
|
||||
struct bch_dev *dev2)
|
||||
{
|
||||
unsigned l1 = atomic_read(&dev1->latency[READ]);
|
||||
unsigned l2 = atomic_read(&dev2->latency[READ]);
|
||||
|
||||
/* Pick at random, biased in favor of the faster device: */
|
||||
|
||||
return bch2_rand_range(l1 + l2) > l1;
|
||||
}
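dev_latency_better() turns the read-path device choice into a biased coin: with measured read latencies l1 and l2, the candidate wins with probability roughly l2 / (l1 + l2), so the lower-latency device is picked more often without ever starving the other. A standalone sketch, where rand() stands in for bch2_rand_range():

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* stand-in for bch2_rand_range(): uniform value in [0, max) */
static unsigned sk_rand_range(unsigned max)
{
	return max ? (unsigned)rand() % max : 0;
}

/* true: prefer the candidate (latency l1) over the current pick (latency l2) */
static bool sk_latency_better(unsigned l1, unsigned l2)
{
	return sk_rand_range(l1 + l2) > l1;
}

int main(void)
{
	unsigned wins = 0, trials = 100000;

	/* candidate at 2ms vs current pick at 8ms: expect roughly 80% preference */
	for (unsigned i = 0; i < trials; i++)
		wins += sk_latency_better(2000, 8000);

	printf("preferred the faster device %.1f%% of the time\n",
	       100.0 * wins / trials);
	return 0;
}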
|
||||
|
||||
static void extent_pick_read_device(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *pick)
|
||||
{
|
||||
const union bch_extent_crc *crc;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
struct bch_dev *ca = c->devs[ptr->dev];
|
||||
@ -516,12 +536,18 @@ static void extent_pick_read_device(struct bch_fs *c,
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
|
||||
continue;
|
||||
|
||||
if (avoid && test_bit(ca->dev_idx, avoid->d))
|
||||
continue;
|
||||
if (avoid) {
|
||||
if (test_bit(ca->dev_idx, avoid->d))
|
||||
continue;
|
||||
|
||||
if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
|
||||
continue;
|
||||
if (pick->ca &&
|
||||
test_bit(pick->ca->dev_idx, avoid->d))
|
||||
goto use;
|
||||
}
|
||||
|
||||
if (pick->ca && !dev_latency_better(ca, pick->ca))
|
||||
continue;
|
||||
use:
|
||||
if (!percpu_ref_tryget(&ca->io_ref))
|
||||
continue;
|
||||
|
||||
@ -530,11 +556,9 @@ static void extent_pick_read_device(struct bch_fs *c,
|
||||
|
||||
*pick = (struct extent_pick_ptr) {
|
||||
.ptr = *ptr,
|
||||
.crc = crc,
|
||||
.ca = ca,
|
||||
};
|
||||
|
||||
if (e.k->size)
|
||||
pick->crc = crc_to_128(e.k, crc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -557,14 +581,17 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const union bch_extent_entry *entry;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
const union bch_extent_crc *crc;
|
||||
const char *reason;
|
||||
|
||||
extent_for_each_entry(e, entry)
|
||||
extent_for_each_entry(e, entry) {
|
||||
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
|
||||
return "invalid extent entry type";
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
if (extent_entry_is_crc(entry))
|
||||
return "has crc field";
|
||||
}
|
||||
|
||||
extent_for_each_ptr(e, ptr) {
|
||||
reason = extent_ptr_invalid(c, e, ptr,
|
||||
c->opts.btree_node_size,
|
||||
true);
|
||||
@ -572,9 +599,6 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
|
||||
return reason;
|
||||
}
|
||||
|
||||
if (crc)
|
||||
return "has crc field";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -699,28 +723,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
|
||||
__set_bkey_deleted(k.k);
|
||||
else if (bkey_extent_is_data(k.k)) {
|
||||
struct bkey_s_extent e = bkey_s_to_extent(k);
|
||||
struct bch_extent_ptr *ptr;
|
||||
union bch_extent_crc *crc, *prev_crc = NULL;
|
||||
union bch_extent_entry *entry;
|
||||
bool seen_crc = false;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
ptr->offset += e.k->size - len;
|
||||
extent_for_each_entry(e, entry) {
|
||||
switch (extent_entry_type(entry)) {
|
||||
case BCH_EXTENT_ENTRY_ptr:
|
||||
if (!seen_crc)
|
||||
entry->ptr.offset += e.k->size - len;
|
||||
break;
|
||||
case BCH_EXTENT_CRC32:
|
||||
if (prev_crc != crc)
|
||||
crc->crc32.offset += e.k->size - len;
|
||||
case BCH_EXTENT_ENTRY_crc32:
|
||||
entry->crc32.offset += e.k->size - len;
|
||||
break;
|
||||
case BCH_EXTENT_CRC64:
|
||||
if (prev_crc != crc)
|
||||
crc->crc64.offset += e.k->size - len;
|
||||
case BCH_EXTENT_ENTRY_crc64:
|
||||
entry->crc64.offset += e.k->size - len;
|
||||
break;
|
||||
case BCH_EXTENT_CRC128:
|
||||
if (prev_crc != crc)
|
||||
crc->crc128.offset += e.k->size - len;
|
||||
case BCH_EXTENT_ENTRY_crc128:
|
||||
entry->crc128.offset += e.k->size - len;
|
||||
break;
|
||||
}
|
||||
prev_crc = crc;
|
||||
|
||||
if (extent_entry_is_crc(entry))
|
||||
seen_crc = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -989,7 +1013,7 @@ static void bch2_add_sectors(struct extent_insert_state *s,
|
||||
return;
|
||||
|
||||
bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
|
||||
&s->stats, s->trans->journal_res.seq);
|
||||
&s->stats, s->trans->journal_res.seq, 0);
|
||||
}
|
||||
|
||||
static void bch2_subtract_sectors(struct extent_insert_state *s,
|
||||
@ -1123,7 +1147,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
|
||||
|
||||
if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
|
||||
bkey_cmp(s->committed, insert->k.p) &&
|
||||
bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
|
||||
bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
|
||||
/* XXX: possibly need to increase our reservation? */
|
||||
bch2_cut_subtract_back(s, s->committed,
|
||||
bkey_i_to_s(&split.k));
|
||||
@ -1152,46 +1176,24 @@ done:
|
||||
s->trans->did_work = true;
|
||||
}
|
||||
|
||||
static enum extent_insert_hook_ret
|
||||
static enum btree_insert_ret
|
||||
__extent_insert_advance_pos(struct extent_insert_state *s,
|
||||
struct bpos next_pos,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct extent_insert_hook *hook = s->trans->hook;
|
||||
enum extent_insert_hook_ret ret;
|
||||
#if 0
|
||||
/*
|
||||
* Currently disabled for encryption - broken with fcollapse. Will have
|
||||
* to reenable when versions are exposed for send/receive - versions
|
||||
* will have to be monotonic then:
|
||||
*/
|
||||
if (k.k && k.k->size &&
|
||||
!bversion_zero(s->insert->k->k.version) &&
|
||||
bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
|
||||
ret = BTREE_HOOK_NO_INSERT;
|
||||
} else
|
||||
#endif
|
||||
enum btree_insert_ret ret;
|
||||
|
||||
if (hook)
|
||||
ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
|
||||
else
|
||||
ret = BTREE_HOOK_DO_INSERT;
|
||||
ret = BTREE_INSERT_OK;
|
||||
|
||||
EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
|
||||
|
||||
switch (ret) {
|
||||
case BTREE_HOOK_DO_INSERT:
|
||||
break;
|
||||
case BTREE_HOOK_NO_INSERT:
|
||||
extent_insert_committed(s);
|
||||
bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
|
||||
if (ret == BTREE_INSERT_OK)
|
||||
s->committed = next_pos;
|
||||
|
||||
bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
|
||||
break;
|
||||
case BTREE_HOOK_RESTART_TRANS:
|
||||
return ret;
|
||||
}
|
||||
|
||||
s->committed = next_pos;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1199,39 +1201,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
|
||||
* Update iter->pos, marking how much of @insert we've processed, and call hook
|
||||
* fn:
|
||||
*/
|
||||
static enum extent_insert_hook_ret
|
||||
static enum btree_insert_ret
|
||||
extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
|
||||
{
|
||||
struct btree *b = s->insert->iter->nodes[0];
|
||||
struct bpos next_pos = bpos_min(s->insert->k->k.p,
|
||||
k.k ? k.k->p : b->key.k.p);
|
||||
enum btree_insert_ret ret;
|
||||
|
||||
if (race_fault())
|
||||
return BTREE_INSERT_NEED_TRAVERSE;
|
||||
|
||||
/* hole? */
|
||||
if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
|
||||
bool have_uncommitted = bkey_cmp(s->committed,
|
||||
bkey_start_pos(&s->insert->k->k)) > 0;
|
||||
|
||||
switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
|
||||
bkey_s_c_null)) {
|
||||
case BTREE_HOOK_DO_INSERT:
|
||||
break;
|
||||
case BTREE_HOOK_NO_INSERT:
|
||||
/*
|
||||
* we had to split @insert and insert the committed
|
||||
* part - need to bail out and recheck journal
|
||||
* reservation/btree node before we advance pos past @k:
|
||||
*/
|
||||
if (have_uncommitted)
|
||||
return BTREE_HOOK_NO_INSERT;
|
||||
break;
|
||||
case BTREE_HOOK_RESTART_TRANS:
|
||||
return BTREE_HOOK_RESTART_TRANS;
|
||||
}
|
||||
ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
|
||||
bkey_s_c_null);
|
||||
if (ret != BTREE_INSERT_OK)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* avoid redundant calls to hook fn: */
|
||||
if (!bkey_cmp(s->committed, next_pos))
|
||||
return BTREE_HOOK_DO_INSERT;
|
||||
return BTREE_INSERT_OK;
|
||||
|
||||
return __extent_insert_advance_pos(s, next_pos, k);
|
||||
}
|
||||
@ -1245,7 +1236,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
|
||||
unsigned sectors;
|
||||
|
||||
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
|
||||
(sectors = bkey_extent_is_compressed(k))) {
|
||||
(sectors = bch2_extent_is_compressed(k))) {
|
||||
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
|
||||
|
||||
if (s->trans->flags & BTREE_INSERT_NOFAIL)
|
||||
@ -1277,6 +1268,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
|
||||
struct btree_iter *iter = s->insert->iter;
|
||||
struct btree *b = iter->nodes[0];
|
||||
struct btree_node_iter *node_iter = &iter->node_iters[0];
|
||||
enum btree_insert_ret ret;
|
||||
|
||||
switch (overlap) {
|
||||
case BCH_EXTENT_OVERLAP_FRONT:
|
||||
@ -1322,9 +1314,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
|
||||
k.k->p = orig_pos;
|
||||
extent_save(b, node_iter, _k, k.k);
|
||||
|
||||
if (extent_insert_advance_pos(s, k.s_c) ==
|
||||
BTREE_HOOK_RESTART_TRANS)
|
||||
return BTREE_INSERT_NEED_TRAVERSE;
|
||||
ret = extent_insert_advance_pos(s, k.s_c);
|
||||
if (ret != BTREE_INSERT_OK)
|
||||
return ret;
|
||||
|
||||
extent_insert_committed(s);
|
||||
/*
|
||||
@ -1420,15 +1412,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
|
||||
if (ret != BTREE_INSERT_OK)
|
||||
goto stop;
|
||||
|
||||
switch (extent_insert_advance_pos(s, k.s_c)) {
|
||||
case BTREE_HOOK_DO_INSERT:
|
||||
break;
|
||||
case BTREE_HOOK_NO_INSERT:
|
||||
continue;
|
||||
case BTREE_HOOK_RESTART_TRANS:
|
||||
ret = BTREE_INSERT_NEED_TRAVERSE;
|
||||
ret = extent_insert_advance_pos(s, k.s_c);
|
||||
if (ret)
|
||||
goto stop;
|
||||
}
|
||||
|
||||
s->do_journal = true;
|
||||
|
||||
@ -1469,10 +1455,9 @@ next:
|
||||
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
|
||||
}
|
||||
|
||||
if (bkey_cmp(s->committed, insert->k.p) < 0 &&
|
||||
ret == BTREE_INSERT_OK &&
|
||||
extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
|
||||
ret = BTREE_INSERT_NEED_TRAVERSE;
|
||||
if (ret == BTREE_INSERT_OK &&
|
||||
bkey_cmp(s->committed, insert->k.p) < 0)
|
||||
ret = extent_insert_advance_pos(s, bkey_s_c_null);
|
||||
stop:
|
||||
extent_insert_committed(s);
|
||||
|
||||
@ -1594,18 +1579,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
|
||||
|
||||
/*
|
||||
* Only call advance pos & call hook for nonzero size extents:
|
||||
* If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
|
||||
* overlaps with @k:
|
||||
*/
|
||||
switch (extent_insert_advance_pos(&s, k.s_c)) {
|
||||
case BTREE_HOOK_DO_INSERT:
|
||||
break;
|
||||
case BTREE_HOOK_NO_INSERT:
|
||||
continue;
|
||||
case BTREE_HOOK_RESTART_TRANS:
|
||||
ret = BTREE_INSERT_NEED_TRAVERSE;
|
||||
ret = extent_insert_advance_pos(&s, k.s_c);
|
||||
if (ret != BTREE_INSERT_OK)
|
||||
goto stop;
|
||||
}
|
||||
|
||||
if (k.k->size &&
|
||||
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
|
||||
@ -1623,10 +1600,9 @@ squash:
|
||||
goto stop;
|
||||
}
|
||||
|
||||
if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
|
||||
ret == BTREE_INSERT_OK &&
|
||||
extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
|
||||
ret = BTREE_INSERT_NEED_TRAVERSE;
|
||||
if (ret == BTREE_INSERT_OK &&
|
||||
bkey_cmp(s.committed, insert->k->k.p) < 0)
|
||||
ret = extent_insert_advance_pos(&s, bkey_s_c_null);
|
||||
stop:
|
||||
extent_insert_committed(&s);
|
||||
/*
|
||||
@ -1669,29 +1645,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c,
|
||||
case BCH_EXTENT_CACHED: {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const union bch_extent_entry *entry;
|
||||
const union bch_extent_crc *crc;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
unsigned size_ondisk = e.k->size;
|
||||
const char *reason;
|
||||
unsigned nonce = UINT_MAX;
|
||||
|
||||
extent_for_each_entry(e, entry) {
|
||||
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
|
||||
return "invalid extent entry type";
|
||||
|
||||
if (extent_entry_is_crc(entry)) {
|
||||
crc = entry_to_crc(entry);
|
||||
crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
|
||||
|
||||
if (crc_offset(crc) + e.k->size >
|
||||
crc_uncompressed_size(e.k, crc))
|
||||
if (crc.offset + e.k->size >
|
||||
crc.uncompressed_size)
|
||||
return "checksum offset + key size > uncompressed size";
|
||||
|
||||
size_ondisk = crc_compressed_size(e.k, crc);
|
||||
size_ondisk = crc.compressed_size;
|
||||
|
||||
if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
|
||||
if (!bch2_checksum_type_valid(c, crc.csum_type))
|
||||
return "invalid checksum type";
|
||||
|
||||
if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
|
||||
if (crc.compression_type >= BCH_COMPRESSION_NR)
|
||||
return "invalid compression type";
|
||||
|
||||
if (bch2_csum_type_is_encryption(crc.csum_type)) {
|
||||
if (nonce == UINT_MAX)
|
||||
nonce = crc.offset + crc.nonce;
|
||||
else if (nonce != crc.offset + crc.nonce)
|
||||
return "incorrect nonce";
|
||||
}
|
||||
} else {
|
||||
ptr = entry_to_ptr(entry);
|
||||
|
||||
@ -1864,102 +1848,75 @@ static unsigned PTR_TIER(struct bch_fs *c,
|
||||
}
|
||||
|
||||
static void bch2_extent_crc_init(union bch_extent_crc *crc,
|
||||
unsigned compressed_size,
|
||||
unsigned uncompressed_size,
|
||||
unsigned compression_type,
|
||||
unsigned nonce,
|
||||
struct bch_csum csum, unsigned csum_type)
|
||||
struct bch_extent_crc_unpacked new)
|
||||
{
|
||||
if (bch_crc_bytes[csum_type] <= 4 &&
|
||||
uncompressed_size <= CRC32_SIZE_MAX &&
|
||||
nonce <= CRC32_NONCE_MAX) {
|
||||
#define common_fields(_crc) \
|
||||
.csum_type = _crc.csum_type, \
|
||||
.compression_type = _crc.compression_type, \
|
||||
._compressed_size = _crc.compressed_size - 1, \
|
||||
._uncompressed_size = _crc.uncompressed_size - 1, \
|
||||
.offset = _crc.offset
|
||||
|
||||
if (bch_crc_bytes[new.csum_type] <= 4 &&
|
||||
new.uncompressed_size <= CRC32_SIZE_MAX &&
|
||||
new.nonce <= CRC32_NONCE_MAX) {
|
||||
crc->crc32 = (struct bch_extent_crc32) {
|
||||
.type = 1 << BCH_EXTENT_ENTRY_crc32,
|
||||
._compressed_size = compressed_size - 1,
|
||||
._uncompressed_size = uncompressed_size - 1,
|
||||
.offset = 0,
|
||||
.compression_type = compression_type,
|
||||
.csum_type = csum_type,
|
||||
.csum = *((__le32 *) &csum.lo),
|
||||
common_fields(new),
|
||||
.csum = *((__le32 *) &new.csum.lo),
|
||||
};
|
||||
return;
|
||||
}
|
||||
|
||||
if (bch_crc_bytes[csum_type] <= 10 &&
|
||||
uncompressed_size <= CRC64_SIZE_MAX &&
|
||||
nonce <= CRC64_NONCE_MAX) {
|
||||
if (bch_crc_bytes[new.csum_type] <= 10 &&
|
||||
new.uncompressed_size <= CRC64_SIZE_MAX &&
|
||||
new.nonce <= CRC64_NONCE_MAX) {
|
||||
crc->crc64 = (struct bch_extent_crc64) {
|
||||
.type = 1 << BCH_EXTENT_ENTRY_crc64,
|
||||
._compressed_size = compressed_size - 1,
|
||||
._uncompressed_size = uncompressed_size - 1,
|
||||
.offset = 0,
|
||||
.nonce = nonce,
|
||||
.compression_type = compression_type,
|
||||
.csum_type = csum_type,
|
||||
.csum_lo = csum.lo,
|
||||
.csum_hi = *((__le16 *) &csum.hi),
|
||||
common_fields(new),
|
||||
.nonce = new.nonce,
|
||||
.csum_lo = new.csum.lo,
|
||||
.csum_hi = *((__le16 *) &new.csum.hi),
|
||||
};
|
||||
return;
|
||||
}
|
||||
|
||||
if (bch_crc_bytes[csum_type] <= 16 &&
|
||||
uncompressed_size <= CRC128_SIZE_MAX &&
|
||||
nonce <= CRC128_NONCE_MAX) {
|
||||
if (bch_crc_bytes[new.csum_type] <= 16 &&
|
||||
new.uncompressed_size <= CRC128_SIZE_MAX &&
|
||||
new.nonce <= CRC128_NONCE_MAX) {
|
||||
crc->crc128 = (struct bch_extent_crc128) {
|
||||
.type = 1 << BCH_EXTENT_ENTRY_crc128,
|
||||
._compressed_size = compressed_size - 1,
|
||||
._uncompressed_size = uncompressed_size - 1,
|
||||
.offset = 0,
|
||||
.nonce = nonce,
|
||||
.compression_type = compression_type,
|
||||
.csum_type = csum_type,
|
||||
.csum = csum,
|
||||
common_fields(new),
|
||||
.nonce = new.nonce,
|
||||
.csum = new.csum,
|
||||
};
|
||||
return;
|
||||
}
|
||||
|
||||
#undef common_fields
|
||||
BUG();
|
||||
}
|
||||
|
||||
void bch2_extent_crc_append(struct bkey_i_extent *e,
|
||||
unsigned compressed_size,
|
||||
unsigned uncompressed_size,
|
||||
unsigned compression_type,
|
||||
unsigned nonce,
|
||||
struct bch_csum csum, unsigned csum_type)
|
||||
struct bch_extent_crc_unpacked new)
|
||||
{
|
||||
union bch_extent_crc *crc;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
const union bch_extent_entry *i;
|
||||
|
||||
BUG_ON(compressed_size > uncompressed_size);
|
||||
BUG_ON(uncompressed_size != e->k.size);
|
||||
BUG_ON(!compressed_size || !uncompressed_size);
|
||||
BUG_ON(new.compressed_size > new.uncompressed_size);
|
||||
BUG_ON(new.live_size != e->k.size);
|
||||
BUG_ON(!new.compressed_size || !new.uncompressed_size);
|
||||
|
||||
/*
|
||||
* Look up the last crc entry, so we can check if we need to add
|
||||
* another:
|
||||
*/
|
||||
extent_for_each_crc(extent_i_to_s(e), crc)
|
||||
extent_for_each_crc(extent_i_to_s(e), crc, i)
|
||||
;
|
||||
|
||||
if (!crc && !csum_type && !compression_type)
|
||||
if (!memcmp(&crc, &new, sizeof(crc)))
|
||||
return;
|
||||
|
||||
if (crc &&
|
||||
crc_compressed_size(&e->k, crc) == compressed_size &&
|
||||
crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
|
||||
crc_offset(crc) == 0 &&
|
||||
crc_nonce(crc) == nonce &&
|
||||
crc_csum_type(crc) == csum_type &&
|
||||
crc_compression_type(crc) == compression_type &&
|
||||
crc_csum(crc).lo == csum.lo &&
|
||||
crc_csum(crc).hi == csum.hi)
|
||||
return;
|
||||
|
||||
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
|
||||
compressed_size,
|
||||
uncompressed_size,
|
||||
compression_type,
|
||||
nonce, csum, csum_type);
|
||||
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
|
||||
__extent_entry_push(e);
|
||||
}
|
||||
|
||||
@ -2011,16 +1968,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
|
||||
}
|
||||
|
||||
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
|
||||
struct bkey_s_extent e,
|
||||
unsigned nr_cached)
|
||||
struct bkey_s_extent e)
|
||||
{
|
||||
struct bch_extent_ptr *ptr;
|
||||
unsigned tier = 0, nr_cached = 0, nr_good = 0;
|
||||
bool have_higher_tier;
|
||||
unsigned tier = 0;
|
||||
|
||||
if (!nr_cached)
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached &&
|
||||
c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
|
||||
nr_good++;
|
||||
|
||||
if (nr_good <= c->opts.data_replicas)
|
||||
return;
|
||||
|
||||
nr_cached = nr_good - c->opts.data_replicas;
|
||||
|
||||
do {
|
||||
have_higher_tier = false;
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "bkey.h"
|
||||
#include "io_types.h"
|
||||
#include "extents_types.h"
|
||||
|
||||
struct bch_fs;
|
||||
struct journal_res;
|
||||
@ -38,11 +38,17 @@ bch2_insert_fixup_extent(struct btree_insert *,
|
||||
struct btree_insert_entry *);
|
||||
|
||||
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
|
||||
void bch2_extent_mark_replicas_cached(struct bch_fs *,
|
||||
struct bkey_s_extent, unsigned);
|
||||
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
|
||||
|
||||
const struct bch_extent_ptr *
|
||||
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
|
||||
|
||||
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
|
||||
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
|
||||
unsigned bch2_extent_is_compressed(struct bkey_s_c);
|
||||
|
||||
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
|
||||
struct bch_extent_ptr, u64);
|
||||
|
||||
static inline bool bkey_extent_is_data(const struct bkey *k)
|
||||
{
|
||||
@ -67,6 +73,12 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
|
||||
{
|
||||
return bkey_extent_is_allocation(k.k) &&
|
||||
!bch2_extent_is_compressed(k);
|
||||
}
|
||||
|
||||
static inline bool bkey_extent_is_cached(const struct bkey *k)
|
||||
{
|
||||
return k->type == BCH_EXTENT_CACHED;
|
||||
@ -170,6 +182,8 @@ union bch_extent_crc {
|
||||
(struct bch_extent_ptr *) (_entry)); \
|
||||
})
|
||||
|
||||
/* checksum entries: */
|
||||
|
||||
enum bch_extent_crc_type {
|
||||
BCH_EXTENT_CRC_NONE,
|
||||
BCH_EXTENT_CRC32,
|
||||
@ -208,6 +222,50 @@ __extent_crc_type(const union bch_extent_crc *crc)
|
||||
: __extent_crc_type((union bch_extent_crc *) _crc); \
|
||||
})
|
||||
|
||||
static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
#define common_fields(_crc)					\
		.csum_type = _crc.csum_type,			\
		.compression_type = _crc.compression_type,	\
		.compressed_size = _crc._compressed_size + 1,	\
		.uncompressed_size = _crc._uncompressed_size + 1, \
		.offset = _crc.offset,				\
		.live_size = k->size

	switch (extent_crc_type(crc)) {
	case BCH_EXTENT_CRC_NONE:
		return (struct bch_extent_crc_unpacked) {
			.compressed_size = k->size,
			.uncompressed_size = k->size,
			.live_size = k->size,
		};
	case BCH_EXTENT_CRC32:
		return (struct bch_extent_crc_unpacked) {
			common_fields(crc->crc32),
			.csum.lo = crc->crc32.csum,
		};
	case BCH_EXTENT_CRC64:
		return (struct bch_extent_crc_unpacked) {
			common_fields(crc->crc64),
			.nonce = crc->crc64.nonce,
			.csum.lo = crc->crc64.csum_lo,
			.csum.hi = crc->crc64.csum_hi,
		};
	case BCH_EXTENT_CRC128:
		return (struct bch_extent_crc_unpacked) {
			common_fields(crc->crc128),
			.nonce = crc->crc128.nonce,
			.csum = crc->crc128.csum,
		};
	default:
		BUG();
	}
#undef common_fields
}

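/*
 * Illustrative sketch only, not part of this commit: callers now work with the
 * unpacked form instead of switching on crc32/crc64/crc128 themselves; the
 * helper name below is hypothetical:
 */
static inline unsigned example_sectors_to_read(struct bch_extent_crc_unpacked crc)
{
	/* compressed data must be read in full before it can be decoded: */
	return crc.compression_type ? crc.compressed_size : crc.live_size;
}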
/* Extent entry iteration: */
|
||||
|
||||
#define extent_entry_next(_entry) \
|
||||
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
|
||||
|
||||
@ -226,7 +284,7 @@ __extent_crc_type(const union bch_extent_crc *crc)
|
||||
|
||||
/* Iterate over crcs only: */
|
||||
|
||||
#define extent_crc_next(_e, _p) \
|
||||
#define __extent_crc_next(_e, _p) \
|
||||
({ \
|
||||
typeof(&(_e).v->start[0]) _entry = _p; \
|
||||
\
|
||||
@ -237,25 +295,41 @@ __extent_crc_type(const union bch_extent_crc *crc)
|
||||
entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
|
||||
})
|
||||
|
||||
#define extent_for_each_crc(_e, _crc) \
|
||||
for ((_crc) = extent_crc_next(_e, (_e).v->start); \
|
||||
#define __extent_for_each_crc(_e, _crc) \
|
||||
for ((_crc) = __extent_crc_next(_e, (_e).v->start); \
|
||||
(_crc); \
|
||||
(_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
|
||||
(_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
|
||||
|
||||
#define extent_crc_next(_e, _crc, _iter) \
|
||||
({ \
|
||||
extent_for_each_entry_from(_e, _iter, _iter) \
|
||||
if (extent_entry_is_crc(_iter)) { \
|
||||
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
|
||||
break; \
|
||||
} \
|
||||
\
|
||||
(_iter) < extent_entry_last(_e); \
|
||||
})
|
||||
|
||||
#define extent_for_each_crc(_e, _crc, _iter) \
|
||||
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
|
||||
(_iter) = (_e).v->start; \
|
||||
extent_crc_next(_e, _crc, _iter); \
|
||||
(_iter) = extent_entry_next(_iter))
|
||||
|
||||
/* Iterate over pointers, with crcs: */
|
||||
|
||||
#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
|
||||
#define extent_ptr_crc_next(_e, _ptr, _crc) \
|
||||
({ \
|
||||
__label__ out; \
|
||||
typeof(&(_e).v->start[0]) _entry; \
|
||||
\
|
||||
extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
|
||||
if (extent_entry_is_crc(_entry)) { \
|
||||
(_crc) = entry_to_crc(_entry); \
|
||||
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
|
||||
} else { \
|
||||
_ptr = entry_to_ptr(_entry); \
|
||||
if (_filter) \
|
||||
goto out; \
|
||||
goto out; \
|
||||
} \
|
||||
\
|
||||
_ptr = NULL; \
|
||||
@ -263,34 +337,25 @@ out: \
|
||||
_ptr; \
|
||||
})
|
||||
|
||||
#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
|
||||
for ((_crc) = NULL, \
|
||||
(_ptr) = &(_e).v->start->ptr; \
|
||||
((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
|
||||
(_ptr)++)
|
||||
|
||||
#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
|
||||
extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
|
||||
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
|
||||
(_ptr) = &(_e).v->start->ptr; \
|
||||
((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \
|
||||
(_ptr)++)
|
||||
|
||||
/* Iterate over pointers only, and from a given position: */
|
||||
|
||||
#define extent_ptr_next_filter(_e, _ptr, _filter) \
|
||||
#define extent_ptr_next(_e, _ptr) \
|
||||
({ \
|
||||
typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
|
||||
struct bch_extent_crc_unpacked _crc; \
|
||||
\
|
||||
extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
|
||||
extent_ptr_crc_next(_e, _ptr, _crc); \
|
||||
})
|
||||
|
||||
#define extent_ptr_next(_e, _ptr) \
|
||||
extent_ptr_next_filter(_e, _ptr, true)
|
||||
|
||||
#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
|
||||
for ((_ptr) = &(_e).v->start->ptr; \
|
||||
((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
|
||||
(_ptr)++)
|
||||
|
||||
#define extent_for_each_ptr(_e, _ptr) \
|
||||
extent_for_each_ptr_filter(_e, _ptr, true)
|
||||
for ((_ptr) = &(_e).v->start->ptr; \
|
||||
((_ptr) = extent_ptr_next(_e, _ptr)); \
|
||||
(_ptr)++)
|
||||
|
||||
#define extent_ptr_prev(_e, _ptr) \
|
||||
({ \
|
||||
@ -315,8 +380,8 @@ out: \
|
||||
(_ptr); \
|
||||
(_ptr) = extent_ptr_prev(_e, _ptr))
|
||||
|
||||
void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
|
||||
unsigned, unsigned, struct bch_csum, unsigned);
|
||||
void bch2_extent_crc_append(struct bkey_i_extent *,
|
||||
struct bch_extent_crc_unpacked);
|
||||
|
||||
static inline void __extent_entry_push(struct bkey_i_extent *e)
|
||||
{
|
||||
@ -336,226 +401,26 @@ static inline void extent_ptr_append(struct bkey_i_extent *e,
|
||||
__extent_entry_push(e);
|
||||
}
|
||||
|
||||
static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
|
||||
const union bch_extent_crc *crc)
|
||||
static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
|
||||
{
|
||||
EBUG_ON(!k->size);
|
||||
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
return (struct bch_extent_crc128) {
|
||||
._compressed_size = k->size - 1,
|
||||
._uncompressed_size = k->size - 1,
|
||||
};
|
||||
case BCH_EXTENT_CRC32:
|
||||
return (struct bch_extent_crc128) {
|
||||
.type = 1 << BCH_EXTENT_ENTRY_crc128,
|
||||
._compressed_size = crc->crc32._compressed_size,
|
||||
._uncompressed_size = crc->crc32._uncompressed_size,
|
||||
.offset = crc->crc32.offset,
|
||||
.csum_type = crc->crc32.csum_type,
|
||||
.compression_type = crc->crc32.compression_type,
|
||||
.csum.lo = crc->crc32.csum,
|
||||
};
|
||||
case BCH_EXTENT_CRC64:
|
||||
return (struct bch_extent_crc128) {
|
||||
.type = 1 << BCH_EXTENT_ENTRY_crc128,
|
||||
._compressed_size = crc->crc64._compressed_size,
|
||||
._uncompressed_size = crc->crc64._uncompressed_size,
|
||||
.offset = crc->crc64.offset,
|
||||
.nonce = crc->crc64.nonce,
|
||||
.csum_type = crc->crc64.csum_type,
|
||||
.compression_type = crc->crc64.compression_type,
|
||||
.csum.lo = crc->crc64.csum_lo,
|
||||
.csum.hi = crc->crc64.csum_hi,
|
||||
};
|
||||
case BCH_EXTENT_CRC128:
|
||||
return crc->crc128;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
#define crc_compressed_size(_k, _crc) \
|
||||
({ \
|
||||
unsigned _size = 0; \
|
||||
\
|
||||
switch (extent_crc_type(_crc)) { \
|
||||
case BCH_EXTENT_CRC_NONE: \
|
||||
_size = ((const struct bkey *) (_k))->size; \
|
||||
break; \
|
||||
case BCH_EXTENT_CRC32: \
|
||||
_size = ((struct bch_extent_crc32 *) _crc) \
|
||||
->_compressed_size + 1; \
|
||||
break; \
|
||||
case BCH_EXTENT_CRC64: \
|
||||
_size = ((struct bch_extent_crc64 *) _crc) \
|
||||
->_compressed_size + 1; \
|
||||
break; \
|
||||
case BCH_EXTENT_CRC128: \
|
||||
_size = ((struct bch_extent_crc128 *) _crc) \
|
||||
->_compressed_size + 1; \
|
||||
break; \
|
||||
} \
|
||||
_size; \
|
||||
})
|
||||
|
||||
#define crc_uncompressed_size(_k, _crc) \
|
||||
({ \
|
||||
unsigned _size = 0; \
|
||||
\
|
||||
switch (extent_crc_type(_crc)) { \
|
||||
case BCH_EXTENT_CRC_NONE: \
|
||||
_size = ((const struct bkey *) (_k))->size; \
|
||||
break; \
|
||||
case BCH_EXTENT_CRC32: \
|
||||
_size = ((struct bch_extent_crc32 *) _crc) \
|
||||
->_uncompressed_size + 1; \
|
||||
break; \
|
||||
case BCH_EXTENT_CRC64: \
|
||||
_size = ((struct bch_extent_crc64 *) _crc) \
|
||||
->_uncompressed_size + 1; \
|
||||
break; \
|
||||
case BCH_EXTENT_CRC128: \
|
||||
_size = ((struct bch_extent_crc128 *) _crc) \
|
||||
->_uncompressed_size + 1; \
|
||||
break; \
|
||||
} \
|
||||
_size; \
|
||||
})
|
||||
|
||||
static inline unsigned crc_offset(const union bch_extent_crc *crc)
|
||||
{
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
return 0;
|
||||
case BCH_EXTENT_CRC32:
|
||||
return crc->crc32.offset;
|
||||
case BCH_EXTENT_CRC64:
|
||||
return crc->crc64.offset;
|
||||
case BCH_EXTENT_CRC128:
|
||||
return crc->crc128.offset;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline unsigned crc_nonce(const union bch_extent_crc *crc)
|
||||
{
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
case BCH_EXTENT_CRC32:
|
||||
return 0;
|
||||
case BCH_EXTENT_CRC64:
|
||||
return crc->crc64.nonce;
|
||||
case BCH_EXTENT_CRC128:
|
||||
return crc->crc128.nonce;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
|
||||
{
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
return 0;
|
||||
case BCH_EXTENT_CRC32:
|
||||
return crc->crc32.csum_type;
|
||||
case BCH_EXTENT_CRC64:
|
||||
return crc->crc64.csum_type;
|
||||
case BCH_EXTENT_CRC128:
|
||||
return crc->crc128.csum_type;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
|
||||
{
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
return 0;
|
||||
case BCH_EXTENT_CRC32:
|
||||
return crc->crc32.compression_type;
|
||||
case BCH_EXTENT_CRC64:
|
||||
return crc->crc64.compression_type;
|
||||
case BCH_EXTENT_CRC128:
|
||||
return crc->crc128.compression_type;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
|
||||
{
|
||||
switch (extent_crc_type(crc)) {
|
||||
case BCH_EXTENT_CRC_NONE:
|
||||
return (struct bch_csum) { 0 };
|
||||
case BCH_EXTENT_CRC32:
|
||||
return (struct bch_csum) { .lo = crc->crc32.csum };
|
||||
case BCH_EXTENT_CRC64:
|
||||
return (struct bch_csum) {
|
||||
.lo = crc->crc64.csum_lo,
|
||||
.hi = crc->crc64.csum_hi,
|
||||
};
|
||||
case BCH_EXTENT_CRC128:
|
||||
return crc->crc128.csum;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
|
||||
{
|
||||
struct bkey_s_c_extent e;
|
||||
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
|
||||
const struct bch_extent_ptr *ptr;
|
||||
const union bch_extent_crc *crc;
|
||||
unsigned ret = 0;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED:
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (!ptr->cached &&
|
||||
crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
|
||||
crc_compressed_size(e.k, crc) < k.k->size)
|
||||
ret = max_t(unsigned, ret,
|
||||
crc_compressed_size(e.k, crc));
|
||||
}
|
||||
extent_for_each_ptr(e, ptr)
|
||||
ret.devs[ret.nr++] = ptr->dev;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
|
||||
{
|
||||
const union bch_extent_crc *crc;
|
||||
|
||||
extent_for_each_crc(e, crc)
|
||||
if (bch2_csum_type_is_encryption(crc_csum_type(crc)))
|
||||
return crc_offset(crc) + crc_nonce(crc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_extent_narrow_crcs(struct bkey_s_extent);
|
||||
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
|
||||
struct bch_extent_crc_unpacked);
|
||||
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
|
||||
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
|
||||
|
||||
void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
|
||||
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
|
||||
void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
|
||||
|
||||
const struct bch_extent_ptr *
|
||||
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
|
||||
struct bch_extent_ptr *
|
||||
bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent,
|
||||
struct bch_extent_ptr);
|
||||
struct bch_extent_ptr *
|
||||
bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent,
|
||||
struct bkey_s_c_extent);
|
||||
|
||||
bool bch2_cut_front(struct bpos, struct bkey_i *);
|
||||
bool bch2_cut_back(struct bpos, struct bkey *);
|
||||
void bch2_key_resize(struct bkey *, unsigned);
|
||||
|
libbcachefs/extents_types.h (new file, 27 lines)
@@ -0,0 +1,27 @@
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H

#include "bcachefs_format.h"

struct bch_extent_crc_unpacked {
	u8 csum_type;
	u8 compression_type;

	u16 compressed_size;
	u16 uncompressed_size;

	u16 offset;
	u16 live_size;

	u16 nonce;

	struct bch_csum csum;
};

struct extent_pick_ptr {
	struct bch_extent_ptr ptr;
	struct bch_extent_crc_unpacked crc;
	struct bch_dev *ca;
};

#endif /* _BCACHEFS_EXTENTS_TYPES_H */
@ -80,7 +80,7 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
|
||||
EBUG_ON(i >= size);
|
||||
|
||||
if (eytzinger1_left_child(i) < size) {
|
||||
i = eytzinger1_left_child(i);
|
||||
i = eytzinger1_left_child(i) + 1;
|
||||
|
||||
i <<= __fls(size) - __fls(i);
|
||||
i -= 1;
|
||||
@ -163,38 +163,6 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
|
||||
(_i) != 0; \
|
||||
(_i) = eytzinger1_next((_i), (_size)))
|
||||
|
||||
#if 0
|
||||
void eytzinger0_test(void)
|
||||
{
|
||||
unsigned i, j, size;
|
||||
|
||||
for (size = 2;
|
||||
size < 65536000;
|
||||
size++) {
|
||||
if (!(size % 4096))
|
||||
printk(KERN_INFO "tree size %u\n", size);
|
||||
|
||||
assert(eytzinger1_prev(0, size) == eytzinger1_last(size));
|
||||
assert(eytzinger1_next(0, size) == eytzinger1_first(size));
|
||||
|
||||
assert(eytzinger1_prev(eytzinger1_first(size), size) == 0);
|
||||
assert(eytzinger1_next(eytzinger1_last(size), size) == 0);
|
||||
|
||||
eytzinger1_for_each(j, size) {
|
||||
assert(from_inorder(i, size) == j);
|
||||
assert(to_inorder(j, size) == i);
|
||||
|
||||
if (j != eytzinger1_last(size)) {
|
||||
unsigned next = eytzinger1_next(j, size);
|
||||
|
||||
assert(eytzinger1_prev(next, size) == j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Zero based indexing version: */
|
||||
|
||||
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
|
||||
@ -214,27 +182,29 @@ static inline unsigned eytzinger0_right_child(unsigned i)
|
||||
return eytzinger0_child(i, 1);
|
||||
}
|
||||
|
||||
#if 0
|
||||
static inline unsigned eytzinger0_first(unsigned size)
|
||||
{
|
||||
return eytzinger1_first(size + 1) - 1;
|
||||
}
|
||||
|
||||
static inline unsigned eytzinger0_last(unsigned size)
|
||||
{
|
||||
return eytzinger1_last(size + 1) - 1;
|
||||
}
|
||||
|
||||
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
|
||||
{
|
||||
return eytzinger1_next(i + 1, size + 1) - 1;
|
||||
}
|
||||
|
||||
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
|
||||
{
|
||||
return eytzinger1_prev(i + 1, size + 1) - 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline unsigned eytzinger0_extra(unsigned size)
|
||||
{
|
||||
return (size + 1 - rounddown_pow_of_two(size)) << 1;
|
||||
return eytzinger1_extra(size + 1);
|
||||
}
|
||||
|
||||
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
|
||||
@ -259,10 +229,41 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
|
||||
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
|
||||
}
|
||||
|
||||
#define eytzinger0_for_each(_i, _size)			\
	for ((_i) = eytzinger0_first((_size));		\
	     (_i) != -1;				\
	     (_i) = eytzinger0_next((_i), (_size)))

typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);

/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
					 eytzinger_cmp_fn cmp, const void *search)
{
	unsigned i, n = 0;

	if (!nr)
		return -1;

	do {
		i = n;
		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
	} while (n < nr);

	if (n & 1) {
		/* @i was greater than @search, return previous node: */

		if (i == eytzinger0_first(nr))
			return -1;

		return eytzinger0_prev(i, nr);
	} else {
		return i;
	}
}

static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
				     eytzinger_cmp_fn cmp, void *search)
				     eytzinger_cmp_fn cmp, const void *search)
{
	size_t i = 0;
	int res;
@@ -271,17 +272,6 @@ static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
	       (res = cmp(search, base + i * size, size)))
		i = eytzinger0_child(i, res > 0);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
		bool found1 = i < nr, found2 = false;
		size_t j;

		for (j = 0; j < nr; j++)
			if (!cmp(base + j * size, search, size))
				found2 = true;

		BUG_ON(found1 != found2);
	}

	return i;
}

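/*
 * Illustrative sketch only, not part of this commit: using eytzinger0_find_le()
 * over an array of u64s already stored in eytzinger order; both helpers below
 * are hypothetical:
 */
static inline int example_cmp_u64(const void *_l, const void *_r, size_t size)
{
	const u64 *l = _l, *r = _r;

	return *l < *r ? -1 : *l > *r ? 1 : 0;
}

static inline ssize_t example_find_le(u64 *tree, size_t nr, u64 search)
{
	return eytzinger0_find_le(tree, nr, sizeof(u64),
				  example_cmp_u64, &search);
}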
@@ -26,9 +26,67 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>

struct bio_set *bch2_writepage_bioset;
struct bio_set *bch2_dio_read_bioset;
struct bio_set *bch2_dio_write_bioset;
struct i_sectors_hook {
	struct extent_insert_hook hook;
	s64 sectors;
	struct bch_inode_info *inode;
};

struct bchfs_write_op {
	struct bch_inode_info *inode;
	s64 sectors_added;
	bool is_dio;
	bool unalloc;
	u64 new_i_size;

	/* must be last: */
	struct bch_write_op op;
};

static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
					struct bch_inode_info *inode,
					bool is_dio)
{
	op->inode = inode;
	op->sectors_added = 0;
	op->is_dio = is_dio;
	op->unalloc = false;
	op->new_i_size = U64_MAX;
}

|
||||
struct bch_writepage_io {
|
||||
struct closure cl;
|
||||
|
||||
/* must be last: */
|
||||
struct bchfs_write_op op;
|
||||
};
|
||||
|
||||
struct dio_write {
|
||||
struct closure cl;
|
||||
struct kiocb *req;
|
||||
struct bch_fs *c;
|
||||
long written;
|
||||
long error;
|
||||
loff_t offset;
|
||||
|
||||
struct disk_reservation res;
|
||||
|
||||
struct iovec *iovec;
|
||||
struct iovec inline_vecs[UIO_FASTIOV];
|
||||
struct iov_iter iter;
|
||||
|
||||
struct task_struct *task;
|
||||
|
||||
/* must be last: */
|
||||
struct bchfs_write_op iop;
|
||||
};
|
||||
|
||||
struct dio_read {
|
||||
struct closure cl;
|
||||
struct kiocb *req;
|
||||
long ret;
|
||||
struct bch_read_bio rbio;
|
||||
};
|
||||
|
||||
/* pagecache_block must be held */
|
||||
static int write_invalidate_inode_pages_range(struct address_space *mapping,
|
||||
@ -101,7 +159,7 @@ static inline void i_size_dirty_get(struct bch_inode_info *inode)
|
||||
|
||||
/* i_sectors accounting: */
|
||||
|
||||
static enum extent_insert_hook_ret
|
||||
static enum btree_insert_ret
|
||||
i_sectors_hook_fn(struct extent_insert_hook *hook,
|
||||
struct bpos committed_pos,
|
||||
struct bpos next_pos,
|
||||
@ -119,7 +177,7 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
|
||||
|
||||
h->sectors += sectors * sign;
|
||||
|
||||
return BTREE_HOOK_DO_INSERT;
|
||||
return BTREE_INSERT_OK;
|
||||
}
|
||||
|
||||
static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
|
||||
@ -208,7 +266,7 @@ struct bchfs_extent_trans_hook {
|
||||
bool need_inode_update;
|
||||
};
|
||||
|
||||
static enum extent_insert_hook_ret
|
||||
static enum btree_insert_ret
|
||||
bchfs_extent_update_hook(struct extent_insert_hook *hook,
|
||||
struct bpos committed_pos,
|
||||
struct bpos next_pos,
|
||||
@ -224,6 +282,10 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
|
||||
u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
|
||||
bool do_pack = false;
|
||||
|
||||
if (h->op->unalloc &&
|
||||
!bch2_extent_is_fully_allocated(k))
|
||||
return BTREE_INSERT_ENOSPC;
|
||||
|
||||
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
|
||||
|
||||
/* XXX: inode->i_size locking */
|
||||
@ -232,7 +294,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
|
||||
|
||||
if (!h->need_inode_update) {
|
||||
h->need_inode_update = true;
|
||||
return BTREE_HOOK_RESTART_TRANS;
|
||||
return BTREE_INSERT_NEED_TRAVERSE;
|
||||
}
|
||||
|
||||
h->inode_u.bi_size = offset;
|
||||
@ -247,7 +309,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
|
||||
if (sectors) {
|
||||
if (!h->need_inode_update) {
|
||||
h->need_inode_update = true;
|
||||
return BTREE_HOOK_RESTART_TRANS;
|
||||
return BTREE_INSERT_NEED_TRAVERSE;
|
||||
}
|
||||
|
||||
h->inode_u.bi_sectors += sectors;
|
||||
@ -267,7 +329,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
|
||||
if (do_pack)
|
||||
bch2_inode_pack(&h->inode_p, &h->inode_u);
|
||||
|
||||
return BTREE_HOOK_DO_INSERT;
|
||||
return BTREE_INSERT_OK;
|
||||
}
|
||||
|
||||
static int bchfs_write_index_update(struct bch_write_op *wop)
|
||||
@ -352,12 +414,16 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
|
||||
BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
|
||||
BTREE_INSERT_ENTRY(&extent_iter, k));
|
||||
}
|
||||
|
||||
BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
|
||||
BUG_ON(!ret != !k->k.size);
|
||||
err:
|
||||
if (ret == -EINTR)
|
||||
continue;
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
|
||||
bch2_keylist_pop_front(keys);
|
||||
} while (!bch2_keylist_empty(keys));
|
||||
|
||||
@ -748,8 +814,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
|
||||
if (bkey_extent_is_allocation(k.k))
|
||||
bch2_add_page_sectors(bio, k);
|
||||
|
||||
if (!bkey_extent_is_allocation(k.k) ||
|
||||
bkey_extent_is_compressed(k))
|
||||
if (!bch2_extent_is_fully_allocated(k))
|
||||
bch2_mark_pages_unalloc(bio);
|
||||
|
||||
if (pick.ca) {
|
||||
@ -759,7 +824,8 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
|
||||
trace_read_split(&rbio->bio);
|
||||
}
|
||||
|
||||
bch2_read_extent(c, rbio, k, &pick, flags);
|
||||
bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
|
||||
&pick, flags);
|
||||
} else {
|
||||
zero_fill_bio(bio);
|
||||
|
||||
@ -963,22 +1029,20 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
|
||||
alloc_io:
|
||||
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
|
||||
BIO_MAX_PAGES,
|
||||
bch2_writepage_bioset),
|
||||
&c->writepage_bioset),
|
||||
struct bch_writepage_io, op.op.wbio.bio);
|
||||
|
||||
closure_init(&w->io->cl, NULL);
|
||||
w->io->op.inode = inode;
|
||||
w->io->op.sectors_added = 0;
|
||||
w->io->op.is_dio = false;
|
||||
bch2_fswrite_op_init(&w->io->op, inode, false);
|
||||
bch2_write_op_init(&w->io->op.op, c,
|
||||
(struct disk_reservation) {
|
||||
.nr_replicas = c->opts.data_replicas,
|
||||
},
|
||||
c->fastest_devs,
|
||||
inode->ei_last_dirtied,
|
||||
writepoint_hashed(inode->ei_last_dirtied),
|
||||
POS(inum, 0),
|
||||
&inode->ei_journal_seq,
|
||||
BCH_WRITE_THROTTLE);
|
||||
0);
|
||||
w->io->op.op.index_update_fn = bchfs_write_index_update;
|
||||
}
|
||||
|
||||
@ -1409,7 +1473,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
|
||||
|
||||
bio = bio_alloc_bioset(GFP_KERNEL,
|
||||
iov_iter_npages(iter, BIO_MAX_PAGES),
|
||||
bch2_dio_read_bioset);
|
||||
&c->dio_read_bioset);
|
||||
|
||||
bio->bi_end_io = bch2_direct_IO_read_endio;
|
||||
|
||||
@ -1541,20 +1605,19 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
|
||||
return;
|
||||
}
|
||||
|
||||
dio->iop.inode = inode;
|
||||
dio->iop.sectors_added = 0;
|
||||
dio->iop.is_dio = true;
|
||||
dio->iop.new_i_size = U64_MAX;
|
||||
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
|
||||
dio->c->fastest_devs,
|
||||
(unsigned long) dio->task,
|
||||
writepoint_hashed((unsigned long) dio->task),
|
||||
POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
|
||||
&inode->ei_journal_seq,
|
||||
flags|BCH_WRITE_THROTTLE);
|
||||
flags);
|
||||
dio->iop.op.index_update_fn = bchfs_write_index_update;
|
||||
|
||||
dio->res.sectors -= bio_sectors(bio);
|
||||
dio->iop.op.res.sectors = bio_sectors(bio);
|
||||
if (!dio->iop.unalloc) {
|
||||
dio->res.sectors -= bio_sectors(bio);
|
||||
dio->iop.op.res.sectors = bio_sectors(bio);
|
||||
}
|
||||
|
||||
task_io_account_write(bio->bi_iter.bi_size);
|
||||
|
||||
@ -1589,6 +1652,31 @@ static void bch2_dio_write_loop_async(struct closure *cl)
|
||||
}
|
||||
}
|
||||
|
||||
static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
				      u64 size)
{
	struct btree_iter iter;
	struct bpos end = pos;
	struct bkey_s_c k;
	int ret = 0;

	end.offset += size;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
			   BTREE_ITER_WITH_HOLES, k) {
		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
			break;

		if (!bch2_extent_is_fully_allocated(k)) {
			ret = -ENOSPC;
			break;
		}
	}
	bch2_btree_iter_unlock(&iter);

	return ret;
}

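/*
 * Illustrative sketch only, not part of this commit: pos and size are in
 * 512-byte sectors, so checking a byte range looks roughly like this
 * (hypothetical wrapper, mirroring the DIO write caller below):
 */
static inline int example_check_byte_range(struct bch_fs *c, u64 inum,
					   u64 offset, u64 len)
{
	return bch2_check_range_allocated(c, POS(inum, offset >> 9), len >> 9);
}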
static int bch2_direct_IO_write(struct bch_fs *c,
|
||||
struct kiocb *req, struct file *file,
|
||||
struct bch_inode_info *inode,
|
||||
@ -1610,17 +1698,18 @@ static int bch2_direct_IO_write(struct bch_fs *c,
|
||||
|
||||
bio = bio_alloc_bioset(GFP_KERNEL,
|
||||
iov_iter_npages(iter, BIO_MAX_PAGES),
|
||||
bch2_dio_write_bioset);
|
||||
&c->dio_write_bioset);
|
||||
dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
|
||||
dio->req = req;
|
||||
dio->c = c;
|
||||
dio->written = 0;
|
||||
dio->error = 0;
|
||||
dio->offset = offset;
|
||||
dio->iovec = NULL;
|
||||
dio->iter = *iter;
|
||||
dio->task = current;
|
||||
closure_init(&dio->cl, NULL);
|
||||
dio->req = req;
|
||||
dio->c = c;
|
||||
dio->written = 0;
|
||||
dio->error = 0;
|
||||
dio->offset = offset;
|
||||
dio->iovec = NULL;
|
||||
dio->iter = *iter;
|
||||
dio->task = current;
|
||||
bch2_fswrite_op_init(&dio->iop, inode, true);
|
||||
|
||||
if (offset + iter->count > inode->v.i_size)
|
||||
sync = true;
|
||||
@ -1635,9 +1724,15 @@ static int bch2_direct_IO_write(struct bch_fs *c,
|
||||
*/
|
||||
ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
|
||||
if (unlikely(ret)) {
|
||||
closure_debug_destroy(&dio->cl);
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
|
||||
offset >> 9),
|
||||
iter->count >> 9)) {
|
||||
closure_debug_destroy(&dio->cl);
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
dio->iop.unalloc = true;
|
||||
}
|
||||
|
||||
inode_dio_begin(&inode->v);
|
||||
@ -2318,7 +2413,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
|
||||
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
|
||||
|
||||
if (reservation.v.nr_replicas < replicas ||
|
||||
bkey_extent_is_compressed(k)) {
|
||||
bch2_extent_is_compressed(k)) {
|
||||
ret = bch2_disk_reservation_get(c, &disk_res,
|
||||
sectors, 0);
|
||||
if (ret)
|
||||
@ -2564,4 +2659,24 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
void bch2_fs_fsio_exit(struct bch_fs *c)
{
	bioset_exit(&c->dio_write_bioset);
	bioset_exit(&c->dio_read_bioset);
	bioset_exit(&c->writepage_bioset);
}

int bch2_fs_fsio_init(struct bch_fs *c)
{
	if (bioset_init(&c->writepage_bioset,
			4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
	    bioset_init(&c->dio_read_bioset,
			4, offsetof(struct dio_read, rbio.bio)) ||
	    bioset_init(&c->dio_write_bioset,
			4, offsetof(struct dio_write, iop.op.wbio.bio)))
		return -ENOMEM;

	return 0;
}

#endif /* NO_BCACHEFS_FS */

@ -1,7 +1,11 @@
|
||||
#ifndef _BCACHEFS_FS_IO_H
|
||||
#define _BCACHEFS_FS_IO_H
|
||||
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
#include "buckets.h"
|
||||
#include "io_types.h"
|
||||
|
||||
#include <linux/uio.h>
|
||||
|
||||
int bch2_set_page_dirty(struct page *);
|
||||
@ -35,60 +39,11 @@ int bch2_releasepage(struct page *, gfp_t);
|
||||
int bch2_migrate_page(struct address_space *, struct page *,
|
||||
struct page *, enum migrate_mode);
|
||||
|
||||
struct i_sectors_hook {
|
||||
struct extent_insert_hook hook;
|
||||
s64 sectors;
|
||||
struct bch_inode_info *inode;
|
||||
};
|
||||
|
||||
struct bchfs_write_op {
|
||||
struct bch_inode_info *inode;
|
||||
s64 sectors_added;
|
||||
bool is_dio;
|
||||
u64 new_i_size;
|
||||
|
||||
/* must be last: */
|
||||
struct bch_write_op op;
|
||||
};
|
||||
|
||||
struct bch_writepage_io {
|
||||
struct closure cl;
|
||||
|
||||
/* must be last: */
|
||||
struct bchfs_write_op op;
|
||||
};
|
||||
|
||||
extern struct bio_set *bch2_writepage_bioset;
|
||||
|
||||
struct dio_write {
|
||||
struct closure cl;
|
||||
struct kiocb *req;
|
||||
struct bch_fs *c;
|
||||
long written;
|
||||
long error;
|
||||
loff_t offset;
|
||||
|
||||
struct disk_reservation res;
|
||||
|
||||
struct iovec *iovec;
|
||||
struct iovec inline_vecs[UIO_FASTIOV];
|
||||
struct iov_iter iter;
|
||||
|
||||
struct task_struct *task;
|
||||
|
||||
/* must be last: */
|
||||
struct bchfs_write_op iop;
|
||||
};
|
||||
|
||||
extern struct bio_set *bch2_dio_write_bioset;
|
||||
|
||||
struct dio_read {
|
||||
struct closure cl;
|
||||
struct kiocb *req;
|
||||
long ret;
|
||||
struct bch_read_bio rbio;
|
||||
};
|
||||
|
||||
extern struct bio_set *bch2_dio_read_bioset;
|
||||
void bch2_fs_fsio_exit(struct bch_fs *);
|
||||
int bch2_fs_fsio_init(struct bch_fs *);
|
||||
#else
|
||||
static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_FS_IO_H */
|
||||
|
@ -654,17 +654,17 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
|
||||
if (bkey_extent_is_data(&k->k)) {
|
||||
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
const union bch_extent_crc *crc;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
int ret;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
int flags2 = 0;
|
||||
u64 offset = ptr->offset;
|
||||
|
||||
if (crc_compression_type(crc))
|
||||
if (crc.compression_type)
|
||||
flags2 |= FIEMAP_EXTENT_ENCODED;
|
||||
else
|
||||
offset += crc_offset(crc);
|
||||
offset += crc.offset;
|
||||
|
||||
if ((offset & (PAGE_SECTORS - 1)) ||
|
||||
(e.k->size & (PAGE_SECTORS - 1)))
|
||||
@ -1336,12 +1336,6 @@ MODULE_ALIAS_FS("bcachefs");
|
||||
void bch2_vfs_exit(void)
|
||||
{
|
||||
unregister_filesystem(&bcache_fs_type);
|
||||
if (bch2_dio_write_bioset)
|
||||
bioset_free(bch2_dio_write_bioset);
|
||||
if (bch2_dio_read_bioset)
|
||||
bioset_free(bch2_dio_read_bioset);
|
||||
if (bch2_writepage_bioset)
|
||||
bioset_free(bch2_writepage_bioset);
|
||||
if (bch2_inode_cache)
|
||||
kmem_cache_destroy(bch2_inode_cache);
|
||||
}
|
||||
@ -1354,20 +1348,6 @@ int __init bch2_vfs_init(void)
|
||||
if (!bch2_inode_cache)
|
||||
goto err;
|
||||
|
||||
bch2_writepage_bioset =
|
||||
bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
|
||||
if (!bch2_writepage_bioset)
|
||||
goto err;
|
||||
|
||||
bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
|
||||
if (!bch2_dio_read_bioset)
|
||||
goto err;
|
||||
|
||||
bch2_dio_write_bioset =
|
||||
bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
|
||||
if (!bch2_dio_write_bioset)
|
||||
goto err;
|
||||
|
||||
ret = register_filesystem(&bcache_fs_type);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
libbcachefs/io.c (1225 lines changed; file diff suppressed because it is too large)
@ -2,6 +2,8 @@
|
||||
#define _BCACHEFS_IO_H
|
||||
|
||||
#include <linux/hash.h>
|
||||
#include "alloc.h"
|
||||
#include "checksum.h"
|
||||
#include "io_types.h"
|
||||
|
||||
#define to_wbio(_bio) \
|
||||
@ -12,6 +14,9 @@
|
||||
|
||||
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
|
||||
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
|
||||
void bch2_latency_acct(struct bch_dev *, unsigned, int);
|
||||
|
||||
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
|
||||
enum bch_data_type, const struct bkey_i *);
|
||||
@ -20,14 +25,15 @@ enum bch_write_flags {
|
||||
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
|
||||
BCH_WRITE_CACHED = (1 << 1),
|
||||
BCH_WRITE_FLUSH = (1 << 2),
|
||||
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
|
||||
BCH_WRITE_THROTTLE = (1 << 4),
|
||||
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
|
||||
BCH_WRITE_DATA_ENCODED = (1 << 3),
|
||||
BCH_WRITE_PAGES_STABLE = (1 << 4),
|
||||
BCH_WRITE_PAGES_OWNED = (1 << 5),
|
||||
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
|
||||
|
||||
/* Internal: */
|
||||
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
|
||||
BCH_WRITE_DONE = (1 << 7),
|
||||
BCH_WRITE_LOOPED = (1 << 8),
|
||||
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
|
||||
BCH_WRITE_DONE = (1 << 8),
|
||||
BCH_WRITE_LOOPED = (1 << 9),
|
||||
};
|
||||
|
||||
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
||||
@ -36,11 +42,60 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
|
||||
? op->journal_seq_p : &op->journal_seq;
|
||||
}
|
||||
|
||||
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
			struct disk_reservation,
			struct bch_devs_mask *,
			unsigned long,
			struct bpos, u64 *, unsigned);
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
	return op->alloc_reserve == RESERVE_MOVINGGC
		? op->c->copygc_wq
		: op->c->wq;
}

int bch2_write_index_default(struct bch_write_op *);

static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
	op->c = c;
	op->io_wq = index_update_wq(op);
	op->flags = 0;
	op->written = 0;
	op->error = 0;
	op->csum_type = bch2_data_checksum_type(c);
	op->compression_type =
		bch2_compression_opt_to_type(c->opts.compression);
	op->nr_replicas = 0;
	op->nr_replicas_required = c->opts.data_replicas_required;
	op->alloc_reserve = RESERVE_NONE;
	op->open_buckets_nr = 0;
	op->devs_have.nr = 0;
	op->pos = POS_MAX;
	op->version = ZERO_VERSION;
	op->devs = NULL;
	op->write_point = (struct write_point_specifier) { 0 };
	op->res = (struct disk_reservation) { 0 };
	op->journal_seq = 0;
	op->index_update_fn = bch2_write_index_default;
}

static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
				      struct disk_reservation res,
				      struct bch_devs_mask *devs,
				      struct write_point_specifier write_point,
				      struct bpos pos,
				      u64 *journal_seq, unsigned flags)
{
	__bch2_write_op_init(op, c);
	op->flags = flags;
	op->nr_replicas = res.nr_replicas;
	op->pos = pos;
	op->res = res;
	op->devs = devs;
	op->write_point = write_point;

	if (journal_seq) {
		op->journal_seq_p = journal_seq;
		op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
	}
}

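/*
 * Illustrative sketch only, not part of this commit: a typical initialization
 * following the call sites elsewhere in this patch - writepoint_hashed() turns
 * a caller-supplied token into a write_point_specifier; the helper name is
 * hypothetical:
 */
static inline void example_write_op_setup(struct bch_write_op *op, struct bch_fs *c,
					  struct disk_reservation res,
					  u64 inum, u64 *journal_seq)
{
	bch2_write_op_init(op, c, res, NULL,
			   writepoint_hashed((unsigned long) current),
			   POS(inum, 0), journal_seq, 0);
}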
void bch2_write(struct closure *);
|
||||
|
||||
static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
||||
@ -51,14 +106,13 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
||||
return wbio;
|
||||
}
|
||||
|
||||
void bch2_wake_delayed_writes(unsigned long data);
|
||||
|
||||
struct bch_devs_mask;
|
||||
struct cache_promote_op;
|
||||
struct extent_pick_ptr;
|
||||
|
||||
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
||||
struct bkey_s_c k, struct extent_pick_ptr *, unsigned);
|
||||
struct bkey_s_c_extent e, struct extent_pick_ptr *,
|
||||
unsigned);
|
||||
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
||||
u64, struct bch_devs_mask *, unsigned);
|
||||
|
||||
@ -66,21 +120,22 @@ enum bch_read_flags {
|
||||
BCH_READ_RETRY_IF_STALE = 1 << 0,
|
||||
BCH_READ_MAY_PROMOTE = 1 << 1,
|
||||
BCH_READ_USER_MAPPED = 1 << 2,
|
||||
BCH_READ_NODECODE = 1 << 3,
|
||||
|
||||
/* internal: */
|
||||
BCH_READ_MUST_BOUNCE = 1 << 3,
|
||||
BCH_READ_MUST_CLONE = 1 << 4,
|
||||
BCH_READ_IN_RETRY = 1 << 5,
|
||||
BCH_READ_MUST_BOUNCE = 1 << 4,
|
||||
BCH_READ_MUST_CLONE = 1 << 5,
|
||||
BCH_READ_IN_RETRY = 1 << 6,
|
||||
};
|
||||
|
||||
static inline void bch2_read_extent(struct bch_fs *c,
|
||||
struct bch_read_bio *rbio,
|
||||
struct bkey_s_c k,
|
||||
struct bkey_s_c_extent e,
|
||||
struct extent_pick_ptr *pick,
|
||||
unsigned flags)
|
||||
{
|
||||
rbio->_state = 0;
|
||||
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags);
|
||||
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
|
||||
}
|
||||
|
||||
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
||||
|
@ -1,20 +1,16 @@
|
||||
#ifndef _BCACHEFS_IO_TYPES_H
|
||||
#define _BCACHEFS_IO_TYPES_H
|
||||
|
||||
#include "alloc_types.h"
|
||||
#include "btree_types.h"
|
||||
#include "buckets_types.h"
|
||||
#include "extents_types.h"
|
||||
#include "keylist_types.h"
|
||||
#include "super_types.h"
|
||||
|
||||
#include <linux/llist.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
struct extent_pick_ptr {
|
||||
struct bch_extent_crc128 crc;
|
||||
struct bch_extent_ptr ptr;
|
||||
struct bch_dev *ca;
|
||||
};
|
||||
|
||||
struct bch_read_bio {
|
||||
struct bch_fs *c;
|
||||
|
||||
@ -44,26 +40,22 @@ struct bch_read_bio {
|
||||
struct {
|
||||
u8 bounce:1,
|
||||
split:1,
|
||||
process_context:1,
|
||||
retry:2;
|
||||
narrow_crcs:1,
|
||||
retry:2,
|
||||
context:2;
|
||||
};
|
||||
u8 _state;
|
||||
};
|
||||
|
||||
struct bch_devs_list devs_have;
|
||||
|
||||
struct extent_pick_ptr pick;
|
||||
/* start pos of data we read (may not be pos of data we want) */
|
||||
struct bpos pos;
|
||||
struct bversion version;
|
||||
|
||||
struct promote_op *promote;
|
||||
|
||||
/*
|
||||
* If we have to retry the read (IO error, checksum failure, read stale
|
||||
* data (raced with allocator), we retry the portion of the parent bio
|
||||
* that failed (i.e. this bio's portion, bvec_iter).
|
||||
*
|
||||
* But we need to stash the inode somewhere:
|
||||
*/
|
||||
u64 inode;
|
||||
|
||||
struct work_struct work;
|
||||
|
||||
struct bio bio;
|
||||
@ -98,36 +90,33 @@ struct bch_write_op {
|
||||
struct bch_fs *c;
|
||||
struct workqueue_struct *io_wq;
|
||||
|
||||
unsigned written; /* sectors */
|
||||
|
||||
short error;
|
||||
|
||||
u16 flags;
|
||||
u16 written; /* sectors */
|
||||
s8 error;
|
||||
|
||||
unsigned csum_type:4;
|
||||
unsigned compression_type:4;
|
||||
unsigned nr_replicas:4;
|
||||
unsigned nr_replicas_required:4;
|
||||
unsigned alloc_reserve:4;
|
||||
unsigned nonce:14;
|
||||
|
||||
u8 open_buckets_nr;
|
||||
struct bch_devs_list devs_have;
|
||||
u16 target;
|
||||
u16 nonce;
|
||||
|
||||
struct bpos pos;
|
||||
struct bversion version;
|
||||
|
||||
/* For BCH_WRITE_DATA_COMPRESSED: */
|
||||
struct bch_extent_crc128 crc;
|
||||
unsigned size;
|
||||
/* For BCH_WRITE_DATA_ENCODED: */
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
struct bch_devs_mask *devs;
|
||||
unsigned long write_point;
|
||||
struct write_point_specifier write_point;
|
||||
|
||||
struct disk_reservation res;
|
||||
|
||||
union {
|
||||
u8 open_buckets[16];
|
||||
struct {
|
||||
struct bch_write_op *next;
|
||||
unsigned long expires;
|
||||
};
|
||||
};
|
||||
|
||||
/*
|
||||
* If caller wants to flush but hasn't passed us a journal_seq ptr, we
|
||||
|
@ -464,7 +464,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j,
|
||||
if (invalid) {
|
||||
bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
|
||||
bkey_i_to_s_c(k));
|
||||
mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf);
|
||||
mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
|
||||
type, invalid, buf);
|
||||
|
||||
le16_add_cpu(&entry->u64s, -k->k.u64s);
|
||||
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
|
||||
@ -1568,35 +1569,31 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
||||
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
|
||||
swap(new_buckets, ja->buckets);
|
||||
swap(new_bucket_seq, ja->bucket_seq);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
while (ja->nr < nr) {
|
||||
/* must happen under journal lock, to avoid racing with gc: */
|
||||
long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC);
|
||||
if (b < 0) {
|
||||
if (!closure_wait(&c->freelist_wait, &cl)) {
|
||||
spin_unlock(&j->lock);
|
||||
struct open_bucket *ob;
|
||||
size_t bucket;
|
||||
int ob_idx;
|
||||
|
||||
ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
|
||||
if (ob_idx < 0) {
|
||||
if (!closure_wait(&c->freelist_wait, &cl))
|
||||
closure_sync(&cl);
|
||||
spin_lock(&j->lock);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
bch2_mark_metadata_bucket(ca, &ca->buckets[b],
|
||||
BUCKET_JOURNAL, false);
|
||||
bch2_mark_alloc_bucket(ca, &ca->buckets[b], false);
|
||||
ob = c->open_buckets + ob_idx;
|
||||
bucket = sector_to_bucket(ca, ob->ptr.offset);
|
||||
|
||||
memmove(ja->buckets + ja->last_idx + 1,
|
||||
ja->buckets + ja->last_idx,
|
||||
(ja->nr - ja->last_idx) * sizeof(u64));
|
||||
memmove(ja->bucket_seq + ja->last_idx + 1,
|
||||
ja->bucket_seq + ja->last_idx,
|
||||
(ja->nr - ja->last_idx) * sizeof(u64));
|
||||
memmove(journal_buckets->buckets + ja->last_idx + 1,
|
||||
journal_buckets->buckets + ja->last_idx,
|
||||
(ja->nr - ja->last_idx) * sizeof(u64));
|
||||
spin_lock(&j->lock);
|
||||
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
|
||||
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
|
||||
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
|
||||
|
||||
ja->buckets[ja->last_idx] = b;
|
||||
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
|
||||
ja->buckets[ja->last_idx] = bucket;
|
||||
ja->bucket_seq[ja->last_idx] = 0;
|
||||
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
|
||||
|
||||
if (ja->last_idx < ja->nr) {
|
||||
if (ja->cur_idx >= ja->last_idx)
|
||||
@ -1604,9 +1601,14 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
||||
ja->last_idx++;
|
||||
}
|
||||
ja->nr++;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket],
|
||||
BUCKET_JOURNAL,
|
||||
gc_phase(GC_PHASE_SB), 0);
|
||||
|
||||
bch2_open_bucket_put(c, ob);
|
||||
}
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
|
||||
|
||||
@ -1623,6 +1625,8 @@ err:
|
||||
if (!ret)
|
||||
bch2_dev_allocator_add(c, ca);
|
||||
|
||||
closure_sync(&cl);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -7,8 +7,7 @@ int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
|
||||
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
|
||||
void bch2_keylist_pop_front(struct keylist *);
|
||||
|
||||
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys,
|
||||
size_t nr_inline_u64s)
|
||||
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
|
||||
{
|
||||
l->top_p = l->keys_p = inline_keys;
|
||||
}
|
||||
@ -17,7 +16,7 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
|
||||
{
|
||||
if (l->keys_p != inline_keys)
|
||||
kfree(l->keys_p);
|
||||
memset(l, 0, sizeof(*l));
|
||||
bch2_keylist_init(l, inline_keys);
|
||||
}
|
||||
|
||||
static inline void bch2_keylist_push(struct keylist *l)
|
||||
|
@ -13,31 +13,16 @@
|
||||
#include "move.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static int issue_migration_move(struct bch_dev *ca,
|
||||
struct moving_context *ctxt,
|
||||
struct bch_devs_mask *devs,
|
||||
struct bkey_s_c k)
|
||||
static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct disk_reservation res;
|
||||
struct bch_dev *ca = arg;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
int ret;
|
||||
|
||||
if (bch2_disk_reservation_get(c, &res, k.k->size, 0))
|
||||
return -ENOSPC;
|
||||
|
||||
extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (ptr->dev == ca->dev_idx)
|
||||
goto found;
|
||||
return true;
|
||||
|
||||
BUG();
|
||||
found:
|
||||
/* XXX: we need to be doing something with the disk reservation */
|
||||
|
||||
ret = bch2_data_move(c, ctxt, devs, k, ptr);
|
||||
if (ret)
|
||||
bch2_disk_reservation_put(c, &res);
|
||||
return ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
#define MAX_DATA_OFF_ITER 10
|
||||
@ -58,10 +43,11 @@ found:
|
||||
|
||||
int bch2_move_data_off_device(struct bch_dev *ca)
|
||||
{
|
||||
struct moving_context ctxt;
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
u64 keys_moved, sectors_moved;
|
||||
unsigned pass = 0;
|
||||
u64 seen_key_count;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
|
||||
@ -69,12 +55,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
|
||||
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
|
||||
|
||||
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
__set_bit(ca->dev_idx, ctxt.avoid.d);
|
||||
|
||||
/*
|
||||
* In theory, only one pass should be necessary as we've
|
||||
* quiesced all writes before calling this.
|
||||
@ -91,69 +71,43 @@ int bch2_move_data_off_device(struct bch_dev *ca)
|
||||
* Thus this scans the tree one more time than strictly necessary,
|
||||
* but that can be viewed as a verification pass.
|
||||
*/
|
||||
|
||||
do {
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
|
||||
seen_key_count = 0;
|
||||
atomic_set(&ctxt.error_count, 0);
|
||||
atomic_set(&ctxt.error_flags, 0);
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
|
||||
BTREE_ITER_PREFETCH);
|
||||
|
||||
while (!bch2_move_ctxt_wait(&ctxt) &&
|
||||
(k = bch2_btree_iter_peek(&iter)).k &&
|
||||
!(ret = btree_iter_err(k))) {
|
||||
if (!bkey_extent_is_data(k.k) ||
|
||||
!bch2_extent_has_device(bkey_s_c_to_extent(k),
|
||||
ca->dev_idx))
|
||||
goto next;
|
||||
|
||||
ret = issue_migration_move(ca, &ctxt, NULL, k);
|
||||
if (ret == -ENOMEM) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
/*
|
||||
* memory allocation failure, wait for some IO
|
||||
* to finish
|
||||
*/
|
||||
bch2_move_ctxt_wait_for_io(&ctxt);
|
||||
continue;
|
||||
}
|
||||
if (ret == -ENOSPC)
|
||||
break;
|
||||
BUG_ON(ret);
|
||||
|
||||
seen_key_count++;
|
||||
continue;
|
||||
next:
|
||||
if (bkey_extent_is_data(k.k)) {
|
||||
ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
|
||||
BCH_DATA_USER);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
|
||||
ret = bch2_move_data(c, NULL,
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE,
|
||||
NULL,
|
||||
writepoint_hashed((unsigned long) current),
|
||||
0,
|
||||
ca->dev_idx,
|
||||
migrate_pred, ca,
|
||||
&keys_moved,
|
||||
			     &sectors_moved);
|
||||
if (ret) {
|
||||
bch_err(c, "error migrating data: %i", ret);
|
||||
return ret;
|
||||
}
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_move_ctxt_exit(&ctxt);
|
||||
} while (keys_moved && pass++ < MAX_DATA_OFF_ITER);
|
||||
|
||||
if (ret)
|
||||
goto err;
|
||||
} while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
|
||||
|
||||
if (seen_key_count) {
|
||||
pr_err("Unable to migrate all data in %d iterations.",
|
||||
MAX_DATA_OFF_ITER);
|
||||
ret = -1;
|
||||
goto err;
|
||||
if (keys_moved) {
|
||||
bch_err(c, "unable to migrate all data in %d iterations",
|
||||
MAX_DATA_OFF_ITER);
|
||||
return -1;
|
||||
}
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
continue;
|
||||
|
||||
ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
|
||||
BCH_DATA_USER);
|
||||
if (ret) {
|
||||
bch_err(c, "error migrating data %i from check_mark_super()", ret);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
err:
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
return ret;
|
||||
@ -167,14 +121,11 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
|
||||
enum btree_id id)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct closure cl;
|
||||
struct btree *b;
|
||||
int ret;
|
||||
|
||||
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
|
||||
|
||||
|
@ -9,41 +9,38 @@
|
||||
#include "keylist.h"
|
||||
|
||||
#include <linux/ioprio.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
#include <trace/events/bcachefs.h>
|
||||
|
||||
static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
|
||||
struct bkey_s_extent e,
|
||||
struct bch_extent_ptr ptr)
|
||||
{
|
||||
struct bch_extent_ptr *ptr2;
|
||||
struct bch_dev *ca = c->devs[ptr.dev];
|
||||
struct moving_io {
|
||||
struct list_head list;
|
||||
struct closure cl;
|
||||
bool read_completed;
|
||||
unsigned sectors;
|
||||
|
||||
extent_for_each_ptr(e, ptr2)
|
||||
if (ptr2->dev == ptr.dev &&
|
||||
ptr2->gen == ptr.gen &&
|
||||
PTR_BUCKET_NR(ca, ptr2) ==
|
||||
PTR_BUCKET_NR(ca, &ptr))
|
||||
return ptr2;
|
||||
struct bch_read_bio rbio;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
struct migrate_write write;
|
||||
/* Must be last since it is variable size */
|
||||
struct bio_vec bi_inline_vecs[0];
|
||||
};
|
||||
|
||||
static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m,
|
||||
struct bkey_s_extent e)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_ptr *ret;
|
||||
struct moving_context {
|
||||
/* Closure for waiting on all reads and writes to complete */
|
||||
struct closure cl;
|
||||
|
||||
if (m->move)
|
||||
ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
|
||||
else
|
||||
extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
|
||||
if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
|
||||
break;
|
||||
/* Key and sector moves issued, updated from submission context */
|
||||
u64 keys_moved;
|
||||
u64 sectors_moved;
|
||||
atomic64_t sectors_raced;
|
||||
|
||||
return ret;
|
||||
}
|
||||
struct list_head reads;
|
||||
|
||||
atomic_t sectors_in_flight;
|
||||
|
||||
wait_queue_head_t wait;
|
||||
};
|
||||
|
||||
static int bch2_migrate_index_update(struct bch_write_op *op)
|
||||
{
|
||||
@ -59,71 +56,78 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
|
||||
BTREE_ITER_INTENT);
|
||||
|
||||
while (1) {
|
||||
struct bkey_s_extent insert =
|
||||
bkey_i_to_s_extent(bch2_keylist_front(keys));
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
|
||||
struct bkey_i_extent *insert, *new =
|
||||
bkey_i_to_extent(bch2_keylist_front(keys));
|
||||
BKEY_PADDED(k) _new, _insert;
|
||||
struct bch_extent_ptr *ptr;
|
||||
struct bkey_s_extent e;
|
||||
BKEY_PADDED(k) new;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
bool did_work = false;
|
||||
|
||||
if (!k.k) {
|
||||
if (btree_iter_err(k)) {
|
||||
ret = bch2_btree_iter_unlock(&iter);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
if (bversion_cmp(k.k->version, new->k.version) ||
|
||||
!bkey_extent_is_data(k.k) ||
|
||||
!bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
|
||||
m->ptr, m->offset))
|
||||
goto nomatch;
|
||||
|
||||
bkey_reassemble(&new.k, k);
|
||||
bch2_cut_front(iter.pos, &new.k);
|
||||
bch2_cut_back(insert.k->p, &new.k.k);
|
||||
e = bkey_i_to_s_extent(&new.k);
|
||||
bkey_reassemble(&_insert.k, k);
|
||||
insert = bkey_i_to_extent(&_insert.k);
|
||||
|
||||
/* hack - promotes can race: */
|
||||
if (m->promote)
|
||||
extent_for_each_ptr(insert, ptr)
|
||||
if (bch2_extent_has_device(e.c, ptr->dev))
|
||||
goto nomatch;
|
||||
bkey_copy(&_new.k, bch2_keylist_front(keys));
|
||||
new = bkey_i_to_extent(&_new.k);
|
||||
|
||||
ptr = bch2_migrate_matching_ptr(m, e);
|
||||
if (ptr) {
|
||||
int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c);
|
||||
unsigned insert_flags =
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL;
|
||||
bch2_cut_front(iter.pos, &insert->k_i);
|
||||
bch2_cut_back(new->k.p, &insert->k);
|
||||
bch2_cut_back(insert->k.p, &new->k);
|
||||
|
||||
/* copygc uses btree node reserve: */
|
||||
if (m->move)
|
||||
insert_flags |= BTREE_INSERT_USE_RESERVE;
|
||||
if (m->move_dev >= 0 &&
|
||||
(ptr = (struct bch_extent_ptr *)
|
||||
bch2_extent_has_device(extent_i_to_s_c(insert),
|
||||
m->move_dev)))
|
||||
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
|
||||
|
||||
if (m->move) {
|
||||
nr_new_dirty -= !ptr->cached;
|
||||
__bch2_extent_drop_ptr(e, ptr);
|
||||
|
||||
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
|
||||
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
|
||||
/*
|
||||
* raced with another move op? extent already
|
||||
* has a pointer to the device we just wrote
|
||||
* data to
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
BUG_ON(nr_new_dirty < 0);
|
||||
|
||||
memcpy_u64s(extent_entry_last(e),
|
||||
insert.v,
|
||||
bkey_val_u64s(insert.k));
|
||||
e.k->u64s += bkey_val_u64s(insert.k);
|
||||
|
||||
bch2_extent_narrow_crcs(e);
|
||||
bch2_extent_drop_redundant_crcs(e);
|
||||
bch2_extent_normalize(c, e.s);
|
||||
bch2_extent_mark_replicas_cached(c, e, nr_new_dirty);
|
||||
|
||||
ret = bch2_btree_insert_at(c, &op->res,
|
||||
NULL, op_journal_seq(op),
|
||||
insert_flags,
|
||||
BTREE_INSERT_ENTRY(&iter, &new.k));
|
||||
if (ret && ret != -EINTR)
|
||||
break;
|
||||
} else {
|
||||
nomatch:
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
bch2_extent_crc_append(insert, crc);
|
||||
extent_ptr_append(insert, *ptr);
|
||||
did_work = true;
|
||||
}
|
||||
|
||||
if (!did_work)
|
||||
goto nomatch;
|
||||
|
||||
bch2_extent_narrow_crcs(insert,
|
||||
(struct bch_extent_crc_unpacked) { 0 });
|
||||
bch2_extent_normalize(c, extent_i_to_s(insert).s);
|
||||
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
|
||||
|
||||
ret = bch2_btree_insert_at(c, &op->res,
|
||||
NULL, op_journal_seq(op),
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL|
|
||||
m->btree_insert_flags,
|
||||
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
|
||||
if (!ret)
|
||||
atomic_long_inc(&c->extent_migrate_done);
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
if (ret)
|
||||
break;
|
||||
next:
|
||||
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
|
||||
bch2_keylist_pop_front(keys);
|
||||
if (bch2_keylist_empty(keys))
|
||||
@ -131,96 +135,83 @@ nomatch:
|
||||
}
|
||||
|
||||
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
|
||||
continue;
|
||||
nomatch:
|
||||
if (m->ctxt)
|
||||
atomic64_add(k.k->p.offset - iter.pos.offset,
|
||||
&m->ctxt->sectors_raced);
|
||||
atomic_long_inc(&c->extent_migrate_raced);
|
||||
trace_move_race(&new->k);
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
goto next;
|
||||
}
|
||||
out:
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_migrate_write_init(struct bch_fs *c,
|
||||
struct migrate_write *m,
|
||||
struct bch_devs_mask *devs,
|
||||
struct bkey_s_c k,
|
||||
const struct bch_extent_ptr *move_ptr,
|
||||
unsigned flags)
|
||||
void bch2_migrate_write_init(struct migrate_write *m,
|
||||
struct bch_read_bio *rbio)
|
||||
{
|
||||
bkey_reassemble(&m->key, k);
|
||||
/* write bio must own pages: */
|
||||
BUG_ON(!m->op.wbio.bio.bi_vcnt);
|
||||
|
||||
m->promote = false;
|
||||
m->move = move_ptr != NULL;
|
||||
if (move_ptr)
|
||||
m->move_ptr = *move_ptr;
|
||||
m->ptr = rbio->pick.ptr;
|
||||
m->offset = rbio->pos.offset - rbio->pick.crc.offset;
|
||||
m->op.devs_have = rbio->devs_have;
|
||||
m->op.pos = rbio->pos;
|
||||
m->op.version = rbio->version;
|
||||
m->op.crc = rbio->pick.crc;
|
||||
|
||||
if (bkey_extent_is_cached(k.k) ||
|
||||
(move_ptr && move_ptr->cached))
|
||||
flags |= BCH_WRITE_CACHED;
|
||||
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
|
||||
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
|
||||
m->op.csum_type = m->op.crc.csum_type;
|
||||
}
|
||||
|
||||
bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 },
|
||||
devs, (unsigned long) current,
|
||||
bkey_start_pos(k.k), NULL,
|
||||
flags|BCH_WRITE_ONLY_SPECIFIED_DEVS);
|
||||
if (m->move_dev >= 0)
|
||||
bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
|
||||
|
||||
if (m->move)
|
||||
if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
|
||||
m->op.alloc_reserve = RESERVE_MOVINGGC;
|
||||
|
||||
m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k));
|
||||
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
|
||||
BCH_WRITE_PAGES_STABLE|
|
||||
BCH_WRITE_PAGES_OWNED|
|
||||
BCH_WRITE_DATA_ENCODED;
|
||||
|
||||
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
|
||||
m->op.nr_replicas = 1;
|
||||
m->op.nr_replicas_required = 1;
|
||||
m->op.index_update_fn = bch2_migrate_index_update;
|
||||
}
|
||||
|
||||
static void migrate_bio_init(struct moving_io *io, struct bio *bio,
|
||||
unsigned sectors)
|
||||
static void move_free(struct closure *cl)
|
||||
{
|
||||
bio_init(bio, io->bi_inline_vecs,
|
||||
DIV_ROUND_UP(sectors, PAGE_SECTORS));
|
||||
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
|
||||
bio->bi_iter.bi_size = sectors << 9;
|
||||
bio->bi_private = &io->cl;
|
||||
bch2_bio_map(bio, NULL);
|
||||
}
|
||||
|
||||
static void moving_io_free(struct moving_io *io)
|
||||
{
|
||||
struct moving_context *ctxt = io->ctxt;
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
struct bio_vec *bv;
|
||||
int i;
|
||||
|
||||
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
|
||||
wake_up(&ctxt->wait);
|
||||
|
||||
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
|
||||
if (bv->bv_page)
|
||||
__free_page(bv->bv_page);
|
||||
|
||||
atomic_sub(io->sectors, &ctxt->sectors_in_flight);
|
||||
wake_up(&ctxt->wait);
|
||||
|
||||
kfree(io);
|
||||
}
|
||||
|
||||
static void moving_error(struct moving_context *ctxt, unsigned flag)
|
||||
{
|
||||
atomic_inc(&ctxt->error_count);
|
||||
//atomic_or(flag, &ctxt->error_flags);
|
||||
}
|
||||
|
||||
static void moving_write_done(struct closure *cl)
|
||||
static void move_write(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
if (io->write.op.error)
|
||||
moving_error(io->ctxt, MOVING_FLAG_WRITE);
|
||||
if (likely(!io->rbio.bio.bi_error)) {
|
||||
bch2_migrate_write_init(&io->write, &io->rbio);
|
||||
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
|
||||
}
|
||||
|
||||
//if (io->replace.failures)
|
||||
// trace_copy_collision(q, &io->key.k);
|
||||
|
||||
moving_io_free(io);
|
||||
}
|
||||
|
||||
static void write_moving(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
struct bch_write_op *op = &io->write.op;
|
||||
|
||||
closure_call(&op->cl, bch2_write, NULL, &io->cl);
|
||||
closure_return_with_destructor(&io->cl, moving_write_done);
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
}
|
||||
|
||||
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
|
||||
@ -231,16 +222,10 @@ static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
|
||||
return io && io->read_completed ? io : NULL;
|
||||
}
|
||||
|
||||
static void read_moving_endio(struct bio *bio)
|
||||
static void move_read_endio(struct bio *bio)
|
||||
{
|
||||
struct closure *cl = bio->bi_private;
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
struct moving_context *ctxt = io->ctxt;
|
||||
|
||||
trace_move_read_done(&io->write.key.k);
|
||||
|
||||
if (bio->bi_error)
|
||||
moving_error(io->ctxt, MOVING_FLAG_READ);
|
||||
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
|
||||
io->read_completed = true;
|
||||
if (next_pending_write(ctxt))
|
||||
@ -249,58 +234,81 @@ static void read_moving_endio(struct bio *bio)
|
||||
closure_put(&ctxt->cl);
|
||||
}
|
||||
|
||||
int bch2_data_move(struct bch_fs *c,
|
||||
struct moving_context *ctxt,
|
||||
struct bch_devs_mask *devs,
|
||||
struct bkey_s_c k,
|
||||
const struct bch_extent_ptr *move_ptr)
|
||||
static int bch2_move_extent(struct bch_fs *c,
|
||||
struct moving_context *ctxt,
|
||||
struct bch_devs_mask *devs,
|
||||
struct write_point_specifier wp,
|
||||
int btree_insert_flags,
|
||||
int move_device,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct extent_pick_ptr pick;
|
||||
struct moving_io *io;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
unsigned sectors = k.k->size, pages;
|
||||
|
||||
bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick);
|
||||
bch2_extent_pick_ptr(c, k, NULL, &pick);
|
||||
if (IS_ERR_OR_NULL(pick.ca))
|
||||
return pick.ca ? PTR_ERR(pick.ca) : 0;
|
||||
|
||||
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
|
||||
DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL);
|
||||
/* write path might have to decompress data: */
|
||||
extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc)
|
||||
sectors = max_t(unsigned, sectors, crc.uncompressed_size);
|
||||
|
||||
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
||||
io = kzalloc(sizeof(struct moving_io) +
|
||||
sizeof(struct bio_vec) * pages, GFP_KERNEL);
|
||||
if (!io)
|
||||
return -ENOMEM;
|
||||
goto err;
|
||||
|
||||
io->ctxt = ctxt;
|
||||
io->write.ctxt = ctxt;
|
||||
io->sectors = k.k->size;
|
||||
|
||||
migrate_bio_init(io, &io->rbio.bio, k.k->size);
|
||||
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
|
||||
bio_set_prio(&io->write.op.wbio.bio,
|
||||
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
|
||||
|
||||
bch2_bio_map(&io->write.op.wbio.bio, NULL);
|
||||
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) {
|
||||
kfree(io);
|
||||
goto err;
|
||||
}
|
||||
|
||||
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
|
||||
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
io->rbio.bio.bi_iter.bi_size = sectors << 9;
|
||||
|
||||
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
|
||||
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
|
||||
io->rbio.bio.bi_end_io = read_moving_endio;
|
||||
io->rbio.bio.bi_end_io = move_read_endio;
|
||||
|
||||
if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
|
||||
kfree(io);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
|
||||
|
||||
bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0);
|
||||
|
||||
trace_move_read(&io->write.key.k);
|
||||
__bch2_write_op_init(&io->write.op, c);
|
||||
io->write.btree_insert_flags = btree_insert_flags;
|
||||
io->write.move_dev = move_device;
|
||||
io->write.op.devs = devs;
|
||||
io->write.op.write_point = wp;
|
||||
|
||||
ctxt->keys_moved++;
|
||||
ctxt->sectors_moved += k.k->size;
|
||||
if (ctxt->rate)
|
||||
bch2_ratelimit_increment(ctxt->rate, k.k->size);
|
||||
|
||||
atomic_add(k.k->size, &ctxt->sectors_in_flight);
|
||||
trace_move_extent(k.k);
|
||||
|
||||
atomic_add(io->sectors, &ctxt->sectors_in_flight);
|
||||
list_add_tail(&io->list, &ctxt->reads);
|
||||
|
||||
/*
|
||||
* dropped by read_moving_endio() - guards against use after free of
|
||||
* dropped by move_read_endio() - guards against use after free of
|
||||
* ctxt when doing wakeup
|
||||
*/
|
||||
closure_get(&io->ctxt->cl);
|
||||
bch2_read_extent(c, &io->rbio, k, &pick, 0);
|
||||
closure_get(&ctxt->cl);
|
||||
bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k),
|
||||
&pick, BCH_READ_NODECODE);
|
||||
return 0;
|
||||
err:
|
||||
trace_move_alloc_fail(k.k);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void do_pending_writes(struct moving_context *ctxt)
|
||||
@ -309,14 +317,7 @@ static void do_pending_writes(struct moving_context *ctxt)
|
||||
|
||||
while ((io = next_pending_write(ctxt))) {
|
||||
list_del(&io->list);
|
||||
|
||||
if (io->rbio.bio.bi_error) {
|
||||
moving_io_free(io);
|
||||
continue;
|
||||
}
|
||||
|
||||
trace_move_write(&io->write.key.k);
|
||||
closure_call(&io->cl, write_moving, NULL, &ctxt->cl);
|
||||
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
|
||||
}
|
||||
}
|
||||
|
||||
@ -330,18 +331,7 @@ do { \
|
||||
next_pending_write(_ctxt) || (_cond)); \
|
||||
} while (1)
|
||||
|
||||
int bch2_move_ctxt_wait(struct moving_context *ctxt)
|
||||
{
|
||||
move_ctxt_wait_event(ctxt,
|
||||
atomic_read(&ctxt->sectors_in_flight) <
|
||||
ctxt->max_sectors_in_flight);
|
||||
|
||||
return ctxt->rate
|
||||
? bch2_ratelimit_wait_freezable_stoppable(ctxt->rate)
|
||||
: 0;
|
||||
}
|
||||
|
||||
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
|
||||
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
|
||||
{
|
||||
unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
|
||||
|
||||
@ -350,7 +340,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
|
||||
atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
|
||||
}
|
||||
|
||||
void bch2_move_ctxt_exit(struct moving_context *ctxt)
|
||||
static void bch2_move_ctxt_exit(struct moving_context *ctxt)
|
||||
{
|
||||
move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
|
||||
closure_sync(&ctxt->cl);
|
||||
@ -359,16 +349,92 @@ void bch2_move_ctxt_exit(struct moving_context *ctxt)
|
||||
EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
|
||||
}
|
||||
|
||||
void bch2_move_ctxt_init(struct moving_context *ctxt,
|
||||
struct bch_ratelimit *rate,
|
||||
unsigned max_sectors_in_flight)
|
||||
static void bch2_move_ctxt_init(struct moving_context *ctxt)
|
||||
{
|
||||
memset(ctxt, 0, sizeof(*ctxt));
|
||||
closure_init_stack(&ctxt->cl);
|
||||
|
||||
ctxt->rate = rate;
|
||||
ctxt->max_sectors_in_flight = max_sectors_in_flight;
|
||||
|
||||
INIT_LIST_HEAD(&ctxt->reads);
|
||||
init_waitqueue_head(&ctxt->wait);
|
||||
}
|
||||
|
||||
int bch2_move_data(struct bch_fs *c,
|
||||
struct bch_ratelimit *rate,
|
||||
unsigned sectors_in_flight,
|
||||
struct bch_devs_mask *devs,
|
||||
struct write_point_specifier wp,
|
||||
int btree_insert_flags,
|
||||
int move_device,
|
||||
move_pred_fn pred, void *arg,
|
||||
u64 *keys_moved,
|
||||
u64 *sectors_moved)
|
||||
{
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
struct moving_context ctxt;
|
||||
struct btree_iter iter;
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
bch2_move_ctxt_init(&ctxt);
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
|
||||
BTREE_ITER_PREFETCH);
|
||||
|
||||
if (rate)
|
||||
bch2_ratelimit_reset(rate);
|
||||
|
||||
while (!kthread || !(ret = kthread_should_stop())) {
|
||||
if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
move_ctxt_wait_event(&ctxt,
|
||||
atomic_read(&ctxt.sectors_in_flight) <
|
||||
sectors_in_flight);
|
||||
}
|
||||
|
||||
if (rate &&
|
||||
bch2_ratelimit_delay(rate) &&
|
||||
(bch2_btree_iter_unlock(&iter),
|
||||
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
|
||||
break;
|
||||
|
||||
k = bch2_btree_iter_peek(&iter);
|
||||
if (!k.k)
|
||||
break;
|
||||
ret = btree_iter_err(k);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (!bkey_extent_is_data(k.k) ||
|
||||
!pred(arg, bkey_s_c_to_extent(k)))
|
||||
goto next;
|
||||
|
||||
/* unlock before doing IO: */
|
||||
bkey_reassemble(&tmp.k, k);
|
||||
k = bkey_i_to_s_c(&tmp.k);
|
||||
bch2_btree_iter_unlock(&iter);
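		/*
		 * The key was copied into the on-stack BKEY_PADDED buffer
		 * above because unlocking the iterator drops the btree node
		 * locks that kept the peeked key stable; the read below is
		 * issued against this private copy.
		 */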
|
||||
|
||||
if (bch2_move_extent(c, &ctxt, devs, wp,
|
||||
btree_insert_flags,
|
||||
move_device, k)) {
|
||||
/* memory allocation failure, wait for some IO to finish */
|
||||
bch2_move_ctxt_wait_for_io(&ctxt);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (rate)
|
||||
bch2_ratelimit_increment(rate, k.k->size);
|
||||
next:
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_move_ctxt_exit(&ctxt);
|
||||
|
||||
trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved);
|
||||
|
||||
*keys_moved = ctxt.keys_moved;
|
||||
*sectors_moved = ctxt.sectors_moved;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -4,77 +4,31 @@
|
||||
#include "buckets.h"
|
||||
#include "io_types.h"
|
||||
|
||||
enum moving_flag_bitnos {
|
||||
MOVING_FLAG_BITNO_READ = 0,
|
||||
MOVING_FLAG_BITNO_WRITE,
|
||||
};
|
||||
|
||||
#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
|
||||
#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
|
||||
struct bch_read_bio;
|
||||
struct moving_context;
|
||||
|
||||
struct migrate_write {
|
||||
BKEY_PADDED(key);
|
||||
bool promote;
|
||||
bool move;
|
||||
struct bch_extent_ptr move_ptr;
|
||||
struct moving_context *ctxt;
|
||||
|
||||
/* what we read: */
|
||||
struct bch_extent_ptr ptr;
|
||||
u64 offset;
|
||||
|
||||
int move_dev;
|
||||
int btree_insert_flags;
|
||||
struct bch_write_op op;
|
||||
};
|
||||
|
||||
void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
|
||||
struct bch_devs_mask *, struct bkey_s_c,
|
||||
const struct bch_extent_ptr *, unsigned);
|
||||
void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
|
||||
|
||||
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
|
||||
|
||||
struct moving_context {
|
||||
/* Closure for waiting on all reads and writes to complete */
|
||||
struct closure cl;
|
||||
typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
|
||||
|
||||
/* Number and types of errors reported */
|
||||
atomic_t error_count;
|
||||
atomic_t error_flags;
|
||||
|
||||
/* Key and sector moves issued, updated from submission context */
|
||||
u64 keys_moved;
|
||||
u64 sectors_moved;
|
||||
|
||||
/* Rate-limiter counting submitted reads */
|
||||
struct bch_ratelimit *rate;
|
||||
|
||||
/* Try to avoid reading the following device */
|
||||
struct bch_devs_mask avoid;
|
||||
|
||||
struct list_head reads;
|
||||
|
||||
/* Configuration */
|
||||
unsigned max_sectors_in_flight;
|
||||
atomic_t sectors_in_flight;
|
||||
|
||||
wait_queue_head_t wait;
|
||||
};
|
||||
|
||||
struct moving_io {
|
||||
struct list_head list;
|
||||
struct rb_node node;
|
||||
struct closure cl;
|
||||
struct moving_context *ctxt;
|
||||
struct migrate_write write;
|
||||
bool read_completed;
|
||||
|
||||
struct bch_read_bio rbio;
|
||||
/* Must be last since it is variable size */
|
||||
struct bio_vec bi_inline_vecs[0];
|
||||
};
|
||||
|
||||
int bch2_data_move(struct bch_fs *, struct moving_context *,
|
||||
struct bch_devs_mask *, struct bkey_s_c,
|
||||
const struct bch_extent_ptr *);
|
||||
|
||||
int bch2_move_ctxt_wait(struct moving_context *);
|
||||
void bch2_move_ctxt_wait_for_io(struct moving_context *);
|
||||
|
||||
void bch2_move_ctxt_exit(struct moving_context *);
|
||||
void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
|
||||
unsigned);
|
||||
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
|
||||
unsigned, struct bch_devs_mask *,
|
||||
struct write_point_specifier,
|
||||
int, int, move_pred_fn, void *,
|
||||
u64 *, u64 *);
|
||||
|
||||
#endif /* _BCACHEFS_MOVE_H */
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "extents.h"
|
||||
@ -23,137 +24,63 @@
|
||||
#include <linux/sort.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
/* Moving GC - IO loop */
|
||||
/*
|
||||
* We can't use the entire copygc reserve in one iteration of copygc: we may
|
||||
* need the buckets we're freeing up to go back into the copygc reserve to make
|
||||
* forward progress, but if the copygc reserve is full they'll be available for
|
||||
* any allocation - and it's possible that in a given iteration, we free up most
|
||||
* of the buckets we're going to free before we allocate most of the buckets
|
||||
* we're going to allocate.
|
||||
*
|
||||
* If we only use half of the reserve per iteration, then in steady state we'll
|
||||
* always have room in the reserve for the buckets we're going to need in the
|
||||
* next iteration:
|
||||
*/
|
||||
#define COPYGC_BUCKETS_PER_ITER(ca) \
|
||||
((ca)->free[RESERVE_MOVINGGC].size / 2)
|
||||
|
||||
static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
|
||||
/*
|
||||
* Max sectors to move per iteration: Have to take into account internal
|
||||
* fragmentation from the multiple write points for each generation:
|
||||
*/
|
||||
#define COPYGC_SECTORS_PER_ITER(ca) \
|
||||
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
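/*
 * Worked example (illustrative numbers only, not from this patch): with a
 * RESERVE_MOVINGGC freelist sized at 16 buckets and 512-sector buckets,
 * COPYGC_BUCKETS_PER_ITER is 16 / 2 = 8 and COPYGC_SECTORS_PER_ITER is
 * 512 * 8 = 4096 sectors moved per copygc iteration at most.
 */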
|
||||
|
||||
static inline int sectors_used_cmp(copygc_heap *heap,
|
||||
struct copygc_heap_entry l,
|
||||
struct copygc_heap_entry r)
|
||||
{
|
||||
const struct bucket_heap_entry *l = _l;
|
||||
const struct bucket_heap_entry *r = _r;
|
||||
|
||||
if (l->bucket < r->bucket)
|
||||
return -1;
|
||||
if (l->bucket > r->bucket)
|
||||
return 1;
|
||||
return 0;
|
||||
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
|
||||
}
|
||||
|
||||
static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
|
||||
struct bkey_s_c k)
|
||||
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
|
||||
{
|
||||
bucket_heap *h = &ca->copygc_heap;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
const struct copygc_heap_entry *l = _l;
|
||||
const struct copygc_heap_entry *r = _r;
|
||||
|
||||
if (bkey_extent_is_data(k.k) &&
|
||||
(ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
|
||||
ca->dev_idx))) {
|
||||
struct bucket_heap_entry search = {
|
||||
.bucket = PTR_BUCKET_NR(ca, ptr)
|
||||
};
|
||||
|
||||
size_t i = eytzinger0_find(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_idx_cmp, &search);
|
||||
|
||||
if (i < h->used)
|
||||
return ptr;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return (l->offset > r->offset) - (l->offset < r->offset);
|
||||
}
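/*
 * Note on bucket_offset_cmp() above: the (l > r) - (l < r) form is the
 * usual overflow-safe three-way comparison. Subtracting two u64 offsets
 * and truncating the result to int could return the wrong sign; the
 * relational form always yields -1, 0 or 1.
 */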
|
||||
|
||||
static int issue_moving_gc_move(struct bch_dev *ca,
|
||||
struct moving_context *ctxt,
|
||||
struct bkey_s_c k)
|
||||
static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
int ret;
|
||||
struct bch_dev *ca = arg;
|
||||
copygc_heap *h = &ca->copygc_heap;
|
||||
const struct bch_extent_ptr *ptr =
|
||||
bch2_extent_has_device(e, ca->dev_idx);
|
||||
|
||||
ptr = moving_pred(ca, k);
|
||||
if (!ptr) /* We raced - bucket's been reused */
|
||||
return 0;
|
||||
if (ptr) {
|
||||
struct copygc_heap_entry search = { .offset = ptr->offset };
|
||||
|
||||
ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
|
||||
if (!ret)
|
||||
trace_gc_copy(k.k);
|
||||
else
|
||||
trace_moving_gc_alloc_fail(c, k.k->size);
|
||||
return ret;
|
||||
}
|
||||
size_t i = eytzinger0_find_le(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_offset_cmp, &search);
|
||||
|
||||
static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
|
||||
u64 sectors_to_move)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
bucket_heap *h = &ca->copygc_heap;
|
||||
struct moving_context ctxt;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
u64 sectors_not_moved = 0;
|
||||
size_t buckets_not_moved = 0;
|
||||
struct bucket_heap_entry *i;
|
||||
|
||||
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
|
||||
bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
|
||||
BTREE_ITER_PREFETCH);
|
||||
|
||||
while (1) {
|
||||
if (kthread_should_stop())
|
||||
goto out;
|
||||
if (bch2_move_ctxt_wait(&ctxt))
|
||||
goto out;
|
||||
k = bch2_btree_iter_peek(&iter);
|
||||
if (!k.k)
|
||||
break;
|
||||
if (btree_iter_err(k))
|
||||
goto out;
|
||||
|
||||
if (!moving_pred(ca, k))
|
||||
goto next;
|
||||
|
||||
if (issue_moving_gc_move(ca, &ctxt, k)) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
/* memory allocation failure, wait for some IO to finish */
|
||||
bch2_move_ctxt_wait_for_io(&ctxt);
|
||||
continue;
|
||||
}
|
||||
next:
|
||||
bch2_btree_iter_advance_pos(&iter);
|
||||
//bch2_btree_iter_cond_resched(&iter);
|
||||
|
||||
/* unlock before calling moving_context_wait() */
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
cond_resched();
|
||||
return (i >= 0 &&
|
||||
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
|
||||
ptr->gen == h->data[i].mark.gen);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_move_ctxt_exit(&ctxt);
|
||||
trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
|
||||
buckets_to_move);
|
||||
|
||||
/* don't check this if we bailed out early: */
|
||||
for (i = h->data; i < h->data + h->used; i++) {
|
||||
struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);
|
||||
|
||||
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
|
||||
sectors_not_moved += bucket_sectors_used(m);
|
||||
buckets_not_moved++;
|
||||
}
|
||||
}
|
||||
|
||||
if (sectors_not_moved)
|
||||
bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
|
||||
sectors_not_moved, sectors_to_move,
|
||||
buckets_not_moved, buckets_to_move);
|
||||
return;
|
||||
out:
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_move_ctxt_exit(&ctxt);
|
||||
trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
|
||||
buckets_to_move);
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool have_copygc_reserve(struct bch_dev *ca)
|
||||
@ -168,38 +95,17 @@ static bool have_copygc_reserve(struct bch_dev *ca)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int sectors_used_cmp(bucket_heap *heap,
|
||||
struct bucket_heap_entry l,
|
||||
struct bucket_heap_entry r)
|
||||
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
|
||||
}
|
||||
|
||||
static void bch2_moving_gc(struct bch_dev *ca)
|
||||
{
|
||||
struct bch_fs *c = ca->fs;
|
||||
copygc_heap *h = &ca->copygc_heap;
|
||||
struct copygc_heap_entry e, *i;
|
||||
struct bucket *g;
|
||||
u64 sectors_to_move = 0;
|
||||
size_t buckets_to_move, buckets_unused = 0;
|
||||
struct bucket_heap_entry e, *i;
|
||||
int reserve_sectors;
|
||||
u64 keys_moved, sectors_moved;
|
||||
u64 sectors_to_move = 0, sectors_not_moved = 0;
|
||||
u64 buckets_to_move, buckets_not_moved = 0;
|
||||
int ret;
|
||||
|
||||
if (!have_copygc_reserve(ca)) {
|
||||
struct closure cl;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
while (1) {
|
||||
closure_wait(&c->freelist_wait, &cl);
|
||||
if (have_copygc_reserve(ca))
|
||||
break;
|
||||
closure_sync(&cl);
|
||||
}
|
||||
closure_wake_up(&c->freelist_wait);
|
||||
}
|
||||
|
||||
reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
|
||||
|
||||
trace_moving_gc_start(ca);
|
||||
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
|
||||
|
||||
/*
|
||||
* Find buckets with lowest sector counts, skipping completely
|
||||
@ -213,48 +119,73 @@ static void bch2_moving_gc(struct bch_dev *ca)
|
||||
* them:
|
||||
*/
|
||||
down_read(&c->gc_lock);
|
||||
ca->copygc_heap.used = 0;
|
||||
h->used = 0;
|
||||
for_each_bucket(g, ca) {
|
||||
struct bucket_mark m = READ_ONCE(g->mark);
|
||||
struct bucket_heap_entry e = { g - ca->buckets, m };
|
||||
|
||||
if (bucket_unused(m)) {
|
||||
buckets_unused++;
|
||||
continue;
|
||||
}
|
||||
struct copygc_heap_entry e;
|
||||
|
||||
if (m.owned_by_allocator ||
|
||||
m.data_type != BUCKET_DATA)
|
||||
m.data_type != BUCKET_DATA ||
|
||||
!bucket_sectors_used(m) ||
|
||||
bucket_sectors_used(m) >= ca->mi.bucket_size)
|
||||
continue;
|
||||
|
||||
if (bucket_sectors_used(m) >= ca->mi.bucket_size)
|
||||
continue;
|
||||
|
||||
heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
|
||||
e = (struct copygc_heap_entry) {
|
||||
.offset = bucket_to_sector(ca, g - ca->buckets),
|
||||
.mark = m
|
||||
};
|
||||
heap_add_or_replace(h, e, -sectors_used_cmp);
|
||||
}
|
||||
up_read(&c->gc_lock);
|
||||
|
||||
for (i = ca->copygc_heap.data;
|
||||
i < ca->copygc_heap.data + ca->copygc_heap.used;
|
||||
i++)
|
||||
for (i = h->data; i < h->data + h->used; i++)
|
||||
sectors_to_move += bucket_sectors_used(i->mark);
|
||||
|
||||
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
|
||||
BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
|
||||
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
|
||||
sectors_to_move -= bucket_sectors_used(e.mark);
|
||||
}
|
||||
|
||||
buckets_to_move = ca->copygc_heap.used;
|
||||
buckets_to_move = h->used;
|
||||
|
||||
eytzinger0_sort(ca->copygc_heap.data,
|
||||
ca->copygc_heap.used,
|
||||
sizeof(ca->copygc_heap.data[0]),
|
||||
bucket_idx_cmp, NULL);
|
||||
if (!buckets_to_move)
|
||||
return;
|
||||
|
||||
read_moving(ca, buckets_to_move, sectors_to_move);
|
||||
eytzinger0_sort(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_offset_cmp, NULL);
|
||||
|
||||
ret = bch2_move_data(c, &ca->copygc_pd.rate,
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE,
|
||||
&ca->self,
|
||||
writepoint_ptr(&ca->copygc_write_point),
|
||||
BTREE_INSERT_USE_RESERVE,
|
||||
ca->dev_idx,
|
||||
copygc_pred, ca,
|
||||
&keys_moved,
|
||||
			     &sectors_moved);
|
||||
|
||||
for (i = h->data; i < h->data + h->used; i++) {
|
||||
size_t bucket = sector_to_bucket(ca, i->offset);
|
||||
struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark);
|
||||
|
||||
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
|
||||
sectors_not_moved += bucket_sectors_used(m);
|
||||
buckets_not_moved++;
|
||||
}
|
||||
}
|
||||
|
||||
if (sectors_not_moved && !ret)
|
||||
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
|
||||
sectors_not_moved, sectors_to_move,
|
||||
buckets_not_moved, buckets_to_move);
|
||||
|
||||
trace_copygc(ca,
|
||||
sectors_moved, sectors_not_moved,
|
||||
buckets_to_move, buckets_not_moved);
|
||||
}
|
||||
|
||||
static int bch2_moving_gc_thread(void *arg)
|
||||
static int bch2_copygc_thread(void *arg)
|
||||
{
|
||||
struct bch_dev *ca = arg;
|
||||
struct bch_fs *c = ca->fs;
|
||||
@ -273,7 +204,7 @@ static int bch2_moving_gc_thread(void *arg)
|
||||
* don't start copygc until less than half the gc reserve is
|
||||
* available:
|
||||
*/
|
||||
available = dev_buckets_available(ca);
|
||||
available = dev_buckets_available(c, ca);
|
||||
want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
|
||||
c->opts.gc_reserve_percent, 200);
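		/*
		 * Dividing by 200 is "percent / 100, halved", i.e. half the
		 * GC reserve, matching the comment above. Illustrative
		 * numbers (not from this patch): 10000 usable buckets with
		 * gc_reserve_percent = 8 gives want = 10000 * 8 / 200 = 400,
		 * so copygc runs once fewer than 400 buckets are available.
		 */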
|
||||
if (available > want) {
|
||||
@ -283,46 +214,46 @@ static int bch2_moving_gc_thread(void *arg)
|
||||
continue;
|
||||
}
|
||||
|
||||
bch2_moving_gc(ca);
|
||||
bch2_copygc(c, ca);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_moving_gc_stop(struct bch_dev *ca)
|
||||
void bch2_copygc_stop(struct bch_dev *ca)
|
||||
{
|
||||
ca->moving_gc_pd.rate.rate = UINT_MAX;
|
||||
bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
|
||||
ca->copygc_pd.rate.rate = UINT_MAX;
|
||||
bch2_ratelimit_reset(&ca->copygc_pd.rate);
|
||||
|
||||
if (ca->moving_gc_read)
|
||||
kthread_stop(ca->moving_gc_read);
|
||||
ca->moving_gc_read = NULL;
|
||||
if (ca->copygc_thread)
|
||||
kthread_stop(ca->copygc_thread);
|
||||
ca->copygc_thread = NULL;
|
||||
}
|
||||
|
||||
int bch2_moving_gc_start(struct bch_dev *ca)
|
||||
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct task_struct *t;
|
||||
|
||||
BUG_ON(ca->moving_gc_read);
|
||||
BUG_ON(ca->copygc_thread);
|
||||
|
||||
if (ca->fs->opts.nochanges)
|
||||
if (c->opts.nochanges)
|
||||
return 0;
|
||||
|
||||
if (bch2_fs_init_fault("moving_gc_start"))
|
||||
if (bch2_fs_init_fault("copygc_start"))
|
||||
return -ENOMEM;
|
||||
|
||||
t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read");
|
||||
t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
|
||||
if (IS_ERR(t))
|
||||
return PTR_ERR(t);
|
||||
|
||||
ca->moving_gc_read = t;
|
||||
wake_up_process(ca->moving_gc_read);
|
||||
ca->copygc_thread = t;
|
||||
wake_up_process(ca->copygc_thread);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_dev_moving_gc_init(struct bch_dev *ca)
|
||||
void bch2_dev_copygc_init(struct bch_dev *ca)
|
||||
{
|
||||
bch2_pd_controller_init(&ca->moving_gc_pd);
|
||||
ca->moving_gc_pd.d_term = 0;
|
||||
bch2_pd_controller_init(&ca->copygc_pd);
|
||||
ca->copygc_pd.d_term = 0;
|
||||
}
|
||||
|
@ -1,30 +1,8 @@
|
||||
#ifndef _BCACHEFS_MOVINGGC_H
|
||||
#define _BCACHEFS_MOVINGGC_H
|
||||
|
||||
/*
|
||||
* We can't use the entire copygc reserve in one iteration of copygc: we may
|
||||
* need the buckets we're freeing up to go back into the copygc reserve to make
|
||||
* forward progress, but if the copygc reserve is full they'll be available for
|
||||
* any allocation - and it's possible that in a given iteration, we free up most
|
||||
* of the buckets we're going to free before we allocate most of the buckets
|
||||
* we're going to allocate.
|
||||
*
|
||||
* If we only use half of the reserve per iteration, then in steady state we'll
|
||||
* always have room in the reserve for the buckets we're going to need in the
|
||||
* next iteration:
|
||||
*/
|
||||
#define COPYGC_BUCKETS_PER_ITER(ca) \
|
||||
((ca)->free[RESERVE_MOVINGGC].size / 2)
|
||||
|
||||
/*
|
||||
* Max sectors to move per iteration: Have to take into account internal
|
||||
* fragmentation from the multiple write points for each generation:
|
||||
*/
|
||||
#define COPYGC_SECTORS_PER_ITER(ca) \
|
||||
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
|
||||
|
||||
void bch2_moving_gc_stop(struct bch_dev *);
|
||||
int bch2_moving_gc_start(struct bch_dev *);
|
||||
void bch2_dev_moving_gc_init(struct bch_dev *);
|
||||
void bch2_copygc_stop(struct bch_dev *);
|
||||
int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_dev_copygc_init(struct bch_dev *);
|
||||
|
||||
#endif /* _BCACHEFS_MOVINGGC_H */
|
||||
|
@ -425,6 +425,11 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
	if (err)
		return err;

	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
	    bch2_sb_get_crypt(sb) &&
	    BCH_SB_INITIALIZED(sb))
		return "Incompatible extent nonces";

	sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);

	return NULL;

@ -20,6 +20,7 @@
|
||||
#include "debug.h"
|
||||
#include "error.h"
|
||||
#include "fs.h"
|
||||
#include "fs-io.h"
|
||||
#include "fsck.h"
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
@ -209,7 +210,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
|
||||
bch2_tiering_stop(c);
|
||||
|
||||
for_each_member_device(ca, c, i)
|
||||
bch2_moving_gc_stop(ca);
|
||||
bch2_copygc_stop(ca);
|
||||
|
||||
bch2_gc_thread_stop(c);
|
||||
|
||||
@ -258,12 +259,8 @@ void bch2_fs_read_only(struct bch_fs *c)
|
||||
*/
|
||||
percpu_ref_kill(&c->writes);
|
||||
|
||||
del_timer(&c->foreground_write_wakeup);
|
||||
cancel_delayed_work(&c->pd_controllers_update);
|
||||
|
||||
c->foreground_write_pd.rate.rate = UINT_MAX;
|
||||
bch2_wake_delayed_writes((unsigned long) c);
|
||||
|
||||
/*
|
||||
* If we're not doing an emergency shutdown, we want to wait on
|
||||
* outstanding writes to complete so they don't see spurious errors due
|
||||
@ -348,9 +345,9 @@ const char *bch2_fs_read_write(struct bch_fs *c)
|
||||
if (bch2_gc_thread_start(c))
|
||||
goto err;
|
||||
|
||||
err = "error starting moving GC thread";
|
||||
err = "error starting copygc thread";
|
||||
for_each_rw_member(ca, c, i)
|
||||
if (bch2_moving_gc_start(ca)) {
|
||||
if (bch2_copygc_start(c, ca)) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
goto err;
|
||||
}
|
||||
@ -375,6 +372,7 @@ err:
|
||||
|
||||
static void bch2_fs_free(struct bch_fs *c)
|
||||
{
|
||||
bch2_fs_fsio_exit(c);
|
||||
bch2_fs_encryption_exit(c);
|
||||
bch2_fs_btree_cache_exit(c);
|
||||
bch2_fs_journal_exit(&c->journal);
|
||||
@ -411,7 +409,6 @@ static void bch2_fs_exit(struct bch_fs *c)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
del_timer_sync(&c->foreground_write_wakeup);
|
||||
cancel_delayed_work_sync(&c->pd_controllers_update);
|
||||
cancel_work_sync(&c->read_only_work);
|
||||
|
||||
@ -535,8 +532,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
c->tiering_enabled = 1;
|
||||
c->tiering_percent = 10;
|
||||
|
||||
c->foreground_target_percent = 20;
|
||||
|
||||
c->journal.write_time = &c->journal_write_time;
|
||||
c->journal.delay_time = &c->journal_delay_time;
|
||||
c->journal.blocked_time = &c->journal_blocked_time;
|
||||
@ -600,7 +595,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
bch2_fs_btree_cache_init(c) ||
|
||||
bch2_fs_encryption_init(c) ||
|
||||
bch2_fs_compress_init(c) ||
|
||||
bch2_check_set_has_compressed_data(c, c->opts.compression))
|
||||
bch2_check_set_has_compressed_data(c, c->opts.compression) ||
|
||||
bch2_fs_fsio_init(c))
|
||||
goto err;
|
||||
|
||||
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
|
||||
@ -1105,8 +1101,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
|
||||
ca->dev_idx = dev_idx;
|
||||
__set_bit(ca->dev_idx, ca->self.d);
|
||||
|
||||
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
|
||||
|
||||
spin_lock_init(&ca->freelist_lock);
|
||||
bch2_dev_moving_gc_init(ca);
|
||||
bch2_dev_copygc_init(ca);
|
||||
|
||||
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
|
||||
|
||||
@ -1224,10 +1222,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
|
||||
if (bch2_dev_sysfs_online(ca))
|
||||
pr_warn("error creating sysfs objects");
|
||||
|
||||
lg_local_lock(&c->usage_lock);
|
||||
if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
|
||||
bch2_mark_dev_metadata(c, ca);
|
||||
lg_local_unlock(&c->usage_lock);
|
||||
bch2_mark_dev_superblock(c, ca, 0);
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_RW)
|
||||
bch2_dev_allocator_add(c, ca);
|
||||
@ -1324,7 +1319,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
|
||||
|
||||
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
bch2_moving_gc_stop(ca);
|
||||
bch2_copygc_stop(ca);
|
||||
|
||||
/*
|
||||
* This stops new data writes (e.g. to existing open data
|
||||
@ -1347,8 +1342,8 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
|
||||
if (bch2_dev_allocator_start(ca))
|
||||
return "error starting allocator thread";
|
||||
|
||||
if (bch2_moving_gc_start(ca))
|
||||
return "error starting moving GC thread";
|
||||
if (bch2_copygc_start(c, ca))
|
||||
return "error starting copygc thread";
|
||||
|
||||
if (bch2_tiering_start(c))
|
||||
return "error starting tiering thread";
|
||||
|
@ -35,6 +35,30 @@ static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
|
||||
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
|
||||
unsigned dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < devs.nr; i++)
|
||||
if (devs.devs[i] == dev)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
|
||||
unsigned dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < devs->nr; i++)
|
||||
if (devs->devs[i] == dev) {
|
||||
array_remove_item(devs->devs, devs->nr, i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
|
||||
struct bch_devs_mask *mask)
|
||||
{
|
||||
|
@ -13,4 +13,33 @@ struct bch_devs_mask {
|
||||
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
|
||||
};
|
||||
|
||||
struct bch_devs_list {
|
||||
u8 nr;
|
||||
u8 devs[BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
struct bch_member_cpu {
|
||||
u64 nbuckets; /* device size */
|
||||
u16 first_bucket; /* index of first bucket used */
|
||||
u16 bucket_size; /* sectors */
|
||||
u8 state;
|
||||
u8 tier;
|
||||
u8 replacement;
|
||||
u8 discard;
|
||||
u8 data_allowed;
|
||||
u8 valid;
|
||||
};
|
||||
|
||||
struct bch_replicas_cpu_entry {
|
||||
u8 data_type;
|
||||
u8 devs[BCH_SB_MEMBERS_MAX / 8];
|
||||
};
|
||||
|
||||
struct bch_replicas_cpu {
|
||||
struct rcu_head rcu;
|
||||
unsigned nr;
|
||||
unsigned entry_size;
|
||||
struct bch_replicas_cpu_entry entries[];
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_SUPER_TYPES_H */
|
||||
|
@ -161,8 +161,11 @@ read_attribute(meta_buckets);
|
||||
read_attribute(alloc_buckets);
|
||||
read_attribute(has_data);
|
||||
read_attribute(alloc_debug);
|
||||
write_attribute(wake_allocator);
|
||||
|
||||
read_attribute(read_realloc_races);
|
||||
read_attribute(extent_migrate_done);
|
||||
read_attribute(extent_migrate_raced);
|
||||
|
||||
rw_attribute(journal_write_delay_ms);
|
||||
rw_attribute(journal_reclaim_delay_ms);
|
||||
@ -170,7 +173,6 @@ rw_attribute(journal_reclaim_delay_ms);
|
||||
rw_attribute(discard);
|
||||
rw_attribute(cache_replacement_policy);
|
||||
|
||||
rw_attribute(foreground_write_ratelimit_enabled);
|
||||
rw_attribute(copy_gc_enabled);
|
||||
sysfs_pd_controller_attribute(copy_gc);
|
||||
|
||||
@ -179,12 +181,9 @@ rw_attribute(tiering_enabled);
|
||||
rw_attribute(tiering_percent);
|
||||
sysfs_pd_controller_attribute(tiering);
|
||||
|
||||
sysfs_pd_controller_attribute(foreground_write);
|
||||
|
||||
rw_attribute(pd_controllers_update_seconds);
|
||||
|
||||
rw_attribute(foreground_target_percent);
|
||||
|
||||
read_attribute(meta_replicas_have);
|
||||
read_attribute(data_replicas_have);
|
||||
|
||||
@ -272,18 +271,18 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
|
||||
if (k.k->type == BCH_EXTENT) {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
const union bch_extent_crc *crc;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
|
||||
if (crc.compression_type == BCH_COMPRESSION_NONE) {
|
||||
nr_uncompressed_extents++;
|
||||
uncompressed_sectors += e.k->size;
|
||||
} else {
|
||||
nr_compressed_extents++;
|
||||
compressed_sectors_compressed +=
|
||||
crc_compressed_size(e.k, crc);
|
||||
crc.compressed_size;
|
||||
compressed_sectors_uncompressed +=
|
||||
crc_uncompressed_size(e.k, crc);
|
||||
crc.uncompressed_size;
|
||||
}
|
||||
|
||||
/* only looking at the first ptr */
|
||||
@ -323,17 +322,17 @@ SHOW(bch2_fs)
|
||||
|
||||
sysfs_print(read_realloc_races,
|
||||
atomic_long_read(&c->read_realloc_races));
|
||||
sysfs_print(extent_migrate_done,
|
||||
atomic_long_read(&c->extent_migrate_done));
|
||||
sysfs_print(extent_migrate_raced,
|
||||
atomic_long_read(&c->extent_migrate_raced));
|
||||
|
||||
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
|
||||
|
||||
sysfs_printf(foreground_write_ratelimit_enabled, "%i",
|
||||
c->foreground_write_ratelimit_enabled);
|
||||
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
|
||||
sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
|
||||
|
||||
sysfs_print(pd_controllers_update_seconds,
|
||||
c->pd_controllers_update_seconds);
|
||||
sysfs_print(foreground_target_percent, c->foreground_target_percent);
|
||||
|
||||
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
|
||||
sysfs_print(tiering_percent, c->tiering_percent);
|
||||
@ -371,9 +370,6 @@ STORE(__bch2_fs)
|
||||
sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
|
||||
sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
|
||||
|
||||
sysfs_strtoul(foreground_write_ratelimit_enabled,
|
||||
c->foreground_write_ratelimit_enabled);

if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
@ -389,8 +385,8 @@ STORE(__bch2_fs)
?: (ssize_t) size;

for_each_member_device(ca, c, i)
if (ca->moving_gc_read)
wake_up_process(ca->moving_gc_read);
if (ca->copygc_thread)
wake_up_process(ca->copygc_thread);
return ret;
}

@ -402,11 +398,8 @@ STORE(__bch2_fs)
return ret;
}

sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);

sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);

sysfs_strtoul(tiering_percent, c->tiering_percent);
sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
@ -466,7 +459,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,

&sysfs_foreground_target_percent,
&sysfs_tiering_percent,

&sysfs_compression_stats,
@ -494,17 +486,17 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_pins,

&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,

&sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_prune_cache,

&sysfs_foreground_write_ratelimit_enabled,
&sysfs_copy_gc_enabled,
&sysfs_tiering_enabled,
sysfs_pd_controller_files(tiering),
sysfs_pd_controller_files(foreground_write),
&sysfs_internal_uuid,

#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -710,17 +702,23 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);

return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
"alloc: %llu/%llu\n"
"meta: %llu/%llu\n"
"dirty: %llu/%llu\n"
"available: %llu/%llu\n"
"buckets:\n"
" capacity: %llu\n"
" alloc: %llu\n"
" meta: %llu\n"
" dirty: %llu\n"
" available: %llu\n"
"sectors:\n"
" meta: %llu\n"
" dirty: %llu\n"
" cached: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
@ -728,10 +726,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket,
__dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_alloc,
stats.buckets[S_META],
stats.buckets[S_DIRTY],
__dev_buckets_available(ca, stats),
stats.sectors[S_META],
stats.sectors[S_DIRTY],
stats.sectors_cached,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty");
@ -769,7 +771,7 @@ SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
char *out = buf, *end = buf + PAGE_SIZE;

sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@ -788,8 +790,8 @@ SHOW(bch2_dev)
sysfs_print(cached_buckets, stats.buckets_cached);
sysfs_print(meta_buckets, stats.buckets[S_META]);
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));
sysfs_print(available_buckets, __dev_buckets_available(ca, stats));
sysfs_print(free_buckets, __dev_buckets_free(ca, stats));

if (attr == &sysfs_has_data) {
out += bch2_scnprint_flag_list(out, end - out,
@ -799,7 +801,7 @@ SHOW(bch2_dev)
return out - buf;
}

sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);

if (attr == &sysfs_cache_replacement_policy) {
out += bch2_scnprint_string_list(out, end - out,
@ -843,7 +845,7 @@ STORE(bch2_dev)
struct bch_fs *c = ca->fs;
struct bch_member *mi;

sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);

if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@ -899,6 +901,9 @@ STORE(bch2_dev)
bch2_tiering_start(c);
}

if (attr == &sysfs_wake_allocator)
bch2_wake_allocator(ca);

return size;
}
SYSFS_OPS(bch2_dev);
@ -942,6 +947,7 @@ struct attribute *bch2_dev_files[] = {

/* debug: */
&sysfs_alloc_debug,
&sysfs_wake_allocator,

sysfs_pd_controller_files(copy_gc),
NULL
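The store hooks above all follow the same contract: parse the written buffer, then either return a negative errno or report the whole write as consumed (the `... ?: (ssize_t) size` pattern). A minimal userspace sketch of that contract, using the C library's strtoul() rather than the kernel's strtoul_safe()/sysfs_strtoul() helpers; demo_store() and its arguments are illustrative only, not part of the commit:

/*
 * Illustrative sketch only: parse the whole sysfs write or fail, and on
 * success claim the full buffer size as consumed.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long demo_store(const char *buf, size_t size, unsigned long *var)
{
	char *end;
	unsigned long v;

	errno = 0;
	v = strtoul(buf, &end, 10);
	if (errno || end == buf)
		return -EINVAL;		/* bad input: report an error */

	*var = v;
	return (long) size;		/* success: claim the whole write */
}

int main(void)
{
	unsigned long setting = 0;
	const char *input = "42\n";

	long ret = demo_store(input, strlen(input), &setting);
	printf("ret=%ld setting=%lu\n", ret, setting);
	return 0;
}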
@ -15,105 +15,23 @@
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>

struct tiering_state {
struct bch_tier *tier;
unsigned sectors;
unsigned stripe_size;
unsigned dev_idx;
struct bch_dev *ca;
};

static bool tiering_pred(struct bch_fs *c,
struct bch_tier *tier,
struct bkey_s_c k)
static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
{
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
unsigned replicas = 0;
struct bch_tier *tier = arg;
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
const struct bch_extent_ptr *ptr;
unsigned replicas = 0;

/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return false;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return false;

extent_for_each_ptr(e, ptr)
if (c->devs[ptr->dev]->mi.tier >= tier->idx)
replicas++;
extent_for_each_ptr(e, ptr)
if (c->devs[ptr->dev]->mi.tier >= tier->idx)
replicas++;

return replicas < c->opts.data_replicas;
}

return false;
}

static int issue_tiering_move(struct bch_fs *c,
struct bch_tier *tier,
struct moving_context *ctxt,
struct bkey_s_c k)
{
int ret;

ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL);
if (!ret)
trace_tiering_copy(k.k);
else
trace_tiering_alloc_fail(c, k.k->size);

return ret;
}

/**
* tiering_next_cache - issue a move to write an extent to the next cache
* device in round robin order
*/
static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
{
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
unsigned nr_devices = dev_mask_nr(&tier->devs);
int ret;

if (!nr_devices)
return 0;

trace_tiering_start(c);

bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH);

while (!kthread_should_stop() &&
!bch2_move_ctxt_wait(&ctxt) &&
(k = bch2_btree_iter_peek(&iter)).k &&
!btree_iter_err(k)) {
if (!tiering_pred(c, tier, k))
goto next;

ret = issue_tiering_move(c, tier, &ctxt, k);
if (ret) {
bch2_btree_iter_unlock(&iter);

/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
next:
bch2_btree_iter_advance_pos(&iter);
//bch2_btree_iter_cond_resched(&iter);

/* unlock before calling moving_context_wait() */
bch2_btree_iter_unlock(&iter);
cond_resched();
}

bch2_btree_iter_unlock(&iter);
bch2_move_ctxt_exit(&ctxt);
trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);

return ctxt.sectors_moved;
return replicas < c->opts.data_replicas;
}

static int bch2_tiering_thread(void *arg)
@ -122,15 +40,15 @@ static int bch2_tiering_thread(void *arg)
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev *ca;
u64 tier_capacity, available_sectors;
u64 tier_capacity, available_sectors, keys_moved, sectors_moved;
unsigned long last;
unsigned i;
unsigned i, nr_devices;

set_freezable();

while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
dev_mask_nr(&tier->devs)))
(nr_devices = dev_mask_nr(&tier->devs))))
break;

while (1) {
@ -151,7 +69,7 @@ static int bch2_tiering_thread(void *arg)
ca->mi.first_bucket);
available_sectors +=
bucket_to_sector(ca,
dev_buckets_available(ca));
dev_buckets_available(c, ca));
}
rcu_read_unlock();
}
@ -167,7 +85,15 @@ static int bch2_tiering_thread(void *arg)
return 0;
}

read_tiering(c, tier);
bch2_move_data(c, &tier->pd.rate,
SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
&tier->devs,
writepoint_ptr(&tier->wp),
0,
-1,
tiering_pred, tier,
&keys_moved,
&sectors_moved);
}

return 0;
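The tier.c rewrite above drops the open-coded read_tiering() loop and instead hands a predicate plus an opaque context (tiering_pred, tier) to the generic mover, bch2_move_data(). A standalone sketch of that callback-with-void*-context idiom; every name in it (move_data, want_second_copy, move_ctx, item) is a stand-in for illustration, not bcachefs API:

/*
 * Illustrative sketch only: a generic walker that asks a caller-supplied
 * predicate, with an opaque argument, whether each item should be moved.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct item { int copies_on_target; };

typedef bool (*move_pred_fn)(void *arg, const struct item *it);

struct move_ctx { int wanted_copies; };

/* Generic walker: counts every item the caller's predicate accepts. */
static unsigned move_data(const struct item *items, size_t nr,
			  move_pred_fn pred, void *arg)
{
	unsigned moved = 0;

	for (size_t i = 0; i < nr; i++)
		if (pred(arg, &items[i]))
			moved++;	/* a real mover would issue the copy here */
	return moved;
}

/* Policy lives entirely in the callback, as with tiering_pred above. */
static bool want_second_copy(void *arg, const struct item *it)
{
	struct move_ctx *ctx = arg;

	return it->copies_on_target < ctx->wanted_copies;
}

int main(void)
{
	struct item items[] = { { 0 }, { 1 }, { 2 } };
	struct move_ctx ctx = { .wanted_copies = 2 };

	printf("would move %u items\n",
	       move_data(items, 3, want_second_copy, &ctx));
	return 0;
}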
@ -291,13 +291,15 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)

int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;

while (1) {
u64 delay = bch2_ratelimit_delay(d);

if (delay)
set_current_state(TASK_INTERRUPTIBLE);

if (kthread_should_stop())
if (kthread && kthread_should_stop())
return 1;

if (!delay)
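The hunk above only consults kthread_should_stop() when the current task really is a kthread (PF_KTHREAD), so the wait helper can also be called from ordinary task context. A tiny sketch of that guard with stand-in types; nothing below is a kernel definition:

/*
 * Illustrative sketch only: check the caller's context before using a
 * facility that only makes sense for kthreads.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_PF_KTHREAD 0x1	/* stand-in for PF_KTHREAD */

struct demo_task { unsigned flags; bool stop_requested; };

static bool demo_should_stop(const struct demo_task *t)
{
	/* Safe for any task: non-kthreads simply never report "stop". */
	bool is_kthread = (t->flags & DEMO_PF_KTHREAD) != 0;

	return is_kthread && t->stop_requested;
}

int main(void)
{
	struct demo_task user = { .flags = 0,               .stop_requested = true };
	struct demo_task kthr = { .flags = DEMO_PF_KTHREAD, .stop_requested = true };

	printf("user task stop: %d, kthread stop: %d\n",
	       demo_should_stop(&user), demo_should_stop(&kthr));
	return 0;
}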
@ -434,8 +436,11 @@ size_t bch2_rand_range(size_t max)
{
size_t rand;

if (!max)
return 0;

do {
get_random_bytes(&rand, sizeof(rand));
rand = get_random_long();
rand &= roundup_pow_of_two(max) - 1;
} while (rand >= max);
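bch2_rand_range() above draws a random word, masks it down to the next power of two at or above max, and retries until the value lands below max, which keeps the result uniform over [0, max). A self-contained sketch of the same rejection-sampling loop; xorshift64() merely stands in for get_random_long() and is not the kernel's generator:

/*
 * Illustrative sketch only: uniform value in [0, max) by masking to a
 * power-of-two range and rejecting out-of-range draws.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t xorshift64(uint64_t *state)
{
	uint64_t x = *state;

	x ^= x << 13;
	x ^= x >> 7;
	x ^= x << 17;
	return *state = x;
}

/* Smallest power of two >= v (v > 0), like roundup_pow_of_two(). */
static uint64_t roundup_pow2(uint64_t v)
{
	uint64_t p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

static uint64_t rand_range(uint64_t *state, uint64_t max)
{
	uint64_t r;

	if (!max)
		return 0;

	do {
		r = xorshift64(state);
		r &= roundup_pow2(max) - 1;	/* cheap mask instead of modulo */
	} while (r >= max);			/* reject to stay unbiased */

	return r;
}

int main(void)
{
	uint64_t state = 88172645463325252ULL;

	for (int i = 0; i < 5; i++)
		printf("%llu ", (unsigned long long) rand_range(&state, 10));
	printf("\n");
	return 0;
}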
@ -642,3 +647,129 @@ void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)

return vpmalloc(size, gfp_mask);
}

#if 0
void eytzinger1_test(void)
{
unsigned inorder, eytz, size;

pr_info("1 based eytzinger test:");

for (size = 2;
size < 65536;
size++) {
unsigned extra = eytzinger1_extra(size);

if (!(size % 4096))
pr_info("tree size %u", size);

BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));

BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);

inorder = 1;
eytzinger1_for_each(eytz, size) {
BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
BUG_ON(eytz != eytzinger1_last(size) &&
eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);

inorder++;
}
}
}

void eytzinger0_test(void)
{

unsigned inorder, eytz, size;

pr_info("0 based eytzinger test:");

for (size = 1;
size < 65536;
size++) {
unsigned extra = eytzinger0_extra(size);

if (!(size % 4096))
pr_info("tree size %u", size);

BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));

BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);

inorder = 0;
eytzinger0_for_each(eytz, size) {
BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
BUG_ON(eytz != eytzinger0_last(size) &&
eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);

inorder++;
}
}
}

static inline int cmp_u16(const void *_l, const void *_r, size_t size)
{
const u16 *l = _l, *r = _r;

return (*l > *r) - (*r - *l);
}
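Comparators of this shape are conventionally written as the branch-free three-way compare (a > b) - (a < b), which yields -1, 0 or +1 and avoids the pitfalls of plain subtraction. A minimal sketch of that idiom; cmp_u16_demo() is a stand-in for illustration, not the helper above:

/*
 * Illustrative sketch only: branch-free three-way compare for unsigned
 * values, returning -1, 0 or +1.
 */
#include <stdint.h>
#include <stdio.h>

static int cmp_u16_demo(const void *_l, const void *_r)
{
	const uint16_t *l = _l, *r = _r;

	return (*l > *r) - (*l < *r);
}

int main(void)
{
	uint16_t a = 3, b = 40000;

	printf("%d %d %d\n",
	       cmp_u16_demo(&a, &b),	/* -1 */
	       cmp_u16_demo(&a, &a),	/*  0 */
	       cmp_u16_demo(&b, &a));	/* +1 */
	return 0;
}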
static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
{
int i, c1 = -1, c2 = -1;
ssize_t r;

r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r >= 0)
c1 = test_array[r];

for (i = 0; i < nr; i++)
if (test_array[i] <= search && test_array[i] > c2)
c2 = test_array[i];

if (c1 != c2) {
eytzinger0_for_each(i, nr)
pr_info("[%3u] = %12u", i, test_array[i]);
pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
i, r, c1, c2);
}
}

void eytzinger0_find_test(void)
{
unsigned i, nr, allocated = 1 << 12;
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);

for (nr = 1; nr < allocated; nr++) {
pr_info("testing %u elems", nr);

get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);

/* verify array is sorted correctly: */
eytzinger0_for_each(i, nr)
BUG_ON(i != eytzinger0_last(nr) &&
test_array[i] > test_array[eytzinger0_next(i, nr)]);

for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);

for (i = 0; i < nr; i++) {
eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
eytzinger0_find_test_val(test_array, nr, test_array[i]);
eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
}
}

kfree(test_array);
}
#endif
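The tests above exercise the eytzinger (BFS-order) array layout: in the 1-based variant the children of node k sit at 2k and 2k+1, and an in-order walk of that implicit tree visits the elements in sorted order. A standalone sketch of building and walking such a layout; the helpers here are local to the sketch, not the kernel's eytzinger1_*() functions:

/*
 * Illustrative sketch only: fill a 1-based eytzinger array from sorted
 * input, then verify via an in-order walk.
 */
#include <stdio.h>

#define N 10

/* Recursively place sorted[] into eytz[] (1-based) in BFS order. */
static int eytz_fill(const int *sorted, int *eytz, int n, int i, int k)
{
	if (k <= n) {
		i = eytz_fill(sorted, eytz, n, i, 2 * k);
		eytz[k] = sorted[i++];
		i = eytz_fill(sorted, eytz, n, i, 2 * k + 1);
	}
	return i;
}

/* In-order walk of the implicit tree: prints the sorted sequence. */
static void eytz_inorder(const int *eytz, int n, int k)
{
	if (k <= n) {
		eytz_inorder(eytz, n, 2 * k);
		printf("%d ", eytz[k]);
		eytz_inorder(eytz, n, 2 * k + 1);
	}
}

int main(void)
{
	int sorted[N] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
	int eytz[N + 1];

	eytz_fill(sorted, eytz, N, 0, 1);

	printf("eytzinger:");
	for (int k = 1; k <= N; k++)
		printf(" %d", eytz[k]);
	printf("\nin-order:  ");
	eytz_inorder(eytz, N, 1);
	printf("\n");
	return 0;
}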
@ -789,4 +789,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));

/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos)))

#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
__array_insert_item(_array, _nr, _pos); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)

#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)

#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)

#endif /* _BCACHEFS_UTIL_H */
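The new util.h helpers shift the tail of a plain C array with memmove() and adjust the caller's element count in place. A small usage sketch, with the macros copied verbatim from the hunk above so it compiles on its own in userspace:

/* Illustrative usage sketch of array_insert_item()/array_remove_item(). */
#include <stdio.h>
#include <string.h>

#define __array_insert_item(_array, _nr, _pos)				\
	memmove(&(_array)[(_pos) + 1],					\
		&(_array)[(_pos)],					\
		sizeof((_array)[0]) * ((_nr) - (_pos)))

#define array_insert_item(_array, _nr, _pos, _new_item)			\
do {									\
	__array_insert_item(_array, _nr, _pos);				\
	(_nr)++;							\
	(_array)[(_pos)] = (_new_item);					\
} while (0)

#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
do {									\
	(_nr) -= (_nr_to_remove);					\
	memmove(&(_array)[(_pos)],					\
		&(_array)[(_pos) + (_nr_to_remove)],			\
		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
} while (0)

#define array_remove_item(_array, _nr, _pos)				\
	array_remove_items(_array, _nr, _pos, 1)

int main(void)
{
	int a[8] = { 1, 2, 4, 5 };
	size_t nr = 4;

	array_insert_item(a, nr, 2, 3);		/* a = 1 2 3 4 5 */
	array_remove_item(a, nr, 0);		/* a = 2 3 4 5   */

	for (size_t i = 0; i < nr; i++)
		printf("%d ", a[i]);
	printf("\n");
	return 0;
}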