mirror of https://github.com/koverstreet/bcachefs-tools.git
synced 2025-02-23 00:00:02 +03:00

Update bcachefs sources to 0906b1fb49 bcachefs: fixes for 32 bit/big endian machines

parent 800408be11
commit ff86d47221

@@ -1 +1 @@
-ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
+0906b1fb492e8e84f563b192fd8f458af1c1d420
@@ -36,10 +36,12 @@ static void usage(void)
" fsck Check an existing filesystem for errors\n"
"\n"
"Startup/shutdown, assembly of multi device filesystems:\n"
#if 0
" assemble Assemble an existing multi device filesystem\n"
" incremental Incrementally assemble an existing multi device filesystem\n"
" run Start a partially assembled filesystem\n"
" stop Stop a running filesystem\n"
#endif
"\n"
"Commands for managing a running filesystem:\n"
" fs usage Show disk usage\n"

@@ -150,6 +152,7 @@ int main(int argc, char *argv[])
if (!strcmp(cmd, "fsck"))
return cmd_fsck(argc, argv);

#if 0
if (!strcmp(cmd, "assemble"))
return cmd_assemble(argc, argv);
if (!strcmp(cmd, "incremental"))

@@ -158,6 +161,7 @@ int main(int argc, char *argv[])
return cmd_run(argc, argv);
if (!strcmp(cmd, "stop"))
return cmd_stop(argc, argv);
#endif

if (!strcmp(cmd, "fs"))
return fs_cmds(argc, argv);
@@ -11,6 +11,7 @@
#include "cmds.h"
#include "libbcachefs.h"

#if 0
int cmd_assemble(int argc, char *argv[])
{
unsigned nr_devs = argc - 1;

@@ -26,7 +27,7 @@ int cmd_assemble(int argc, char *argv[])

unsigned i;
for (i = 0; i < nr_devs; i++)
assemble->devs[i] = (__u64) argv[i + 1];
assemble->devs[i] = (unsigned long) argv[i + 1];

xioctl(bcachectl_open(), BCH_IOCTL_ASSEMBLE, assemble);
return 0;

@@ -38,9 +39,10 @@ int cmd_incremental(int argc, char *argv[])
die("Please supply exactly one device");

struct bch_ioctl_incremental incremental = {
.dev = (__u64) argv[1],
.dev = (unsigned long) argv[1],
};

xioctl(bcachectl_open(), BCH_IOCTL_INCREMENTAL, &incremental);
return 0;
}
#endif
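The (__u64) to (unsigned long) cast change above is presumably part of the 32 bit fixes: casting a pointer straight to __u64 can provoke "cast from pointer to integer of different size" warnings on a 32 bit build, while unsigned long is always pointer sized and can then be widened safely. A standalone sketch of the difference, not part of the commit:

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	const char *path = "/dev/sda1";

	/* On a 32 bit build this direct cast may warn: 32 bit pointer -> 64 bit integer. */
	/* u64 bad = (u64) path; */

	/* Casting through unsigned long (always pointer sized) compiles cleanly on both. */
	u64 ok = (unsigned long) path;

	printf("%llx\n", ok);
	return 0;
}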
@@ -10,6 +10,7 @@

#include "libbcachefs/bcachefs.h"
#include "libbcachefs/alloc.h"
#include "libbcachefs/bset.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/buckets.h"

@@ -15,6 +15,7 @@
#include "cmds.h"
#include "libbcachefs.h"

#if 0
int cmd_run(int argc, char *argv[])
{
return 0;

@@ -29,3 +30,4 @@ int cmd_stop(int argc, char *argv[])
xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
return 0;
}
#endif
cmds.h
@@ -12,10 +12,12 @@
int cmd_format(int argc, char *argv[]);
int cmd_show_super(int argc, char *argv[]);

#if 0
int cmd_assemble(int argc, char *argv[]);
int cmd_incremental(int argc, char *argv[]);
int cmd_run(int argc, char *argv[]);
int cmd_stop(int argc, char *argv[]);
#endif

int cmd_fs_usage(int argc, char *argv[]);
@@ -6,27 +6,22 @@

struct timer_list {
unsigned long expires;
void (*function)(unsigned long);
unsigned long data;
void (*function)(struct timer_list *timer);
bool pending;
};

static inline void init_timer(struct timer_list *timer)
static inline void timer_setup(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags)
{
memset(timer, 0, sizeof(*timer));
timer->function = func;
}

#define __init_timer(_timer, _flags) init_timer(_timer)
#define timer_setup_on_stack(timer, callback, flags) \
timer_setup(timer, callback, flags)

#define __setup_timer(_timer, _fn, _data, _flags) \
do { \
__init_timer((_timer), (_flags)); \
(_timer)->function = (_fn); \
(_timer)->data = (_data); \
} while (0)

#define setup_timer(timer, fn, data) \
__setup_timer((timer), (fn), (data), 0)
#define destroy_timer_on_stack(timer) do {} while (0)

static inline int timer_pending(const struct timer_list *timer)
{

@@ -36,8 +31,9 @@ static inline int timer_pending(const struct timer_list *timer)
int del_timer(struct timer_list * timer);
int del_timer_sync(struct timer_list *timer);

#define del_singleshot_timer_sync(timer) del_timer_sync(timer)

int mod_timer(struct timer_list *timer, unsigned long expires);
//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);

static inline void add_timer(struct timer_list *timer)
{
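The shim above tracks the kernel's timer API change: instead of setup_timer(timer, fn, data) with an unsigned long callback argument, timer_setup() registers a callback that receives the timer_list pointer, and the owner recovers its containing structure from it. A minimal userspace sketch of the new calling convention; struct my_work and its fields are invented for illustration and are not part of the commit:

#include <stddef.h>
#include <stdio.h>

struct timer_list {
	void (*function)(struct timer_list *timer);
};

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

/* Mirrors the shim in the hunk: remember the callback, ignore flags. */
static void timer_setup(struct timer_list *timer,
			void (*func)(struct timer_list *),
			unsigned int flags)
{
	(void) flags;
	timer->function = func;
}

/* Hypothetical object embedding a timer, instead of passing an opaque data word. */
struct my_work {
	int id;
	struct timer_list timer;
};

static void my_work_timer_fn(struct timer_list *t)
{
	struct my_work *w = container_of(t, struct my_work, timer);

	printf("timer fired for work %d\n", w->id);
}

int main(void)
{
	struct my_work w = { .id = 42 };

	timer_setup(&w.timer, my_work_timer_fn, 0);
	w.timer.function(&w.timer);	/* simulate expiry */
	return 0;
}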
@@ -8,7 +8,7 @@ struct task_struct;
struct workqueue_struct;
struct work_struct;
typedef void (*work_func_t)(struct work_struct *work);
void delayed_work_timer_fn(unsigned long __data);
void delayed_work_timer_fn(struct timer_list *);

#define work_data_bits(work) ((unsigned long *)(&(work)->data))

@@ -44,9 +44,7 @@ struct delayed_work {
#define INIT_DELAYED_WORK(_work, _func) \
do { \
INIT_WORK(&(_work)->work, (_func)); \
__setup_timer(&(_work)->timer, delayed_work_timer_fn, \
(unsigned long)(_work), \
TIMER_IRQSAFE); \
timer_setup(&(_work)->timer, delayed_work_timer_fn, 0); \
} while (0)

static inline struct delayed_work *to_delayed_work(struct work_struct *work)
@ -1393,13 +1393,11 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = wp->first_ptr - 1; i >= 0; --i) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
|
||||
|
||||
if (dev_in_target(ca, target) == in_target)
|
||||
for (i = wp->first_ptr - 1; i >= 0; --i)
|
||||
if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
|
||||
target) == in_target)
|
||||
writepoint_drop_ptr(c, wp, i);
|
||||
}
|
||||
}
|
||||
|
||||
static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
|
||||
{
|
||||
@ -1555,7 +1553,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
|
||||
/* does writepoint have ptrs we don't want to use? */
|
||||
if (target)
|
||||
writepoint_for_each_ptr(wp, ob, i)
|
||||
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
|
||||
if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
|
||||
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
|
||||
wp->first_ptr++;
|
||||
}
|
||||
@ -1590,7 +1588,8 @@ alloc_done:
|
||||
* one in the target we want:
|
||||
*/
|
||||
if (cache_idx >= 0) {
|
||||
if (!dev_in_target(ca, target)) {
|
||||
if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
|
||||
target)) {
|
||||
writepoint_drop_ptr(c, wp, i);
|
||||
} else {
|
||||
writepoint_drop_ptr(c, wp, cache_idx);
|
||||
@ -1621,7 +1620,7 @@ alloc_done:
|
||||
|
||||
if (ca->mi.durability &&
|
||||
ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
|
||||
!dev_idx_in_target(c, ob->ptr.dev, target)) {
|
||||
!bch2_dev_in_target(c, ob->ptr.dev, target)) {
|
||||
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
|
||||
wp->first_ptr++;
|
||||
nr_ptrs_effective -= ca->mi.durability;
|
||||
@ -1890,8 +1889,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
|
||||
/* stop allocator thread: */
|
||||
void bch2_dev_allocator_stop(struct bch_dev *ca)
|
||||
{
|
||||
struct task_struct *p = ca->alloc_thread;
|
||||
struct task_struct *p;
|
||||
|
||||
p = rcu_dereference_protected(ca->alloc_thread, 1);
|
||||
ca->alloc_thread = NULL;
|
||||
|
||||
/*
|
||||
@ -1926,7 +1926,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
|
||||
return PTR_ERR(p);
|
||||
|
||||
get_task_struct(p);
|
||||
ca->alloc_thread = p;
|
||||
rcu_assign_pointer(ca->alloc_thread, p);
|
||||
wake_up_process(p);
|
||||
return 0;
|
||||
}
|
||||
@ -2099,7 +2099,7 @@ again:
|
||||
if (btree_node_dirty(b) && (!b->written || b->level)) {
|
||||
if (btree_node_may_write(b)) {
|
||||
rcu_read_unlock();
|
||||
six_lock_read(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
bch2_btree_node_write(c, b, SIX_LOCK_read);
|
||||
six_unlock_read(&b->lock);
|
||||
goto again;
|
||||
|
@ -103,7 +103,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
|
||||
struct task_struct *p;
|
||||
|
||||
rcu_read_lock();
|
||||
if ((p = READ_ONCE(ca->alloc_thread)))
|
||||
p = rcu_dereference(ca->alloc_thread);
|
||||
if (p)
|
||||
wake_up_process(p);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
@@ -197,7 +197,6 @@
#include <linux/zstd.h>

#include "bcachefs_format.h"
#include "bset.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"

@@ -272,25 +271,37 @@ do { \
#endif

#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc) \
BCH_TIME_STAT(btree_gc) \
BCH_TIME_STAT(btree_split) \
BCH_TIME_STAT(btree_sort) \
BCH_TIME_STAT(btree_read) \
BCH_TIME_STAT(data_write) \
BCH_TIME_STAT(data_read) \
BCH_TIME_STAT(data_promote) \
BCH_TIME_STAT(journal_write) \
BCH_TIME_STAT(journal_delay) \
BCH_TIME_STAT(journal_blocked) \
BCH_TIME_STAT(journal_flush_seq)
x(btree_node_mem_alloc) \
x(btree_gc) \
x(btree_split) \
x(btree_sort) \
x(btree_read) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
x(journal_write) \
x(journal_delay) \
x(journal_blocked) \
x(journal_flush_seq)

enum bch_time_stats {
#define x(name) BCH_TIME_##name,
BCH_TIME_STATS()
#undef x
BCH_TIME_STAT_NR
};

#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
#include "super_types.h"

/*
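The BCH_TIME_STAT() list above becomes an x-macro: one BCH_TIME_STATS() list now generates the enum, and later in this commit the same indices address a single times[BCH_TIME_STAT_NR] array instead of one separately named field per statistic. A small standalone sketch of the pattern, with made up stat names:

#include <stdio.h>

#define TIME_STATS()	\
	x(btree_gc)	\
	x(btree_split)	\
	x(journal_write)

/* Expand once into an enum... */
enum time_stat {
#define x(name) TIME_##name,
	TIME_STATS()
#undef x
	TIME_STAT_NR
};

/* ...and again into a matching table of names. */
static const char * const time_stat_names[] = {
#define x(name) #name,
	TIME_STATS()
#undef x
};

int main(void)
{
	for (int i = 0; i < TIME_STAT_NR; i++)
		printf("%d: %s\n", i, time_stat_names[i]);
	return 0;
}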
@ -372,7 +383,7 @@ struct bch_dev {
|
||||
struct bch_dev_usage usage_cached;
|
||||
|
||||
/* Allocator: */
|
||||
struct task_struct *alloc_thread;
|
||||
struct task_struct __rcu *alloc_thread;
|
||||
|
||||
/*
|
||||
* free: Buckets that are ready to be used
|
||||
@ -447,7 +458,6 @@ enum {
|
||||
/* shutdown: */
|
||||
BCH_FS_EMERGENCY_RO,
|
||||
BCH_FS_WRITE_DISABLE_COMPLETE,
|
||||
BCH_FS_GC_STOPPING,
|
||||
|
||||
/* errors: */
|
||||
BCH_FS_ERROR,
|
||||
@ -570,12 +580,6 @@ struct bch_fs {
|
||||
struct delayed_work pd_controllers_update;
|
||||
unsigned pd_controllers_update_seconds;
|
||||
|
||||
/* REBALANCE */
|
||||
struct task_struct *rebalance_thread;
|
||||
struct bch_pd_controller rebalance_pd;
|
||||
|
||||
atomic64_t rebalance_work_unknown_dev;
|
||||
|
||||
struct bch_devs_mask rw_devs[BCH_DATA_NR];
|
||||
|
||||
u64 capacity; /* sectors */
|
||||
@ -664,6 +668,9 @@ struct bch_fs {
|
||||
|
||||
atomic64_t key_version;
|
||||
|
||||
/* REBALANCE */
|
||||
struct bch_fs_rebalance rebalance;
|
||||
|
||||
/* VFS IO PATH - fs-io.c */
|
||||
struct bio_set writepage_bioset;
|
||||
struct bio_set dio_write_bioset;
|
||||
@ -714,18 +721,13 @@ struct bch_fs {
|
||||
|
||||
unsigned btree_gc_periodic:1;
|
||||
unsigned copy_gc_enabled:1;
|
||||
unsigned rebalance_enabled:1;
|
||||
unsigned rebalance_percent;
|
||||
bool promote_whole_extents;
|
||||
|
||||
#define BCH_DEBUG_PARAM(name, description) bool name;
|
||||
BCH_DEBUG_PARAMS_ALL()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
||||
#define BCH_TIME_STAT(name) \
|
||||
struct time_stats name##_time;
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
struct time_stats times[BCH_TIME_STAT_NR];
|
||||
};
|
||||
|
||||
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
|
||||
|
@@ -3,6 +3,72 @@

/*
 * bcachefs on disk data structures
 *
 * OVERVIEW:
 *
 * There are three main types of on disk data structures in bcachefs (this is
 * reduced from 5 in bcache)
 *
 * - superblock
 * - journal
 * - btree
 *
 * The btree is the primary structure; most metadata exists as keys in the
 * various btrees. There are only a small number of btrees, they're not
 * sharded - we have one btree for extents, another for inodes, et cetera.
 *
 * SUPERBLOCK:
 *
 * The superblock contains the location of the journal, the list of devices in
 * the filesystem, and in general any metadata we need in order to decide
 * whether we can start a filesystem or prior to reading the journal/btree
 * roots.
 *
 * The superblock is extensible, and most of the contents of the superblock are
 * in variable length, type tagged fields; see struct bch_sb_field.
 *
 * Backup superblocks do not reside in a fixed location; also, superblocks do
 * not have a fixed size. To locate backup superblocks we have struct
 * bch_sb_layout; we store a copy of this inside every superblock, and also
 * before the first superblock.
 *
 * JOURNAL:
 *
 * The journal primarily records btree updates in the order they occurred;
 * journal replay consists of just iterating over all the keys in the open
 * journal entries and re-inserting them into the btrees.
 *
 * The journal also contains entry types for the btree roots, and blacklisted
 * journal sequence numbers (see journal_seq_blacklist.c).
 *
 * BTREE:
 *
 * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
 * 128k-256k) and log structured. We use struct btree_node for writing the first
 * entry in a given node (offset 0), and struct btree_node_entry for all
 * subsequent writes.
 *
 * After the header, btree node entries contain a list of keys in sorted order.
 * Values are stored inline with the keys; since values are variable length (and
 * keys effectively are variable length too, due to packing) we can't do random
 * access without building up additional in memory tables in the btree node read
 * path.
 *
 * BTREE KEYS (struct bkey):
 *
 * The various btrees share a common format for the key - so as to avoid
 * switching in fastpath lookup/comparison code - but define their own
 * structures for the key values.
 *
 * The size of a key/value pair is stored as a u8 in units of u64s, so the max
 * size is just under 2k. The common part also contains a type tag for the
 * value, and a format field indicating whether the key is packed or not (and
 * also meant to allow adding new key fields in the future, if desired).
 *
 * bkeys, when stored within a btree node, may also be packed. In that case, the
 * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
 * be generous with field sizes in the common part of the key format (64 bit
 * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
 */

#include <asm/types.h>
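(For scale, the "just under 2k" above follows directly from the u8 length field counting u64s: at most 255 * 8 = 2040 bytes for a key plus its value.)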
@@ -44,12 +110,19 @@ struct bkey_format {
/* Btree keys - all units are in sectors */

struct bpos {
/* Word order matches machine byte order */
#if defined(__LITTLE_ENDIAN)
/*
 * Word order matches machine byte order - btree code treats a bpos as a
 * single large integer, for search/comparison purposes
 *
 * Note that wherever a bpos is embedded in another on disk data
 * structure, it has to be byte swabbed when reading in metadata that
 * wasn't written in native endian order:
 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u32 snapshot;
__u64 offset;
__u64 inode;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u64 inode;
__u64 offset; /* Points to end of extent - sectors */
__u32 snapshot;

@@ -83,10 +156,10 @@ struct bch_val {
};

struct bversion {
#if defined(__LITTLE_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u64 lo;
__u32 hi;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u32 hi;
__u64 lo;
#endif

@@ -110,13 +183,13 @@ struct bkey {
/* Type of the value */
__u8 type;

#if defined(__LITTLE_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];

struct bversion version;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
struct bversion version;
|
||||
*
|
||||
* If an extent is not checksummed or compressed, when the extent is trimmed we
|
||||
* don't have to remember the extent we originally allocated and wrote: we can
|
||||
* merely adjust ptr->offset to point to the start of the start of the data that
|
||||
* is currently live. The size field in struct bkey records the current (live)
|
||||
* size of the extent, and is also used to mean "size of region on disk that we
|
||||
* point to" in this case.
|
||||
* merely adjust ptr->offset to point to the start of the data that is currently
|
||||
* live. The size field in struct bkey records the current (live) size of the
|
||||
* extent, and is also used to mean "size of region on disk that we point to" in
|
||||
* this case.
|
||||
*
|
||||
* Thus an extent that is not checksummed or compressed will consist only of a
|
||||
* list of bch_extent_ptrs, with none of the fields in
|
||||
@ -446,11 +519,11 @@ struct bch_extent_crc128 {
|
||||
#elif defined (__BIG_ENDIAN_BITFIELD)
|
||||
__u64 compression_type:4,
|
||||
csum_type:4,
|
||||
nonce:14,
|
||||
nonce:13,
|
||||
offset:13,
|
||||
_uncompressed_size:13,
|
||||
_compressed_size:13,
|
||||
type:3;
|
||||
type:4;
|
||||
#endif
|
||||
struct bch_csum csum;
|
||||
} __attribute__((packed, aligned(8)));
|
||||
@ -496,7 +569,7 @@ struct bch_extent_reservation {
|
||||
};
|
||||
|
||||
union bch_extent_entry {
|
||||
#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
|
||||
unsigned long type;
|
||||
#elif __BITS_PER_LONG == 32
|
||||
struct {
|
||||
@ -551,10 +624,11 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
|
||||
sizeof(struct bch_extent_ptr)) / sizeof(u64))
|
||||
|
||||
/* Maximum possible size of an entire extent value: */
|
||||
/* There's a hack in the keylist code that needs to be fixed.. */
|
||||
#define BKEY_EXTENT_VAL_U64s_MAX \
|
||||
(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
|
||||
|
||||
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
|
||||
|
||||
/* * Maximum possible size of an entire extent, key + value: */
|
||||
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
|
||||
|
||||
@ -1378,33 +1452,4 @@ struct btree_node_entry {
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
/* Obsolete: */
|
||||
|
||||
struct prio_set {
|
||||
struct bch_csum csum;
|
||||
|
||||
__le64 magic;
|
||||
__le32 nonce[3];
|
||||
__le16 version;
|
||||
__le16 flags;
|
||||
|
||||
__u8 encrypted_start[0];
|
||||
|
||||
__le64 next_bucket;
|
||||
|
||||
struct bucket_disk {
|
||||
__le16 prio[2];
|
||||
__u8 gen;
|
||||
} __attribute__((packed)) data[];
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
|
||||
|
||||
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
|
||||
|
||||
static inline __u64 __pset_magic(struct bch_sb *sb)
|
||||
{
|
||||
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_FORMAT_H */
|
||||
|
@ -5,6 +5,9 @@
|
||||
#include <asm/ioctl.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
/*
|
||||
* Flags common to multiple ioctls:
|
||||
*/
|
||||
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
|
||||
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
|
||||
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
|
||||
@ -14,12 +17,23 @@
|
||||
(BCH_FORCE_IF_DATA_DEGRADED| \
|
||||
BCH_FORCE_IF_METADATA_DEGRADED)
|
||||
|
||||
/*
|
||||
* If cleared, ioctl that refer to a device pass it as a pointer to a pathname
|
||||
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
|
||||
* filesystem:
|
||||
*/
|
||||
#define BCH_BY_INDEX (1 << 4)
|
||||
|
||||
/*
|
||||
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
|
||||
* wide superblock:
|
||||
*/
|
||||
#define BCH_READ_DEV (1 << 5)
|
||||
|
||||
/* global control dev: */
|
||||
|
||||
/* These are currently broken, and probably unnecessary: */
|
||||
#if 0
|
||||
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
|
||||
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
|
||||
|
||||
@ -35,12 +49,18 @@ struct bch_ioctl_incremental {
|
||||
__u64 pad;
|
||||
__u64 dev;
|
||||
};
|
||||
#endif
|
||||
|
||||
/* filesystem ioctls: */
|
||||
|
||||
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
|
||||
|
||||
/* These only make sense when we also have incremental assembly */
|
||||
#if 0
|
||||
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
|
||||
#define BCH_IOCTL_STOP _IO(0xbc, 3)
|
||||
#endif
|
||||
|
||||
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
|
||||
@@ -52,14 +72,70 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)

/*
 * BCH_IOCTL_QUERY_UUID: get filesystem UUID
 *
 * Returns user visible UUID, not internal UUID (which may not ever be changed);
 * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
 * this UUID.
 */
struct bch_ioctl_query_uuid {
uuid_le uuid;
};

#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
#endif

/*
 * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
 *
 * The specified device must not be open or in use. On success, the new device
 * will be an online member of the filesystem just like any other member.
 *
 * The device must first be prepared by userspace by formatting with a bcachefs
 * superblock, which is only used for passing in superblock options/parameters
 * for that device (in struct bch_member). The new device's superblock should
 * not claim to be a member of any existing filesystem - UUIDs on it will be
 * ignored.
 */

/*
 * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
 *
 * Any data present on @dev will be permanently deleted, and @dev will be
 * removed from its slot in the filesystem's list of member devices. The device
 * may be either online or offline.
 *
 * Will fail if removing @dev would leave us with insufficient read write devices
 * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
 * set.
 */

/*
 * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
 * but is not open (e.g. because we started in degraded mode), bring it online
 *
 * all existing data on @dev will be available once the device is online,
 * exactly as if @dev was present when the filesystem was first mounted
 */

/*
 * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
 * block device, without removing it from the filesystem (so it can be brought
 * back online later)
 *
 * Data present on @dev will be unavailable while @dev is offline (unless
 * replicated), but will still be intact and untouched if @dev is brought back
 * online
 *
 * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
 * leave us with insufficient read write devices or degraded/unavailable data,
 * unless the appropriate BCH_FORCE_IF_* flags are set.
 */

struct bch_ioctl_disk {
__u32 flags;
@ -67,6 +143,16 @@ struct bch_ioctl_disk {
|
||||
__u64 dev;
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
|
||||
*
|
||||
* @new_state - one of the bch_member_state states (rw, ro, failed,
|
||||
* spare)
|
||||
*
|
||||
* Will refuse to change member state if we would then have insufficient devices
|
||||
* to write to, or if it would result in degraded data (when @new_state is
|
||||
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
|
||||
*/
|
||||
struct bch_ioctl_disk_set_state {
|
||||
__u32 flags;
|
||||
__u8 new_state;
|
||||
@@ -81,6 +167,15 @@ enum bch_data_ops {
BCH_DATA_OP_NR = 3,
};

/*
 * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
 * scrub, rereplicate, migrate).
 *
 * This ioctl kicks off a job in the background, and returns a file descriptor.
 * Reading from the file descriptor returns a struct bch_ioctl_data_event,
 * indicating current progress, and closing the file descriptor will stop the
 * job. The file descriptor is O_CLOEXEC.
 */
struct bch_ioctl_data {
__u32 op;
__u32 flags;
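For context on the comment above, a rough sketch of how userspace could drive such a data job, assuming fs_fd is a descriptor the filesystem accepts ioctls on (how that fd is obtained, and the precise read semantics, are outside this hunk; the header path may also differ):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "bcachefs_ioctl.h"	/* assumed to provide the bch_ioctl_* definitions shown in this commit */

static void run_data_job(int fs_fd, struct bch_ioctl_data job)
{
	/* The ioctl starts a background job and returns a file descriptor for it. */
	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &job);
	struct bch_ioctl_data_event e;

	if (job_fd < 0)
		return;

	/* Reading the job fd reports progress as a struct bch_ioctl_data_event. */
	if (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
	    e.type == BCH_DATA_EVENT_PROGRESS)
		printf("%llu/%llu sectors done\n",
		       (unsigned long long) e.p.sectors_done,
		       (unsigned long long) e.p.sectors_total);

	/* Closing the fd stops the background job. */
	close(job_fd);
}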
@ -93,9 +188,18 @@ struct bch_ioctl_data {
|
||||
__u32 dev;
|
||||
__u32 pad;
|
||||
} migrate;
|
||||
struct {
|
||||
__u64 pad[8];
|
||||
};
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
enum bch_data_event {
|
||||
BCH_DATA_EVENT_PROGRESS = 0,
|
||||
/* XXX: add an event for reporting errors */
|
||||
BCH_DATA_EVENT_NR = 1,
|
||||
};
|
||||
|
||||
struct bch_ioctl_data_progress {
|
||||
__u8 data_type;
|
||||
__u8 btree_id;
|
||||
@ -106,6 +210,15 @@ struct bch_ioctl_data_progress {
|
||||
__u64 sectors_total;
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
struct bch_ioctl_data_event {
|
||||
__u8 type;
|
||||
__u8 pad[7];
|
||||
union {
|
||||
struct bch_ioctl_data_progress p;
|
||||
__u64 pad2[15];
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
struct bch_ioctl_dev_usage {
|
||||
__u8 state;
|
||||
__u8 alive;
|
||||
@ -127,6 +240,19 @@ struct bch_ioctl_fs_usage {
|
||||
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_USAGE: query filesystem disk space usage
|
||||
*
|
||||
* Returns disk space usage broken out by data type, number of replicas, and
|
||||
* by component device
|
||||
*
|
||||
* @nr_devices - number of devices userspace allocated space for in @devs
|
||||
*
|
||||
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
|
||||
* will indicate if a device was present in that slot
|
||||
*
|
||||
* Returns -ERANGE if @nr_devices was too small
|
||||
*/
|
||||
struct bch_ioctl_usage {
|
||||
__u16 nr_devices;
|
||||
__u16 pad[3];
|
||||
@ -135,6 +261,20 @@ struct bch_ioctl_usage {
|
||||
struct bch_ioctl_dev_usage devs[0];
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_READ_SUPER: read filesystem superblock
|
||||
*
|
||||
* Equivalent to reading the superblock directly from the block device, except
|
||||
* avoids racing with the kernel writing the superblock or having to figure out
|
||||
* which block device to read
|
||||
*
|
||||
* @sb - buffer to read into
|
||||
* @size - size of userspace allocated buffer
|
||||
* @dev - device to read superblock for, if BCH_READ_DEV flag is
|
||||
* specified
|
||||
*
|
||||
* Returns -ERANGE if buffer provided is too small
|
||||
*/
|
||||
struct bch_ioctl_read_super {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
@ -143,10 +283,22 @@ struct bch_ioctl_read_super {
|
||||
__u64 sb;
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
|
||||
* determine if disk is a (online) member - if so, returns device's index
|
||||
*
|
||||
* Returns -ENOENT if not found
|
||||
*/
|
||||
struct bch_ioctl_disk_get_idx {
|
||||
__u64 dev;
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
|
||||
*
|
||||
* @dev - member to resize
|
||||
* @nbuckets - new number of buckets
|
||||
*/
|
||||
struct bch_ioctl_disk_resize {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
|
@ -13,8 +13,6 @@
|
||||
|
||||
void bch2_to_binary(char *, const u64 *, unsigned);
|
||||
|
||||
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
|
||||
|
||||
/* bkey with split value, const */
|
||||
struct bkey_s_c {
|
||||
const struct bkey *k;
|
||||
@ -590,25 +588,31 @@ BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
|
||||
|
||||
/* byte order helpers */
|
||||
|
||||
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
|
||||
static inline unsigned high_word_offset(const struct bkey_format *f)
|
||||
{
|
||||
return f->key_u64s - 1;
|
||||
}
|
||||
|
||||
#define high_bit_offset 0
|
||||
#define nth_word(p, n) ((p) - (n))
|
||||
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
|
||||
static inline unsigned high_word_offset(const struct bkey_format *f)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define high_bit_offset KEY_PACKED_BITS_START
|
||||
#define nth_word(p, n) ((p) + (n))
|
||||
|
||||
#else
|
||||
#error edit for your odd byteorder.
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN
|
||||
|
||||
#define high_bit_offset 0
|
||||
#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
|
||||
#define nth_word(p, n) ((p) - (n))
|
||||
|
||||
#else
|
||||
|
||||
#define high_bit_offset KEY_PACKED_BITS_START
|
||||
#define __high_word(u64s, k) ((k)->_data)
|
||||
#define nth_word(p, n) ((p) + (n))
|
||||
|
||||
#endif
|
||||
|
||||
#define high_word(format, k) __high_word((format)->key_u64s, k)
|
||||
#define high_word(f, k) ((k)->_data + high_word_offset(f))
|
||||
#define next_word(p) nth_word(p, 1)
|
||||
#define prev_word(p) nth_word(p, -1)
|
||||
|
||||
|
@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_cache.h"
|
||||
#include "bset.h"
|
||||
#include "eytzinger.h"
|
||||
#include "util.h"
|
||||
@ -438,6 +439,10 @@ void bch2_btree_keys_free(struct btree *b)
|
||||
b->aux_data = NULL;
|
||||
}
|
||||
|
||||
#ifndef PAGE_KERNEL_EXEC
|
||||
# define PAGE_KERNEL_EXEC PAGE_KERNEL
|
||||
#endif
|
||||
|
||||
int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
|
||||
{
|
||||
b->page_order = page_order;
|
||||
@ -672,7 +677,7 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
|
||||
* (and then the bits we want are at the high end, so we shift them
|
||||
* back down):
|
||||
*/
|
||||
#ifdef __LITTLE_ENDIAN
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
v >>= f->exponent & 7;
|
||||
#else
|
||||
v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
|
||||
@ -761,7 +766,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
|
||||
* Then we calculate the actual shift value, from the start of the key
|
||||
* (k->_data), to get the key bits starting at exponent:
|
||||
*/
|
||||
#ifdef __LITTLE_ENDIAN
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
|
||||
|
||||
EBUG_ON(shift + bits > b->format.key_u64s * 64);
|
||||
@ -964,10 +969,14 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
|
||||
set_btree_bset(b, t, i);
|
||||
}
|
||||
|
||||
void bch2_bset_init_next(struct btree *b, struct bset *i)
|
||||
void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
|
||||
struct btree_node_entry *bne)
|
||||
{
|
||||
struct bset *i = &bne->keys;
|
||||
struct bset_tree *t;
|
||||
|
||||
BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
|
||||
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
|
||||
BUG_ON(b->nsets >= MAX_BSETS);
|
||||
|
||||
memset(i, 0, sizeof(*i));
|
||||
|
@ -157,9 +157,6 @@ static inline bool btree_keys_expensive_checks(const struct btree *b)
|
||||
#endif
|
||||
}
|
||||
|
||||
struct btree_node_iter;
|
||||
struct btree_node_iter_set;
|
||||
|
||||
enum bset_aux_tree_type {
|
||||
BSET_NO_AUX_TREE,
|
||||
BSET_RO_AUX_TREE,
|
||||
@ -342,7 +339,8 @@ int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
|
||||
void bch2_btree_keys_init(struct btree *, bool *);
|
||||
|
||||
void bch2_bset_init_first(struct btree *, struct bset *);
|
||||
void bch2_bset_init_next(struct btree *, struct bset *);
|
||||
void bch2_bset_init_next(struct bch_fs *, struct btree *,
|
||||
struct btree_node_entry *);
|
||||
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
|
||||
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
|
||||
struct bkey_packed *);
|
||||
@ -420,14 +418,6 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
|
||||
|
||||
/* Btree key iteration */
|
||||
|
||||
struct btree_node_iter {
|
||||
u8 is_extents;
|
||||
|
||||
struct btree_node_iter_set {
|
||||
u16 k, end;
|
||||
} data[MAX_BSETS];
|
||||
};
|
||||
|
||||
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
|
||||
bool is_extents)
|
||||
{
|
||||
|
@ -554,7 +554,8 @@ out:
|
||||
b->uncompacted_whiteout_u64s = 0;
|
||||
bch2_btree_keys_init(b, &c->expensive_debug_checks);
|
||||
|
||||
bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
||||
start_time);
|
||||
|
||||
return b;
|
||||
err:
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <trace/events/bcachefs.h>
|
||||
|
||||
struct range_checks {
|
||||
@ -264,10 +265,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
|
||||
|
||||
gc_pos_set(c, gc_pos_btree_node(b));
|
||||
|
||||
if (max_stale > 32)
|
||||
if (max_stale > 64)
|
||||
bch2_btree_node_rewrite(c, &iter,
|
||||
b->data->keys.seq,
|
||||
BTREE_INSERT_USE_RESERVE|
|
||||
BTREE_INSERT_NOWAIT|
|
||||
BTREE_INSERT_GC_LOCK_HELD);
|
||||
else if (!btree_gc_rewrite_disabled(c) &&
|
||||
(btree_gc_always_rewrite(c) || max_stale > 16))
|
||||
@ -557,7 +559,7 @@ void bch2_gc(struct bch_fs *c)
|
||||
out:
|
||||
up_write(&c->gc_lock);
|
||||
trace_gc_end(c);
|
||||
bch2_time_stats_update(&c->btree_gc_time, start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
|
||||
|
||||
/*
|
||||
* Wake up allocator in case it was waiting for buckets
|
||||
@ -813,6 +815,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct btree *b;
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
unsigned i;
|
||||
|
||||
/* Sliding window of adjacent btree nodes */
|
||||
@ -859,7 +862,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
|
||||
|
||||
lock_seq[0] = merge[0]->lock.state.seq;
|
||||
|
||||
if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
|
||||
if (kthread && kthread_should_stop()) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return -ESHUTDOWN;
|
||||
}
|
||||
@ -958,13 +961,15 @@ static int bch2_gc_thread(void *arg)
|
||||
|
||||
void bch2_gc_thread_stop(struct bch_fs *c)
|
||||
{
|
||||
set_bit(BCH_FS_GC_STOPPING, &c->flags);
|
||||
|
||||
if (c->gc_thread)
|
||||
kthread_stop(c->gc_thread);
|
||||
struct task_struct *p;
|
||||
|
||||
p = c->gc_thread;
|
||||
c->gc_thread = NULL;
|
||||
clear_bit(BCH_FS_GC_STOPPING, &c->flags);
|
||||
|
||||
if (p) {
|
||||
kthread_stop(p);
|
||||
put_task_struct(p);
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_gc_thread_start(struct bch_fs *c)
|
||||
@ -973,12 +978,13 @@ int bch2_gc_thread_start(struct bch_fs *c)
|
||||
|
||||
BUG_ON(c->gc_thread);
|
||||
|
||||
p = kthread_create(bch2_gc_thread, c, "bcache_gc");
|
||||
p = kthread_create(bch2_gc_thread, c, "bch_gc");
|
||||
if (IS_ERR(p))
|
||||
return PTR_ERR(p);
|
||||
|
||||
get_task_struct(p);
|
||||
c->gc_thread = p;
|
||||
wake_up_process(c->gc_thread);
|
||||
wake_up_process(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -627,7 +627,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
|
||||
BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
|
||||
|
||||
if (sorting_entire_node)
|
||||
bch2_time_stats_update(&c->btree_sort_time, start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
|
||||
start_time);
|
||||
|
||||
/* Make sure we preserve bset journal_seq: */
|
||||
for (t = b->set + start_idx; t < b->set + end_idx; t++)
|
||||
@ -801,7 +802,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
|
||||
&dst->format,
|
||||
true);
|
||||
|
||||
bch2_time_stats_update(&c->btree_sort_time, start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
|
||||
|
||||
set_btree_bset_end(dst, dst->set);
|
||||
|
||||
@ -877,7 +878,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
|
||||
|
||||
bne = want_new_bset(c, b);
|
||||
if (bne)
|
||||
bch2_bset_init_next(b, &bne->keys);
|
||||
bch2_bset_init_next(c, b, bne);
|
||||
|
||||
bch2_btree_build_aux_trees(b);
|
||||
|
||||
@ -1382,7 +1383,7 @@ start:
|
||||
}
|
||||
}
|
||||
|
||||
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
|
||||
bio_put(&rb->bio);
|
||||
clear_btree_node_read_in_flight(b);
|
||||
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
|
||||
@ -1742,6 +1743,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
|
||||
BUG_ON((b->will_make_reachable != 0) != !b->written);
|
||||
|
||||
BUG_ON(b->written >= c->opts.btree_node_size);
|
||||
BUG_ON(b->written & (c->opts.block_size - 1));
|
||||
BUG_ON(bset_written(b, btree_bset_last(b)));
|
||||
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
|
||||
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
|
||||
@ -1972,7 +1974,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
|
||||
|
||||
bne = want_new_bset(c, b);
|
||||
if (bne)
|
||||
bch2_bset_init_next(b, &bne->keys);
|
||||
bch2_bset_init_next(c, b, bne);
|
||||
|
||||
bch2_btree_build_aux_trees(b);
|
||||
|
||||
|
@ -133,7 +133,7 @@ do { \
|
||||
\
|
||||
six_unlock_read(&(_b)->lock); \
|
||||
btree_node_wait_on_io(_b); \
|
||||
six_lock_read(&(_b)->lock); \
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
@ -42,25 +42,17 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
|
||||
six_unlock_write(&b->lock);
|
||||
}
|
||||
|
||||
void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
||||
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
||||
{
|
||||
struct bch_fs *c = iter->c;
|
||||
struct btree_iter *linked;
|
||||
unsigned readers = 0;
|
||||
|
||||
EBUG_ON(iter->l[b->level].b != b);
|
||||
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
|
||||
|
||||
if (six_trylock_write(&b->lock))
|
||||
return;
|
||||
|
||||
for_each_linked_btree_iter(iter, linked)
|
||||
if (linked->l[b->level].b == b &&
|
||||
btree_node_read_locked(linked, b->level))
|
||||
readers++;
|
||||
|
||||
if (likely(!readers)) {
|
||||
six_lock_write(&b->lock);
|
||||
} else {
|
||||
/*
|
||||
* Must drop our read locks before calling six_lock_write() -
|
||||
* six_unlock() won't do wakeups until the reader count
|
||||
@ -69,11 +61,10 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
||||
*/
|
||||
atomic64_sub(__SIX_VAL(read_lock, readers),
|
||||
&b->lock.state.counter);
|
||||
six_lock_write(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_write);
|
||||
atomic64_add(__SIX_VAL(read_lock, readers),
|
||||
&b->lock.state.counter);
|
||||
}
|
||||
}
|
||||
|
||||
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
|
||||
{
|
||||
@ -135,6 +126,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
|
||||
struct btree_iter *iter,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
struct bch_fs *c = iter->c;
|
||||
struct btree_iter *linked;
|
||||
|
||||
/* Can't have children locked before ancestors: */
|
||||
@ -206,7 +198,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
|
||||
}
|
||||
}
|
||||
|
||||
six_lock_type(&b->lock, type);
|
||||
__btree_node_lock_type(c, b, type);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -4,72 +4,6 @@
|
||||
#include <linux/dynamic_fault.h>
|
||||
|
||||
#include "btree_types.h"
|
||||
#include "bset.h"
|
||||
|
||||
#define BTREE_ITER_SLOTS (1 << 0)
|
||||
#define BTREE_ITER_INTENT (1 << 1)
|
||||
#define BTREE_ITER_PREFETCH (1 << 2)
|
||||
/*
|
||||
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
|
||||
* @pos or the first key strictly greater than @pos
|
||||
*/
|
||||
#define BTREE_ITER_IS_EXTENTS (1 << 3)
|
||||
/*
|
||||
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
|
||||
*/
|
||||
#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
|
||||
#define BTREE_ITER_ERROR (1 << 5)
|
||||
|
||||
enum btree_iter_uptodate {
|
||||
BTREE_ITER_UPTODATE = 0,
|
||||
BTREE_ITER_NEED_PEEK = 1,
|
||||
BTREE_ITER_NEED_RELOCK = 2,
|
||||
BTREE_ITER_NEED_TRAVERSE = 3,
|
||||
BTREE_ITER_END = 4,
|
||||
};
|
||||
|
||||
/*
|
||||
* @pos - iterator's current position
|
||||
* @level - current btree depth
|
||||
* @locks_want - btree level below which we start taking intent locks
|
||||
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
|
||||
* @nodes_intent_locked - bitmask indicating which locks are intent locks
|
||||
*/
|
||||
struct btree_iter {
|
||||
struct bch_fs *c;
|
||||
struct bpos pos;
|
||||
|
||||
u8 flags;
|
||||
unsigned uptodate:4;
|
||||
enum btree_id btree_id:4;
|
||||
unsigned level:4,
|
||||
locks_want:4,
|
||||
nodes_locked:4,
|
||||
nodes_intent_locked:4;
|
||||
|
||||
struct btree_iter_level {
|
||||
struct btree *b;
|
||||
struct btree_node_iter iter;
|
||||
} l[BTREE_MAX_DEPTH];
|
||||
|
||||
u32 lock_seq[BTREE_MAX_DEPTH];
|
||||
|
||||
/*
|
||||
* Current unpacked key - so that bch2_btree_iter_next()/
|
||||
* bch2_btree_iter_next_slot() can correctly advance pos.
|
||||
*/
|
||||
struct bkey k;
|
||||
|
||||
/*
|
||||
* Circular linked list of linked iterators: linked iterators share
|
||||
* locks (e.g. two linked iterators may have the same node intent
|
||||
* locked, or read and write locked, at the same time), and insertions
|
||||
* through one iterator won't invalidate the other linked iterators.
|
||||
*/
|
||||
|
||||
/* Must come last: */
|
||||
struct btree_iter *next;
|
||||
};
|
||||
|
||||
static inline void btree_iter_set_dirty(struct btree_iter *iter,
|
||||
enum btree_iter_uptodate u)
|
||||
|
@ -98,6 +98,39 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
|
||||
mark_btree_node_unlocked(iter, level);
|
||||
}
|
||||
|
||||
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case SIX_LOCK_read:
|
||||
return BCH_TIME_btree_lock_contended_read;
|
||||
case SIX_LOCK_intent:
|
||||
return BCH_TIME_btree_lock_contended_intent;
|
||||
case SIX_LOCK_write:
|
||||
return BCH_TIME_btree_lock_contended_write;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* wrapper around six locks that just traces lock contended time
|
||||
*/
|
||||
static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
u64 start_time = local_clock();
|
||||
|
||||
six_lock_type(&b->lock, type);
|
||||
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
|
||||
}
|
||||
|
||||
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
if (!six_trylock_type(&b->lock, type))
|
||||
__btree_node_lock_type(c, b, type);
|
||||
}
|
||||
|
||||
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
|
||||
struct btree_iter *, enum six_lock_type);
|
||||
|
||||
@ -125,7 +158,17 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
|
||||
bool bch2_btree_iter_relock(struct btree_iter *);
|
||||
|
||||
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
|
||||
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
|
||||
|
||||
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
|
||||
|
||||
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
||||
{
|
||||
EBUG_ON(iter->l[b->level].b != b);
|
||||
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
|
||||
|
||||
if (!six_trylock_write(&b->lock))
|
||||
__bch2_btree_node_lock_write(b, iter);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_LOCKING_H */
|
||||
|
||||
|
@ -176,6 +176,79 @@ struct btree_cache {
|
||||
struct closure_waitlist alloc_wait;
|
||||
};
|
||||
|
||||
struct btree_node_iter {
|
||||
u8 is_extents;
|
||||
|
||||
struct btree_node_iter_set {
|
||||
u16 k, end;
|
||||
} data[MAX_BSETS];
|
||||
};
|
||||
|
||||
#define BTREE_ITER_SLOTS (1 << 0)
|
||||
#define BTREE_ITER_INTENT (1 << 1)
|
||||
#define BTREE_ITER_PREFETCH (1 << 2)
|
||||
/*
|
||||
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
|
||||
* @pos or the first key strictly greater than @pos
|
||||
*/
|
||||
#define BTREE_ITER_IS_EXTENTS (1 << 3)
|
||||
/*
|
||||
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
|
||||
*/
|
||||
#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
|
||||
#define BTREE_ITER_ERROR (1 << 5)
|
||||
|
||||
enum btree_iter_uptodate {
|
||||
BTREE_ITER_UPTODATE = 0,
|
||||
BTREE_ITER_NEED_PEEK = 1,
|
||||
BTREE_ITER_NEED_RELOCK = 2,
|
||||
BTREE_ITER_NEED_TRAVERSE = 3,
|
||||
BTREE_ITER_END = 4,
|
||||
};
|
||||
|
||||
/*
|
||||
* @pos - iterator's current position
|
||||
* @level - current btree depth
|
||||
* @locks_want - btree level below which we start taking intent locks
|
||||
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
|
||||
* @nodes_intent_locked - bitmask indicating which locks are intent locks
|
||||
*/
|
||||
struct btree_iter {
|
||||
struct bch_fs *c;
|
||||
struct bpos pos;
|
||||
|
||||
u8 flags;
|
||||
unsigned uptodate:4;
|
||||
enum btree_id btree_id:4;
|
||||
unsigned level:4,
|
||||
locks_want:4,
|
||||
nodes_locked:4,
|
||||
nodes_intent_locked:4;
|
||||
|
||||
struct btree_iter_level {
|
||||
struct btree *b;
|
||||
struct btree_node_iter iter;
|
||||
} l[BTREE_MAX_DEPTH];
|
||||
|
||||
u32 lock_seq[BTREE_MAX_DEPTH];
|
||||
|
||||
/*
|
||||
* Current unpacked key - so that bch2_btree_iter_next()/
|
||||
* bch2_btree_iter_next_slot() can correctly advance pos.
|
||||
*/
|
||||
struct bkey k;
|
||||
|
||||
/*
|
||||
* Circular linked list of linked iterators: linked iterators share
|
||||
* locks (e.g. two linked iterators may have the same node intent
|
||||
* locked, or read and write locked, at the same time), and insertions
|
||||
* through one iterator won't invalidate the other linked iterators.
|
||||
*/
|
||||
|
||||
/* Must come last: */
|
||||
struct btree_iter *next;
|
||||
};
|
||||
|
||||
#define BTREE_FLAG(flag) \
|
||||
static inline bool btree_node_ ## flag(struct btree *b) \
|
||||
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
|
@ -237,7 +237,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
|
||||
|
||||
clear_btree_node_noevict(b);
|
||||
|
||||
six_lock_write(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_write);
|
||||
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
|
||||
@ -622,7 +622,7 @@ static void btree_update_nodes_reachable(struct closure *cl)
|
||||
* b->will_make_reachable prevented it from being written, so
|
||||
* write it now if it needs to be written:
|
||||
*/
|
||||
six_lock_read(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
|
||||
six_unlock_read(&b->lock);
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
@ -647,8 +647,10 @@ static void btree_update_wait_on_journal(struct closure *cl)
|
||||
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
if (!ret)
|
||||
if (!ret) {
|
||||
continue_at(cl, btree_update_wait_on_journal, system_wq);
|
||||
return;
|
||||
}
|
||||
|
||||
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
|
||||
err:
|
||||
@ -679,7 +681,7 @@ retry:
|
||||
|
||||
if (!six_trylock_read(&b->lock)) {
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
six_lock_read(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
six_unlock_read(&b->lock);
|
||||
goto retry;
|
||||
}
|
||||
@ -720,7 +722,7 @@ retry:
|
||||
|
||||
if (!six_trylock_read(&b->lock)) {
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
six_lock_read(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
six_unlock_read(&b->lock);
|
||||
goto retry;
|
||||
}
|
||||
@ -1456,7 +1458,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
|
||||
bch2_btree_iter_node_replace(iter, n2);
|
||||
bch2_btree_iter_node_replace(iter, n1);
|
||||
|
||||
bch2_time_stats_update(&c->btree_split_time, start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -1795,8 +1797,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
|
||||
bch2_btree_node_write(c, n, SIX_LOCK_intent);
|
||||
|
||||
if (parent) {
|
||||
bch2_btree_insert_node(as, parent, iter,
|
||||
&keylist_single(&n->key));
|
||||
bch2_keylist_add(&as->parent_keys, &n->key);
|
||||
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
|
||||
} else {
|
||||
bch2_btree_set_root(as, n, iter);
|
||||
}
|
||||
|
@ -226,11 +226,30 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
|
||||
return (void *) i > write_block(b);
|
||||
}
|
||||
|
||||
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
|
||||
struct bset *i)
|
||||
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b,
|
||||
void *end)
|
||||
{
|
||||
return round_up(bset_byte_offset(b, vstruct_end(i)),
|
||||
block_bytes(c)) >> 9;
|
||||
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
|
||||
b->whiteout_u64s +
|
||||
b->uncompacted_whiteout_u64s;
|
||||
ssize_t total = c->opts.btree_node_size << 6;
|
||||
|
||||
return total - used;
|
||||
}
|
||||
|
||||
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
|
||||
btree_bkey_last(b, bset_tree_last(b)));
|
||||
|
||||
BUG_ON(remaining < 0);
|
||||
|
||||
if (bset_written(b, btree_bset_last(b)))
|
||||
return 0;
|
||||
|
||||
return remaining;
|
||||
}
|
||||
|
||||
static inline unsigned btree_write_set_buffer(struct btree *b)
|
||||
@ -246,20 +265,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bset *i = btree_bset_last(b);
|
||||
unsigned offset = max_t(unsigned, b->written << 9,
|
||||
bset_byte_offset(b, vstruct_end(i)));
|
||||
ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t)
|
||||
(offset + sizeof(struct btree_node_entry) +
|
||||
b->whiteout_u64s * sizeof(u64) +
|
||||
b->uncompacted_whiteout_u64s * sizeof(u64));
|
||||
struct btree_node_entry *bne = max(write_block(b),
|
||||
(void *) btree_bkey_last(b, bset_tree_last(b)));
|
||||
ssize_t remaining_space =
|
||||
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
|
||||
|
||||
EBUG_ON(offset > btree_bytes(c));
|
||||
|
||||
if ((unlikely(bset_written(b, i)) &&
|
||||
remaining_space > block_bytes(c)) ||
|
||||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
|
||||
remaining_space > btree_write_set_buffer(b)))
|
||||
return (void *) b->data + offset;
|
||||
if (unlikely(bset_written(b, i))) {
|
||||
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
|
||||
return bne;
|
||||
} else {
|
||||
if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
|
||||
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
|
||||
return bne;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@ -285,23 +303,6 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
|
||||
}
|
||||
}
|
||||
|
||||
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bset *i = btree_bset_last(b);
|
||||
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
|
||||
b->whiteout_u64s +
|
||||
b->uncompacted_whiteout_u64s;
|
||||
unsigned total = c->opts.btree_node_size << 6;
|
||||
|
||||
EBUG_ON(used > total);
|
||||
|
||||
if (bset_written(b, i))
|
||||
return 0;
|
||||
|
||||
return total - used;
|
||||
}
|
||||
|
||||
/*
|
||||
* write lock must be held on @b (else the dirty bset that we were going to
|
||||
* insert into could be written out from under us)
|
||||
|
@ -108,7 +108,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
|
||||
struct btree_write *w = container_of(pin, struct btree_write, journal);
|
||||
struct btree *b = container_of(w, struct btree, writes[i]);
|
||||
|
||||
six_lock_read(&b->lock);
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
bch2_btree_node_write_cond(c, b,
|
||||
(btree_current_write(b) == w &&
|
||||
w->journal.pin_list == journal_seq_pin(j, seq)));
|
||||
|
@ -555,9 +555,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
return;
|
||||
}
|
||||
|
||||
v = READ_ONCE(g->_mark.counter);
|
||||
v = atomic64_read(&g->_mark.v);
|
||||
do {
|
||||
new.counter = old.counter = v;
|
||||
new.v.counter = old.v.counter = v;
|
||||
saturated = 0;
|
||||
|
||||
/*
|
||||
@ -600,9 +600,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
|
||||
g->_mark = new;
|
||||
break;
|
||||
}
|
||||
} while ((v = cmpxchg(&g->_mark.counter,
|
||||
old.counter,
|
||||
new.counter)) != old.counter);
|
||||
} while ((v = atomic64_cmpxchg(&g->_mark.v,
|
||||
old.v.counter,
|
||||
new.v.counter)) != old.v.counter);
|
||||
|
||||
bch2_dev_usage_update(c, ca, old, new);
|
||||
|
||||
@ -957,7 +957,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
|
||||
kvpfree(ca->buckets_dirty,
|
||||
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
|
||||
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
|
||||
kvpfree(ca->buckets, sizeof(struct bucket_array) +
|
||||
kvpfree(rcu_dereference_protected(ca->buckets, 1),
|
||||
sizeof(struct bucket_array) +
|
||||
ca->mi.nbuckets * sizeof(struct bucket));
|
||||
|
||||
free_percpu(ca->usage_percpu);
|
||||
|
@@ -16,15 +16,15 @@

#define bucket_cmpxchg(g, new, expr)				\
({								\
	u64 _v = READ_ONCE((g)->_mark.counter);			\
	u64 _v = atomic64_read(&(g)->_mark.v);			\
	struct bucket_mark _old;				\
								\
	do {							\
		(new).counter = _old.counter = _v;		\
		(new).v.counter = _old.v.counter = _v;		\
		expr;						\
	} while ((_v = cmpxchg(&(g)->_mark.counter,		\
			       _old.counter,			\
			       (new).counter)) != _old.counter);\
	} while ((_v = atomic64_cmpxchg(&(g)->_mark.v,		\
			       _old.v.counter,			\
			       (new).v.counter)) != _old.v.counter);\
	_old;							\
})

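This macro (together with the bch2_mark_pointer hunks above) is where the commit's 32-bit fix is most visible: the bare cmpxchg on a packed u64 is replaced by atomic64_read()/atomic64_cmpxchg(), and the atomic64 interface is what guarantees a genuine 64-bit atomic update on 32-bit machines. Below is a user-space sketch of the same retry loop using C11 atomics; the field layout and the mark_add_dirty() helper are illustrative assumptions, not the kernel's definitions.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* packed per-bucket state, always read and updated as one 64-bit word */
struct bucket_mark {
	union {
		uint64_t	v;
		struct {
			uint16_t gen;
			uint16_t dirty_sectors;
			uint16_t cached_sectors;
			uint16_t flags;
		};
	};
};

static _Atomic uint64_t bucket;		/* shared copy of the packed mark */

static struct bucket_mark mark_add_dirty(uint16_t sectors)
{
	struct bucket_mark old, new;
	uint64_t v = atomic_load(&bucket);

	do {
		old.v = new.v = v;		/* unpack the current snapshot */
		new.dirty_sectors += sectors;	/* edit the local copy */
	} while (!atomic_compare_exchange_weak(&bucket, &v, new.v));

	return old;	/* pre-update mark, like bucket_cmpxchg's _old */
}

int main(void)
{
	struct bucket_mark before = mark_add_dirty(8);
	struct bucket_mark now = { .v = atomic_load(&bucket) };

	printf("dirty_sectors: %u -> %u\n",
	       (unsigned) before.dirty_sectors, (unsigned) now.dirty_sectors);
	return 0;
}

As in bucket_cmpxchg, the loop keeps the value it started from and hands it back, so a caller can compute usage deltas from the old and new marks.
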
@@ -6,7 +6,7 @@
struct bucket_mark {
	union {
	struct {
		u64		counter;
		atomic64_t	v;
	};

	struct {
@ -54,6 +54,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
|
||||
return ca;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_assemble arg;
|
||||
@ -127,14 +128,17 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
#if 0
|
||||
case BCH_IOCTL_ASSEMBLE:
|
||||
return bch2_ioctl_assemble(arg);
|
||||
case BCH_IOCTL_INCREMENTAL:
|
||||
return bch2_ioctl_incremental(arg);
|
||||
#endif
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
@ -148,6 +152,7 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c,
|
||||
sizeof(c->sb.user_uuid));
|
||||
}
|
||||
|
||||
#if 0
|
||||
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
|
||||
{
|
||||
if (arg.flags || arg.pad)
|
||||
@ -161,6 +166,7 @@ static long bch2_ioctl_stop(struct bch_fs *c)
|
||||
bch2_fs_stop(c);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
@ -294,18 +300,19 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
|
||||
{
|
||||
struct bch_data_ctx *ctx = file->private_data;
|
||||
struct bch_fs *c = ctx->c;
|
||||
struct bch_ioctl_data_progress p = {
|
||||
.data_type = ctx->stats.data_type,
|
||||
.btree_id = ctx->stats.iter.btree_id,
|
||||
.pos = ctx->stats.iter.pos,
|
||||
.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
||||
.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
|
||||
struct bch_ioctl_data_event e = {
|
||||
.type = BCH_DATA_EVENT_PROGRESS,
|
||||
.p.data_type = ctx->stats.data_type,
|
||||
.p.btree_id = ctx->stats.iter.btree_id,
|
||||
.p.pos = ctx->stats.iter.pos,
|
||||
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
||||
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
|
||||
};
|
||||
|
||||
if (len != sizeof(p))
|
||||
if (len < sizeof(e))
|
||||
return -EINVAL;
|
||||
|
||||
return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
|
||||
return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
|
||||
}
|
||||
|
||||
static const struct file_operations bcachefs_data_ops = {
|
||||
@ -419,7 +426,7 @@ static long bch2_ioctl_usage(struct bch_fs *c,
|
||||
|
||||
if (ca->dev_idx >= arg.nr_devices) {
|
||||
percpu_ref_put(&ca->ref);
|
||||
return -ENOSPC;
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
if (percpu_ref_tryget(&ca->io_ref)) {
|
||||
@ -539,10 +546,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
|
||||
return -EPERM;
|
||||
|
||||
switch (cmd) {
|
||||
#if 0
|
||||
case BCH_IOCTL_START:
|
||||
BCH_IOCTL(start, struct bch_ioctl_start);
|
||||
case BCH_IOCTL_STOP:
|
||||
return bch2_ioctl_stop(c);
|
||||
#endif
|
||||
case BCH_IOCTL_READ_SUPER:
|
||||
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
|
||||
case BCH_IOCTL_DISK_GET_IDX:
|
||||
|
@@ -421,7 +421,7 @@ static struct bch_csum bch2_checksum_merge(unsigned type,
	BUG_ON(!bch2_checksum_mergeable(type));

	while (b_len) {
		unsigned b = min(b_len, PAGE_SIZE);
		unsigned b = min_t(unsigned, b_len, PAGE_SIZE);

		a.lo = bch2_checksum_update(type, a.lo,
				page_address(ZERO_PAGE(0)), b);
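The min() to min_t() swap here (and the matching max() to max_t() swaps later in fsck.c and journal_seq_blacklist.c) is presumably about operand types: the two sides do not have the same type on every architecture this commit targets (size_t vs. unsigned long, for instance, only coincide on 64-bit), and the kernel's type-checking min()/max() reject mixed types, while min_t()/max_t() cast both sides to one named type first. A simplified user-space illustration of that behaviour; the real kernel macros also avoid double evaluation:

#include <stdio.h>

/* simplified stand-ins for the kernel helpers */
#define min_t(type, a, b) ((type) (a) < (type) (b) ? (type) (a) : (type) (b))
#define max_t(type, a, b) ((type) (a) > (type) (b) ? (type) (a) : (type) (b))

int main(void)
{
	unsigned b_len = 100000;
	unsigned long page_size = 4096;

	/* both operands are forced to the named type before comparing,
	 * so the result type no longer depends on the architecture */
	printf("%u\n", min_t(unsigned, b_len, page_size));
	printf("%zu\n", max_t(size_t, 256UL, (size_t) 512));
	return 0;
}
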
@ -42,7 +42,8 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
|
||||
}
|
||||
|
||||
struct io_clock_wait {
|
||||
struct io_timer timer;
|
||||
struct io_timer io_timer;
|
||||
struct timer_list cpu_timer;
|
||||
struct task_struct *task;
|
||||
int expired;
|
||||
};
|
||||
@ -50,7 +51,16 @@ struct io_clock_wait {
|
||||
static void io_clock_wait_fn(struct io_timer *timer)
|
||||
{
|
||||
struct io_clock_wait *wait = container_of(timer,
|
||||
struct io_clock_wait, timer);
|
||||
struct io_clock_wait, io_timer);
|
||||
|
||||
wait->expired = 1;
|
||||
wake_up_process(wait->task);
|
||||
}
|
||||
|
||||
static void io_clock_cpu_timeout(struct timer_list *timer)
|
||||
{
|
||||
struct io_clock_wait *wait = container_of(timer,
|
||||
struct io_clock_wait, cpu_timer);
|
||||
|
||||
wait->expired = 1;
|
||||
wake_up_process(wait->task);
|
||||
@ -61,35 +71,38 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
|
||||
struct io_clock_wait wait;
|
||||
|
||||
/* XXX: calculate sleep time rigorously */
|
||||
wait.timer.expire = until;
|
||||
wait.timer.fn = io_clock_wait_fn;
|
||||
wait.io_timer.expire = until;
|
||||
wait.io_timer.fn = io_clock_wait_fn;
|
||||
wait.task = current;
|
||||
wait.expired = 0;
|
||||
bch2_io_timer_add(clock, &wait.timer);
|
||||
bch2_io_timer_add(clock, &wait.io_timer);
|
||||
|
||||
schedule();
|
||||
|
||||
bch2_io_timer_del(clock, &wait.timer);
|
||||
bch2_io_timer_del(clock, &wait.io_timer);
|
||||
}
|
||||
|
||||
/*
|
||||
* _only_ to be used from a kthread
|
||||
*/
|
||||
void bch2_kthread_io_clock_wait(struct io_clock *clock,
|
||||
unsigned long until)
|
||||
unsigned long io_until,
|
||||
unsigned long cpu_timeout)
|
||||
{
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
struct io_clock_wait wait;
|
||||
|
||||
/* XXX: calculate sleep time rigorously */
|
||||
wait.timer.expire = until;
|
||||
wait.timer.fn = io_clock_wait_fn;
|
||||
wait.io_timer.expire = io_until;
|
||||
wait.io_timer.fn = io_clock_wait_fn;
|
||||
wait.task = current;
|
||||
wait.expired = 0;
|
||||
bch2_io_timer_add(clock, &wait.timer);
|
||||
bch2_io_timer_add(clock, &wait.io_timer);
|
||||
|
||||
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
|
||||
|
||||
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
|
||||
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
|
||||
|
||||
while (1) {
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
if (kthread_should_stop())
|
||||
if (kthread && kthread_should_stop())
|
||||
break;
|
||||
|
||||
if (wait.expired)
|
||||
@ -100,7 +113,9 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
|
||||
}
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
bch2_io_timer_del(clock, &wait.timer);
|
||||
del_singleshot_timer_sync(&wait.cpu_timer);
|
||||
destroy_timer_on_stack(&wait.cpu_timer);
|
||||
bch2_io_timer_del(clock, &wait.io_timer);
|
||||
}
|
||||
|
||||
static struct io_timer *get_expired_timer(struct io_clock *clock,
|
||||
|
@@ -3,7 +3,8 @@

void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
				unsigned long);
void bch2_increment_clock(struct bch_fs *, unsigned, int);

void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
@ -480,7 +480,7 @@ static const unsigned bch2_compression_opt_to_feature[] = {
|
||||
|
||||
#undef BCH_FEATURE_NONE
|
||||
|
||||
int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
|
||||
static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
@ -529,26 +529,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
|
||||
mempool_exit(&c->compression_bounce[READ]);
|
||||
}
|
||||
|
||||
static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
|
||||
{
|
||||
size_t size = (size_t)pool_data;
|
||||
return kvpmalloc(size, gfp_mask);
|
||||
}
|
||||
|
||||
void mempool_kvpfree(void *element, void *pool_data)
|
||||
{
|
||||
size_t size = (size_t)pool_data;
|
||||
kvpfree(element, size);
|
||||
}
|
||||
|
||||
static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
|
||||
{
|
||||
return !mempool_initialized(pool)
|
||||
? mempool_init(pool, min_nr, mempool_kvpmalloc,
|
||||
mempool_kvpfree, (void *) size)
|
||||
: 0;
|
||||
}
|
||||
|
||||
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
|
||||
{
|
||||
size_t max_extent = c->sb.encoded_extent_max << 9;
|
||||
@ -611,6 +591,9 @@ have_compressed:
|
||||
if (i->decompress_workspace)
|
||||
decompress_workspace_needed = true;
|
||||
|
||||
if (mempool_initialized(&c->compress_workspace[i->type]))
|
||||
continue;
|
||||
|
||||
ret = mempool_init_kvpmalloc_pool(
|
||||
&c->compress_workspace[i->type],
|
||||
1, i->compress_workspace);
|
||||
|
@ -16,7 +16,7 @@ static int group_cmp(const void *_l, const void *_r)
|
||||
strncmp(l->label, r->label, sizeof(l->label));
|
||||
}
|
||||
|
||||
const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
|
||||
static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_disk_groups *groups =
|
||||
@ -162,7 +162,8 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
|
||||
}
|
||||
}
|
||||
|
||||
old_g = c->disk_groups;
|
||||
old_g = rcu_dereference_protected(c->disk_groups,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
rcu_assign_pointer(c->disk_groups, cpu_g);
|
||||
if (old_g)
|
||||
kfree_rcu(old_g, rcu);
|
||||
@ -193,6 +194,36 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
|
||||
}
|
||||
}
|
||||
|
||||
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
|
||||
{
|
||||
struct target t = target_decode(target);
|
||||
|
||||
switch (t.type) {
|
||||
case TARGET_NULL:
|
||||
return false;
|
||||
case TARGET_DEV:
|
||||
return dev == t.dev;
|
||||
case TARGET_GROUP: {
|
||||
struct bch_disk_groups_cpu *g;
|
||||
const struct bch_devs_mask *m;
|
||||
bool ret;
|
||||
|
||||
rcu_read_lock();
|
||||
g = rcu_dereference(c->disk_groups);
|
||||
m = t.group < g->nr && !g->entries[t.group].deleted
|
||||
? &g->entries[t.group].devs
|
||||
: NULL;
|
||||
|
||||
ret = m ? test_bit(dev, m->d) : false;
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
|
||||
unsigned parent,
|
||||
const char *name, unsigned namelen)
|
||||
|
@ -53,34 +53,8 @@ static inline struct target target_decode(unsigned target)
|
||||
return (struct target) { .type = TARGET_NULL };
|
||||
}
|
||||
|
||||
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
|
||||
{
|
||||
struct target t = target_decode(target);
|
||||
|
||||
switch (t.type) {
|
||||
case TARGET_NULL:
|
||||
return false;
|
||||
case TARGET_DEV:
|
||||
return ca->dev_idx == t.dev;
|
||||
case TARGET_GROUP:
|
||||
return ca->mi.group && ca->mi.group - 1 == t.group;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
rcu_read_lock();
|
||||
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
|
||||
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
|
||||
|
||||
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
|
||||
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
|
||||
|
@ -144,7 +144,7 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
|
||||
const struct bch_extent_ptr *ptr;
|
||||
|
||||
extent_for_each_ptr(e, ptr) {
|
||||
struct bch_dev *ca = c->devs[ptr->dev];
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (ca->mi.group &&
|
||||
ca->mi.group - 1 == group)
|
||||
@ -159,13 +159,11 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
|
||||
extent_for_each_ptr(e, ptr) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (dev_in_target(ca, target) &&
|
||||
(!ptr->cached || !ptr_stale(ca, ptr)))
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (bch2_dev_in_target(c, ptr->dev, target) &&
|
||||
(!ptr->cached ||
|
||||
!ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
|
||||
return ptr;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@ -732,7 +730,7 @@ err:
|
||||
bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
|
||||
"gen %i mark %08x",
|
||||
err, buf, PTR_BUCKET_NR(ca, ptr),
|
||||
mark.gen, (unsigned) mark.counter);
|
||||
mark.gen, (unsigned) mark.v.counter);
|
||||
}
|
||||
|
||||
void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
|
||||
@ -2024,7 +2022,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
|
||||
int n = bch2_extent_ptr_durability(c, ptr);
|
||||
|
||||
if (n && n <= extra &&
|
||||
!dev_in_target(c->devs[ptr->dev], target)) {
|
||||
!bch2_dev_in_target(c, ptr->dev, target)) {
|
||||
ptr->cached = true;
|
||||
extra -= n;
|
||||
}
|
||||
|
@ -278,24 +278,38 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
|
||||
.uncompressed_size = k->size,
|
||||
.live_size = k->size,
|
||||
};
|
||||
case BCH_EXTENT_CRC32:
|
||||
return (struct bch_extent_crc_unpacked) {
|
||||
case BCH_EXTENT_CRC32: {
|
||||
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
|
||||
common_fields(crc->crc32),
|
||||
.csum.lo = (__force __le64) crc->crc32.csum,
|
||||
};
|
||||
case BCH_EXTENT_CRC64:
|
||||
return (struct bch_extent_crc_unpacked) {
|
||||
|
||||
*((__le32 *) &ret.csum.lo) = crc->crc32.csum;
|
||||
|
||||
memcpy(&ret.csum.lo, &crc->crc32.csum,
|
||||
sizeof(crc->crc32.csum));
|
||||
|
||||
return ret;
|
||||
}
|
||||
case BCH_EXTENT_CRC64: {
|
||||
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
|
||||
common_fields(crc->crc64),
|
||||
.nonce = crc->crc64.nonce,
|
||||
.csum.lo = (__force __le64) crc->crc64.csum_lo,
|
||||
.csum.hi = (__force __le64) crc->crc64.csum_hi,
|
||||
};
|
||||
case BCH_EXTENT_CRC128:
|
||||
return (struct bch_extent_crc_unpacked) {
|
||||
|
||||
*((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
|
||||
|
||||
return ret;
|
||||
}
|
||||
case BCH_EXTENT_CRC128: {
|
||||
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
|
||||
common_fields(crc->crc128),
|
||||
.nonce = crc->crc128.nonce,
|
||||
.csum = crc->crc128.csum,
|
||||
};
|
||||
|
||||
return ret;
|
||||
}
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
@ -678,7 +678,7 @@ static void bch2_clear_page_bits(struct page *page)
|
||||
if (!PagePrivate(page))
|
||||
return;
|
||||
|
||||
s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
|
||||
s.v = xchg(&page_state(page)->v, 0);
|
||||
ClearPagePrivate(page);
|
||||
|
||||
if (s.dirty_sectors)
|
||||
@ -1020,12 +1020,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
|
||||
|
||||
if (bkey_extent_is_data(k.k)) {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
const union bch_extent_entry *i;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
want_full_extent |= !!crc.csum_type |
|
||||
!!crc.compression_type;
|
||||
extent_for_each_crc(e, crc, i)
|
||||
want_full_extent |= ((crc.csum_type != 0) |
|
||||
(crc.compression_type != 0));
|
||||
}
|
||||
|
||||
readpage_bio_extend(readpages_iter,
|
||||
@ -1850,8 +1850,7 @@ err_wait_io:
|
||||
dio->loop = true;
|
||||
|
||||
if (!dio->sync) {
|
||||
continue_at_noreturn(&dio->cl,
|
||||
bch2_dio_write_loop_async, NULL);
|
||||
continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
|
||||
return -EIOCBQUEUED;
|
||||
}
|
||||
|
||||
|
@ -610,7 +610,8 @@ static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
|
||||
static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
|
||||
{
|
||||
if (nr >= b->size) {
|
||||
size_t new_size = max(max(PAGE_SIZE * 8,
|
||||
size_t new_size = max_t(size_t, max_t(size_t,
|
||||
PAGE_SIZE * 8,
|
||||
b->size * 2),
|
||||
nr + 1);
|
||||
void *n;
|
||||
@ -642,7 +643,7 @@ struct pathbuf {
|
||||
static int path_down(struct pathbuf *p, u64 inum)
|
||||
{
|
||||
if (p->nr == p->size) {
|
||||
size_t new_size = max(256UL, p->size * 2);
|
||||
size_t new_size = max_t(size_t, 256UL, p->size * 2);
|
||||
void *n = krealloc(p->entries,
|
||||
new_size * sizeof(p->entries[0]),
|
||||
GFP_KERNEL);
|
||||
|
@ -21,10 +21,10 @@
|
||||
#include "journal.h"
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "rebalance.h"
|
||||
#include "replicas.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
#include "tier.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/random.h>
|
||||
@ -269,7 +269,7 @@ static void bch2_write_done(struct closure *cl)
|
||||
percpu_ref_put(&c->writes);
|
||||
bch2_keylist_free(&op->insert_keys, op->inline_keys);
|
||||
|
||||
bch2_time_stats_update(&c->data_write_time, op->start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
|
||||
|
||||
closure_return(cl);
|
||||
}
|
||||
@ -842,20 +842,24 @@ again:
|
||||
} while (ret);
|
||||
|
||||
continue_at(cl, bch2_write_index, index_update_wq(op));
|
||||
return;
|
||||
err:
|
||||
op->error = ret;
|
||||
|
||||
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
|
||||
? bch2_write_index
|
||||
: bch2_write_done, index_update_wq(op));
|
||||
return;
|
||||
flush_io:
|
||||
closure_sync(cl);
|
||||
|
||||
if (!bch2_keylist_empty(&op->insert_keys)) {
|
||||
__bch2_write_index(op);
|
||||
|
||||
if (op->error)
|
||||
if (op->error) {
|
||||
continue_at_nobarrier(cl, bch2_write_done, NULL);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
goto again;
|
||||
@ -901,6 +905,7 @@ void bch2_write(struct closure *cl)
|
||||
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
|
||||
bch2_disk_reservation_put(c, &op->res);
|
||||
closure_return(cl);
|
||||
return;
|
||||
}
|
||||
|
||||
bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
|
||||
@ -974,7 +979,8 @@ static void promote_done(struct closure *cl)
|
||||
container_of(cl, struct promote_op, cl);
|
||||
struct bch_fs *c = op->write.op.c;
|
||||
|
||||
bch2_time_stats_update(&c->data_promote_time, op->start_time);
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
|
||||
op->start_time);
|
||||
|
||||
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
|
||||
promote_free(c, op);
|
||||
@ -1048,7 +1054,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
|
||||
(*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
|
||||
bch2_bio_map(&(*rbio)->bio, NULL);
|
||||
|
||||
if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
|
||||
if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
|
||||
goto err;
|
||||
|
||||
(*rbio)->bounce = true;
|
||||
@ -1174,7 +1180,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
|
||||
|
||||
static void bch2_rbio_done(struct bch_read_bio *rbio)
|
||||
{
|
||||
bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
|
||||
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
|
||||
rbio->start_time);
|
||||
bio_endio(&rbio->bio);
|
||||
}
|
||||
|
||||
@ -1486,7 +1493,7 @@ csum_err:
|
||||
}
|
||||
|
||||
bch2_dev_io_error(ca,
|
||||
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
|
||||
"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
|
||||
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
|
||||
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
|
||||
csum.hi, csum.lo, crc.csum_type);
|
||||
|
@ -365,6 +365,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
|
||||
ssize_t bch2_journal_print_debug(struct journal *, char *);
|
||||
ssize_t bch2_journal_print_pins(struct journal *, char *);
|
||||
|
||||
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
|
||||
unsigned nr);
|
||||
int bch2_dev_journal_alloc(struct bch_dev *);
|
||||
|
||||
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
|
||||
|
@ -324,7 +324,7 @@ struct jset_entry_ops {
|
||||
struct jset_entry *, int);
|
||||
};
|
||||
|
||||
const struct jset_entry_ops bch2_jset_entry_ops[] = {
|
||||
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
|
||||
#define x(f, nr) \
|
||||
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
|
||||
.validate = journal_entry_validate_##f, \
|
||||
@ -696,6 +696,7 @@ out:
|
||||
kvpfree(buf.data, buf.size);
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
closure_return(cl);
|
||||
return;
|
||||
err:
|
||||
mutex_lock(&jlist->lock);
|
||||
jlist->ret = ret;
|
||||
@ -716,19 +717,6 @@ void bch2_journal_entries_free(struct list_head *list)
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool journal_has_keys(struct list_head *list)
|
||||
{
|
||||
struct journal_replay *i;
|
||||
struct jset_entry *entry;
|
||||
struct bkey_i *k, *_n;
|
||||
|
||||
list_for_each_entry(i, list, list)
|
||||
for_each_jset_key(k, _n, entry, &i->j)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
{
|
||||
struct journal *j = &c->journal;
|
||||
@ -737,8 +725,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
struct journal_entry_pin_list *p;
|
||||
struct bch_dev *ca;
|
||||
u64 cur_seq, end_seq, seq;
|
||||
unsigned iter, keys = 0, entries = 0;
|
||||
size_t nr;
|
||||
unsigned iter;
|
||||
size_t entries = 0;
|
||||
u64 nr, keys = 0;
|
||||
bool degraded = false;
|
||||
int ret = 0;
|
||||
|
||||
@ -772,9 +761,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
return BCH_FSCK_REPAIR_IMPOSSIBLE;
|
||||
}
|
||||
|
||||
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
|
||||
"filesystem marked clean but journal has keys to replay");
|
||||
|
||||
list_for_each_entry(i, list, list) {
|
||||
ret = jset_validate_entries(c, &i->j, READ);
|
||||
if (ret)
|
||||
@ -797,15 +783,27 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
}
|
||||
}
|
||||
|
||||
list_for_each_entry(i, list, list) {
|
||||
struct jset_entry *entry;
|
||||
struct bkey_i *k, *_n;
|
||||
|
||||
for_each_jset_key(k, _n, entry, &i->j)
|
||||
keys++;
|
||||
}
|
||||
|
||||
i = list_last_entry(list, struct journal_replay, list);
|
||||
|
||||
nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
|
||||
|
||||
fsck_err_on(c->sb.clean && (keys || nr > 1), c,
|
||||
"filesystem marked clean but journal not empty (%llu keys in %llu entries)",
|
||||
keys, nr);
|
||||
|
||||
if (nr > j->pin.size) {
|
||||
free_fifo(&j->pin);
|
||||
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
|
||||
if (!j->pin.data) {
|
||||
bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
|
||||
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
@ -844,8 +842,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
struct journal_replay, list)->j.seq);
|
||||
|
||||
list_for_each_entry(i, list, list) {
|
||||
struct jset_entry *entry;
|
||||
struct bkey_i *k, *_n;
|
||||
bool blacklisted;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
@ -867,13 +863,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
|
||||
journal_last_seq(j), end_seq);
|
||||
|
||||
cur_seq = le64_to_cpu(i->j.seq) + 1;
|
||||
|
||||
for_each_jset_key(k, _n, entry, &i->j)
|
||||
keys++;
|
||||
entries++;
|
||||
}
|
||||
|
||||
bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
|
||||
bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
|
||||
keys, entries, journal_cur_seq(j));
|
||||
fsck_err:
|
||||
return ret;
|
||||
@ -1361,6 +1354,7 @@ void bch2_journal_write(struct closure *cl)
|
||||
bch_err(c, "Unable to allocate journal write");
|
||||
bch2_fatal_error(c);
|
||||
continue_at(cl, journal_write_done, system_highpri_wq);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1417,6 +1411,7 @@ no_io:
|
||||
ptr->offset += sectors;
|
||||
|
||||
continue_at(cl, journal_write_done, system_highpri_wq);
|
||||
return;
|
||||
err:
|
||||
bch2_inconsistent_error(c);
|
||||
continue_at(cl, journal_write_done, system_highpri_wq);
|
||||
|
@ -247,7 +247,7 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
|
||||
if (!bl->nr_entries ||
|
||||
is_power_of_2(bl->nr_entries)) {
|
||||
n = krealloc(bl->entries,
|
||||
max(bl->nr_entries * 2, 8UL) * sizeof(*n),
|
||||
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
|
||||
GFP_KERNEL);
|
||||
if (!n) {
|
||||
ret = -ENOMEM;
|
||||
|
@ -55,9 +55,6 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
|
||||
_k != (_keylist)->top; \
|
||||
_k = bkey_next(_k))
|
||||
|
||||
#define keylist_single(k) \
|
||||
((struct keylist) { .keys = k, .top = bkey_next(k) })
|
||||
|
||||
static inline u64 keylist_sectors(struct keylist *keys)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
|
@ -306,8 +306,11 @@ static void move_write(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
if (likely(!io->rbio.bio.bi_status &&
|
||||
!io->rbio.hole)) {
|
||||
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
return;
|
||||
}
|
||||
|
||||
bch2_migrate_read_done(&io->write, &io->rbio);
|
||||
|
||||
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
|
||||
@ -315,9 +318,6 @@ static void move_write(struct closure *cl)
|
||||
continue_at(cl, move_write_done, NULL);
|
||||
}
|
||||
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
}
|
||||
|
||||
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
|
||||
{
|
||||
struct moving_io *io =
|
||||
@ -411,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
|
||||
|
||||
bch2_bio_map(&io->write.op.wbio.bio, NULL);
|
||||
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
|
||||
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
|
||||
goto err_free;
|
||||
|
||||
io->rbio.opts = io_opts;
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include "btree_iter.h"
|
||||
#include "buckets.h"
|
||||
#include "io_types.h"
|
||||
#include "move_types.h"
|
||||
|
||||
struct bch_read_bio;
|
||||
struct moving_context;
|
||||
@ -48,16 +49,6 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
|
||||
enum bkey_type, struct bkey_s_c_extent,
|
||||
struct bch_io_opts *, struct data_opts *);
|
||||
|
||||
struct bch_move_stats {
|
||||
enum bch_data_type data_type;
|
||||
struct btree_iter iter;
|
||||
|
||||
atomic64_t keys_moved;
|
||||
atomic64_t sectors_moved;
|
||||
atomic64_t sectors_seen;
|
||||
atomic64_t sectors_raced;
|
||||
};
|
||||
|
||||
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
|
||||
struct write_point_specifier,
|
||||
struct bpos, struct bpos,
|
||||
|
14
libbcachefs/move_types.h
Normal file
@@ -0,0 +1,14 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H

struct bch_move_stats {
	enum bch_data_type	data_type;
	struct btree_iter	iter;

	atomic64_t		keys_moved;
	atomic64_t		sectors_moved;
	atomic64_t		sectors_seen;
	atomic64_t		sectors_raced;
};

#endif /* _BCACHEFS_MOVE_TYPES_H */
@ -241,7 +241,8 @@ static int bch2_copygc_thread(void *arg)
|
||||
ca->mi.bucket_size;
|
||||
if (available > reserve) {
|
||||
next = last + available - reserve;
|
||||
bch2_kthread_io_clock_wait(clock, next);
|
||||
bch2_kthread_io_clock_wait(clock, next,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -252,7 +253,8 @@ static int bch2_copygc_thread(void *arg)
|
||||
fragmented = usage.sectors_fragmented;
|
||||
if (fragmented < reserve) {
|
||||
next = last + reserve - fragmented;
|
||||
bch2_kthread_io_clock_wait(clock, next);
|
||||
bch2_kthread_io_clock_wait(clock, next,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
341
libbcachefs/rebalance.c
Normal file
@@ -0,0 +1,341 @@
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_iter.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "move.h"
|
||||
#include "rebalance.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
#include <trace/events/bcachefs.h>
|
||||
|
||||
static inline bool rebalance_ptr_pred(struct bch_fs *c,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
if (io_opts->background_target &&
|
||||
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
|
||||
!ptr->cached)
|
||||
return true;
|
||||
|
||||
if (io_opts->background_compression &&
|
||||
crc.compression_type !=
|
||||
bch2_compression_opt_to_type[io_opts->background_compression])
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void bch2_rebalance_add_key(struct bch_fs *c,
|
||||
struct bkey_s_c k,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bkey_s_c_extent e;
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
return;
|
||||
|
||||
if (!io_opts->background_target &&
|
||||
!io_opts->background_compression)
|
||||
return;
|
||||
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (atomic64_add_return(crc.compressed_size,
|
||||
&ca->rebalance_work) ==
|
||||
crc.compressed_size)
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
|
||||
{
|
||||
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
|
||||
sectors)
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
|
||||
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
/* Make sure we have room to add a new pointer: */
|
||||
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
|
||||
BKEY_EXTENT_VAL_U64s_MAX)
|
||||
return DATA_SKIP;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
|
||||
goto found;
|
||||
|
||||
return DATA_SKIP;
|
||||
found:
|
||||
data_opts->target = io_opts->background_target;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
return DATA_ADD_REPLICAS;
|
||||
}
|
||||
|
||||
struct rebalance_work {
|
||||
int dev_most_full_idx;
|
||||
unsigned dev_most_full_percent;
|
||||
u64 dev_most_full_work;
|
||||
u64 dev_most_full_capacity;
|
||||
u64 total_work;
|
||||
};
|
||||
|
||||
static void rebalance_work_accumulate(struct rebalance_work *w,
|
||||
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
|
||||
{
|
||||
unsigned percent_full;
|
||||
u64 work = dev_work + unknown_dev;
|
||||
|
||||
if (work < dev_work || work < unknown_dev)
|
||||
work = U64_MAX;
|
||||
work = min(work, capacity);
|
||||
|
||||
percent_full = div_u64(work * 100, capacity);
|
||||
|
||||
if (percent_full >= w->dev_most_full_percent) {
|
||||
w->dev_most_full_idx = idx;
|
||||
w->dev_most_full_percent = percent_full;
|
||||
w->dev_most_full_work = work;
|
||||
w->dev_most_full_capacity = capacity;
|
||||
}
|
||||
|
||||
if (w->total_work + dev_work >= w->total_work &&
|
||||
w->total_work + dev_work >= dev_work)
|
||||
w->total_work += dev_work;
|
||||
}
|
||||
|
||||
static struct rebalance_work rebalance_work(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
struct rebalance_work ret = { .dev_most_full_idx = -1 };
|
||||
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
rebalance_work_accumulate(&ret,
|
||||
atomic64_read(&ca->rebalance_work),
|
||||
unknown_dev,
|
||||
bucket_to_sector(ca, ca->mi.nbuckets -
|
||||
ca->mi.first_bucket),
|
||||
i);
|
||||
|
||||
rebalance_work_accumulate(&ret,
|
||||
unknown_dev, 0, c->capacity, -1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void rebalance_work_reset(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
atomic64_set(&ca->rebalance_work, 0);
|
||||
|
||||
atomic64_set(&c->rebalance.work_unknown_dev, 0);
|
||||
}
|
||||
|
||||
static unsigned long curr_cputime(void)
|
||||
{
|
||||
u64 utime, stime;
|
||||
|
||||
task_cputime_adjusted(current, &utime, &stime);
|
||||
return nsecs_to_jiffies(utime + stime);
|
||||
}
|
||||
|
||||
static int bch2_rebalance_thread(void *arg)
|
||||
{
|
||||
struct bch_fs *c = arg;
|
||||
struct bch_fs_rebalance *r = &c->rebalance;
|
||||
struct io_clock *clock = &c->io_clock[WRITE];
|
||||
struct rebalance_work w, p;
|
||||
unsigned long start, prev_start;
|
||||
unsigned long prev_run_time, prev_run_cputime;
|
||||
unsigned long cputime, prev_cputime;
|
||||
unsigned long io_start;
|
||||
long throttle;
|
||||
|
||||
set_freezable();
|
||||
|
||||
io_start = atomic_long_read(&clock->now);
|
||||
p = rebalance_work(c);
|
||||
prev_start = jiffies;
|
||||
prev_cputime = curr_cputime();
|
||||
|
||||
while (!kthread_wait_freezable(r->enabled)) {
|
||||
start = jiffies;
|
||||
cputime = curr_cputime();
|
||||
|
||||
prev_run_time = start - prev_start;
|
||||
prev_run_cputime = cputime - prev_cputime;
|
||||
|
||||
w = rebalance_work(c);
|
||||
BUG_ON(!w.dev_most_full_capacity);
|
||||
|
||||
if (!w.total_work) {
|
||||
r->state = REBALANCE_WAITING;
|
||||
kthread_wait_freezable(rebalance_work(c).total_work);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there isn't much work to do, throttle cpu usage:
|
||||
*/
|
||||
throttle = prev_run_cputime * 100 /
|
||||
max(1U, w.dev_most_full_percent) -
|
||||
prev_run_time;
|
||||
|
||||
if (w.dev_most_full_percent < 20 && throttle > 0) {
|
||||
r->state = REBALANCE_THROTTLED;
|
||||
r->throttled_until_iotime = io_start +
|
||||
div_u64(w.dev_most_full_capacity *
|
||||
(20 - w.dev_most_full_percent),
|
||||
50);
|
||||
r->throttled_until_cputime = start + throttle;
|
||||
|
||||
bch2_kthread_io_clock_wait(clock,
|
||||
r->throttled_until_iotime,
|
||||
throttle);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* minimum 1 mb/sec: */
|
||||
r->pd.rate.rate =
|
||||
max_t(u64, 1 << 11,
|
||||
r->pd.rate.rate *
|
||||
max(p.dev_most_full_percent, 1U) /
|
||||
max(w.dev_most_full_percent, 1U));
|
||||
|
||||
io_start = atomic_long_read(&clock->now);
|
||||
p = w;
|
||||
prev_start = start;
|
||||
prev_cputime = cputime;
|
||||
|
||||
r->state = REBALANCE_RUNNING;
|
||||
memset(&r->move_stats, 0, sizeof(r->move_stats));
|
||||
rebalance_work_reset(c);
|
||||
|
||||
bch2_move_data(c,
|
||||
/* ratelimiting disabled for now */
|
||||
NULL, /* &r->pd.rate, */
|
||||
writepoint_ptr(&c->rebalance_write_point),
|
||||
POS_MIN, POS_MAX,
|
||||
rebalance_pred, NULL,
|
||||
&r->move_stats);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
|
||||
{
|
||||
char *out = buf, *end = out + PAGE_SIZE;
|
||||
struct bch_fs_rebalance *r = &c->rebalance;
|
||||
struct rebalance_work w = rebalance_work(c);
|
||||
char h1[21], h2[21];
|
||||
|
||||
bch2_hprint(h1, w.dev_most_full_work << 9);
|
||||
bch2_hprint(h2, w.dev_most_full_capacity << 9);
|
||||
out += scnprintf(out, end - out,
|
||||
"fullest_dev (%i):\t%s/%s\n",
|
||||
w.dev_most_full_idx, h1, h2);
|
||||
|
||||
bch2_hprint(h1, w.total_work << 9);
|
||||
bch2_hprint(h2, c->capacity << 9);
|
||||
out += scnprintf(out, end - out,
|
||||
"total work:\t\t%s/%s\n",
|
||||
h1, h2);
|
||||
|
||||
out += scnprintf(out, end - out,
|
||||
"rate:\t\t\t%u\n",
|
||||
r->pd.rate.rate);
|
||||
|
||||
switch (r->state) {
|
||||
case REBALANCE_WAITING:
|
||||
out += scnprintf(out, end - out, "waiting\n");
|
||||
break;
|
||||
case REBALANCE_THROTTLED:
|
||||
bch2_hprint(h1,
|
||||
(r->throttled_until_iotime -
|
||||
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
|
||||
out += scnprintf(out, end - out,
|
||||
"throttled for %lu sec or %s io\n",
|
||||
(r->throttled_until_cputime - jiffies) / HZ,
|
||||
h1);
|
||||
break;
|
||||
case REBALANCE_RUNNING:
|
||||
out += scnprintf(out, end - out, "running\n");
|
||||
out += scnprintf(out, end - out, "pos %llu:%llu\n",
|
||||
r->move_stats.iter.pos.inode,
|
||||
r->move_stats.iter.pos.offset);
|
||||
break;
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
void bch2_rebalance_stop(struct bch_fs *c)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
c->rebalance.pd.rate.rate = UINT_MAX;
|
||||
bch2_ratelimit_reset(&c->rebalance.pd.rate);
|
||||
|
||||
p = rcu_dereference_protected(c->rebalance.thread, 1);
|
||||
c->rebalance.thread = NULL;
|
||||
|
||||
if (p) {
|
||||
/* for sychronizing with rebalance_wakeup() */
|
||||
synchronize_rcu();
|
||||
|
||||
kthread_stop(p);
|
||||
put_task_struct(p);
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_rebalance_start(struct bch_fs *c)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
if (c->opts.nochanges)
|
||||
return 0;
|
||||
|
||||
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
|
||||
if (IS_ERR(p))
|
||||
return PTR_ERR(p);
|
||||
|
||||
get_task_struct(p);
|
||||
rcu_assign_pointer(c->rebalance.thread, p);
|
||||
wake_up_process(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_fs_rebalance_init(struct bch_fs *c)
|
||||
{
|
||||
bch2_pd_controller_init(&c->rebalance.pd);
|
||||
|
||||
atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
|
||||
}
|
@ -1,12 +1,14 @@
|
||||
#ifndef _BCACHEFS_TIER_H
|
||||
#define _BCACHEFS_TIER_H
|
||||
#ifndef _BCACHEFS_REBALANCE_H
|
||||
#define _BCACHEFS_REBALANCE_H
|
||||
|
||||
#include "rebalance_types.h"
|
||||
|
||||
static inline void rebalance_wakeup(struct bch_fs *c)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
rcu_read_lock();
|
||||
p = rcu_dereference(c->rebalance_thread);
|
||||
p = rcu_dereference(c->rebalance.thread);
|
||||
if (p)
|
||||
wake_up_process(p);
|
||||
rcu_read_unlock();
|
||||
@ -16,8 +18,10 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
|
||||
struct bch_io_opts *);
|
||||
void bch2_rebalance_add_work(struct bch_fs *, u64);
|
||||
|
||||
ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
|
||||
|
||||
void bch2_rebalance_stop(struct bch_fs *);
|
||||
int bch2_rebalance_start(struct bch_fs *);
|
||||
void bch2_fs_rebalance_init(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_TIER_H */
|
||||
#endif /* _BCACHEFS_REBALANCE_H */
|
26
libbcachefs/rebalance_types.h
Normal file
@@ -0,0 +1,26 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H

#include "move_types.h"

enum rebalance_state {
	REBALANCE_WAITING,
	REBALANCE_THROTTLED,
	REBALANCE_RUNNING,
};

struct bch_fs_rebalance {
	struct task_struct __rcu *thread;
	struct bch_pd_controller pd;

	atomic64_t		work_unknown_dev;

	enum rebalance_state	state;
	unsigned long		throttled_until_iotime;
	unsigned long		throttled_until_cputime;
	struct bch_move_stats	move_stats;

	unsigned		enabled:1;
};

#endif /* _BCACHEFS_REBALANCE_TYPES_H */
@ -146,6 +146,8 @@ struct six_lock_waiter {
|
||||
/* This is probably up there with the more evil things I've done */
|
||||
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
|
||||
|
||||
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
|
||||
|
||||
static inline int six_can_spin_on_owner(struct six_lock *lock)
|
||||
{
|
||||
struct task_struct *owner;
|
||||
@ -257,6 +259,15 @@ fail:
|
||||
return false;
|
||||
}
|
||||
|
||||
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
|
||||
|
||||
static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
noinline
|
||||
static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
|
||||
{
|
||||
|
@@ -624,7 +624,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
	bio_set_dev(bio, ca->disk_sb.bdev);
	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
	bio->bi_iter.bi_size	=
		roundup(vstruct_bytes(sb),
		roundup((size_t) vstruct_bytes(sb),
			bdev_logical_block_size(ca->disk_sb.bdev));
	bio->bi_end_io		= write_super_endio;
	bio->bi_private		= ca;
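The only change in this hunk is the (size_t) cast. Presumably vstruct_bytes() evaluates to a u64, and feeding that into roundup() makes the division inside the macro a 64-bit division, which a 32-bit kernel build turns into a libgcc helper call (__udivdi3) that is not provided, so the build fails at link time; casting to size_t keeps the arithmetic at the native word size. A user-space sketch of what the cast changes (the link failure itself cannot be reproduced outside the kernel):

#include <stdint.h>
#include <stdio.h>

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	uint64_t vbytes = 4097;		/* stand-in for vstruct_bytes(sb) */
	unsigned block = 512;		/* bdev_logical_block_size() */

	/* without the cast the arithmetic (and division) is 64-bit wide;
	 * with it, it is done in size_t, i.e. the native word size */
	printf("without cast: %llu (operand width %zu bytes)\n",
	       (unsigned long long) roundup(vbytes, block),
	       sizeof(roundup(vbytes, block)));
	printf("with cast:    %zu (operand width %zu bytes)\n",
	       roundup((size_t) vbytes, block),
	       sizeof(roundup((size_t) vbytes, block)));
	return 0;
}
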
@ -73,11 +73,6 @@ static inline __u64 jset_magic(struct bch_fs *c)
|
||||
return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
|
||||
}
|
||||
|
||||
static inline __u64 pset_magic(struct bch_fs *c)
|
||||
{
|
||||
return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC);
|
||||
}
|
||||
|
||||
static inline __u64 bset_magic(struct bch_fs *c)
|
||||
{
|
||||
return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
|
||||
@ -136,4 +131,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
|
||||
};
|
||||
}
|
||||
|
||||
size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
|
||||
struct bch_sb_field *);
|
||||
|
||||
#endif /* _BCACHEFS_SUPER_IO_H */
|
||||
|
@ -33,11 +33,11 @@
|
||||
#include "migrate.h"
|
||||
#include "movinggc.h"
|
||||
#include "quota.h"
|
||||
#include "rebalance.h"
|
||||
#include "replicas.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
#include "sysfs.h"
|
||||
#include "tier.h"
|
||||
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/blkdev.h>
|
||||
@ -398,10 +398,10 @@ err:
|
||||
|
||||
static void bch2_fs_free(struct bch_fs *c)
|
||||
{
|
||||
#define BCH_TIME_STAT(name) \
|
||||
bch2_time_stats_exit(&c->name##_time);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
||||
bch2_time_stats_exit(&c->times[i]);
|
||||
|
||||
bch2_fs_quota_exit(c);
|
||||
bch2_fs_fsio_exit(c);
|
||||
@ -565,10 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
|
||||
init_rwsem(&c->gc_lock);
|
||||
|
||||
#define BCH_TIME_STAT(name) \
|
||||
bch2_time_stats_init(&c->name##_time);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
||||
bch2_time_stats_init(&c->times[i]);
|
||||
|
||||
bch2_fs_allocator_init(c);
|
||||
bch2_fs_rebalance_init(c);
|
||||
@ -592,14 +590,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
seqcount_init(&c->gc_pos_lock);
|
||||
|
||||
c->copy_gc_enabled = 1;
|
||||
c->rebalance_enabled = 1;
|
||||
c->rebalance_percent = 10;
|
||||
c->rebalance.enabled = 1;
|
||||
c->promote_whole_extents = true;
|
||||
|
||||
c->journal.write_time = &c->journal_write_time;
|
||||
c->journal.delay_time = &c->journal_delay_time;
|
||||
c->journal.blocked_time = &c->journal_blocked_time;
|
||||
c->journal.flush_seq_time = &c->journal_flush_seq_time;
|
||||
c->journal.write_time = &c->times[BCH_TIME_journal_write];
|
||||
c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
|
||||
c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
|
||||
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
|
||||
|
||||
bch2_fs_btree_cache_init_early(&c->btree_cache);
|
||||
|
||||
@ -647,7 +644,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
BIOSET_NEED_BVECS) ||
|
||||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
|
||||
lg_lock_init(&c->usage_lock) ||
|
||||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
|
||||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
|
||||
btree_bytes(c)) ||
|
||||
bch2_io_clock_init(&c->io_clock[READ]) ||
|
||||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
|
||||
bch2_fs_journal_init(&c->journal) ||
|
||||
|
@ -24,9 +24,9 @@
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "opts.h"
|
||||
#include "rebalance.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
#include "tier.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/sort.h>
|
||||
@ -183,8 +183,8 @@ rw_attribute(copy_gc_enabled);
|
||||
sysfs_pd_controller_attribute(copy_gc);
|
||||
|
||||
rw_attribute(rebalance_enabled);
|
||||
rw_attribute(rebalance_percent);
|
||||
sysfs_pd_controller_attribute(rebalance);
|
||||
read_attribute(rebalance_work);
|
||||
rw_attribute(promote_whole_extents);
|
||||
|
||||
rw_attribute(pd_controllers_update_seconds);
|
||||
@ -198,11 +198,11 @@ read_attribute(data_replicas_have);
|
||||
BCH_DEBUG_PARAMS()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
||||
#define BCH_TIME_STAT(_name) \
|
||||
#define x(_name) \
|
||||
static struct attribute sysfs_time_stat_##_name = \
|
||||
{ .name = #_name, .mode = S_IRUGO };
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
#undef x
|
||||
|
||||
static struct attribute sysfs_state_rw = {
|
||||
.name = "state",
|
||||
@ -340,9 +340,11 @@ SHOW(bch2_fs)
|
||||
sysfs_print(pd_controllers_update_seconds,
|
||||
c->pd_controllers_update_seconds);
|
||||
|
||||
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
|
||||
sysfs_print(rebalance_percent, c->rebalance_percent);
|
||||
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
|
||||
sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
|
||||
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
|
||||
|
||||
if (attr == &sysfs_rebalance_work)
|
||||
return bch2_rebalance_work_show(c, buf);
|
||||
|
||||
sysfs_print(promote_whole_extents, c->promote_whole_extents);
|
||||
|
||||
@ -404,7 +406,7 @@ STORE(__bch2_fs)
|
||||
}
|
||||
|
||||
if (attr == &sysfs_rebalance_enabled) {
|
||||
ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
|
||||
ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
|
||||
?: (ssize_t) size;
|
||||
|
||||
rebalance_wakeup(c);
|
||||
@ -413,9 +415,7 @@ STORE(__bch2_fs)
|
||||
|
||||
sysfs_strtoul(pd_controllers_update_seconds,
|
||||
c->pd_controllers_update_seconds);
|
||||
|
||||
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
|
||||
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
|
||||
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
|
||||
|
||||
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
|
||||
|
||||
@ -474,7 +474,6 @@ struct attribute *bch2_fs_files[] = {
|
||||
&sysfs_journal_write_delay_ms,
|
||||
&sysfs_journal_reclaim_delay_ms,
|
||||
|
||||
&sysfs_rebalance_percent,
|
||||
&sysfs_promote_whole_extents,
|
||||
|
||||
&sysfs_compression_stats,
|
||||
@ -513,8 +512,11 @@ struct attribute *bch2_fs_internal_files[] = {
|
||||
&sysfs_prune_cache,
|
||||
|
||||
&sysfs_copy_gc_enabled,
|
||||
|
||||
&sysfs_rebalance_enabled,
|
||||
&sysfs_rebalance_work,
|
||||
sysfs_pd_controller_files(rebalance),
|
||||
|
||||
&sysfs_internal_uuid,
|
||||
|
||||
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
|
||||
@ -613,11 +615,12 @@ SHOW(bch2_fs_time_stats)
|
||||
{
|
||||
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
|
||||
|
||||
#define BCH_TIME_STAT(name) \
|
||||
#define x(name) \
|
||||
if (attr == &sysfs_time_stat_##name) \
|
||||
return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
|
||||
return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
|
||||
buf, PAGE_SIZE);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
#undef x
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -629,10 +632,10 @@ STORE(bch2_fs_time_stats)
|
||||
SYSFS_OPS(bch2_fs_time_stats);
|
||||
|
||||
struct attribute *bch2_fs_time_stats_files[] = {
|
||||
#define BCH_TIME_STAT(name) \
|
||||
#define x(name) \
|
||||
&sysfs_time_stat_##name,
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
#undef x
|
||||
NULL
|
||||
};
|
||||
|
||||
|
@ -1,259 +0,0 @@
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_iter.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "move.h"
|
||||
#include "super-io.h"
|
||||
#include "tier.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
#include <trace/events/bcachefs.h>
|
||||
|
||||
static inline bool rebalance_ptr_pred(struct bch_fs *c,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (io_opts->background_target &&
|
||||
!dev_in_target(ca, io_opts->background_target) &&
|
||||
!ptr->cached)
|
||||
return true;
|
||||
|
||||
if (io_opts->background_compression &&
|
||||
crc.compression_type !=
|
||||
bch2_compression_opt_to_type[io_opts->background_compression])
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void bch2_rebalance_add_key(struct bch_fs *c,
|
||||
struct bkey_s_c k,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bkey_s_c_extent e;
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
return;
|
||||
|
||||
if (!io_opts->background_target &&
|
||||
!io_opts->background_compression)
|
||||
return;
|
||||
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (!atomic64_add_return(crc.compressed_size,
|
||||
&ca->rebalance_work))
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
|
||||
{
|
||||
if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
|
||||
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
/* Make sure we have room to add a new pointer: */
|
||||
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
|
||||
BKEY_EXTENT_VAL_U64s_MAX)
|
||||
return DATA_SKIP;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
|
||||
goto found;
|
||||
|
||||
return DATA_SKIP;
|
||||
found:
|
||||
data_opts->target = io_opts->background_target;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
return DATA_ADD_REPLICAS;
|
||||
}
|
||||
|
||||
struct rebalance_work {
|
||||
unsigned dev_most_full_percent;
|
||||
u64 dev_most_full_work;
|
||||
u64 dev_most_full_capacity;
|
||||
u64 total_work;
|
||||
};
|
||||
|
||||
static struct rebalance_work rebalance_work(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
struct rebalance_work ret = { 0 };
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i) {
|
||||
u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
|
||||
ca->mi.first_bucket);
|
||||
u64 work = atomic64_read(&ca->rebalance_work) +
|
||||
atomic64_read(&c->rebalance_work_unknown_dev);
|
||||
unsigned percent_full = div_u64(work * 100, capacity);
|
||||
|
||||
if (percent_full > ret.dev_most_full_percent) {
|
||||
ret.dev_most_full_percent = percent_full;
|
||||
ret.dev_most_full_work = work;
|
||||
ret.dev_most_full_capacity = capacity;
|
||||
}
|
||||
|
||||
ret.total_work += atomic64_read(&ca->rebalance_work);
|
||||
}
|
||||
|
||||
ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void rebalance_work_reset(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
atomic64_set(&ca->rebalance_work, 0);
|
||||
|
||||
atomic64_set(&c->rebalance_work_unknown_dev, 0);
|
||||
}
|
||||
|
||||
static unsigned long curr_cputime(void)
|
||||
{
|
||||
u64 utime, stime;
|
||||
|
||||
task_cputime_adjusted(current, &utime, &stime);
|
||||
return nsecs_to_jiffies(utime + stime);
|
||||
}
|
||||
|
||||
static int bch2_rebalance_thread(void *arg)
|
||||
{
|
||||
struct bch_fs *c = arg;
|
||||
struct io_clock *clock = &c->io_clock[WRITE];
|
||||
struct rebalance_work w, p;
|
||||
unsigned long start, prev_start;
|
||||
unsigned long prev_run_time, prev_run_cputime;
|
||||
unsigned long cputime, prev_cputime;
|
||||
|
||||
set_freezable();
|
||||
|
||||
p = rebalance_work(c);
|
||||
prev_start = jiffies;
|
||||
prev_cputime = curr_cputime();
|
||||
|
||||
while (!kthread_wait_freezable(c->rebalance_enabled)) {
|
||||
struct bch_move_stats move_stats = { 0 };
|
||||
|
||||
w = rebalance_work(c);
|
||||
start = jiffies;
|
||||
cputime = curr_cputime();
|
||||
|
||||
prev_run_time = start - prev_start;
|
||||
prev_run_cputime = cputime - prev_cputime;
|
||||
|
||||
if (!w.total_work) {
|
||||
kthread_wait_freezable(rebalance_work(c).total_work);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (w.dev_most_full_percent < 20 &&
|
||||
prev_run_cputime * 5 > prev_run_time) {
|
||||
if (w.dev_most_full_capacity) {
|
||||
bch2_kthread_io_clock_wait(clock,
|
||||
atomic_long_read(&clock->now) +
|
||||
div_u64(w.dev_most_full_capacity, 5));
|
||||
} else {
|
||||
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
if (kthread_should_stop())
|
||||
break;
|
||||
|
||||
schedule_timeout(prev_run_cputime * 5 -
|
||||
prev_run_time);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* minimum 1 mb/sec: */
|
||||
c->rebalance_pd.rate.rate =
|
||||
max_t(u64, 1 << 11,
|
||||
c->rebalance_pd.rate.rate *
|
||||
max(p.dev_most_full_percent, 1U) /
|
||||
max(w.dev_most_full_percent, 1U));
|
||||
|
||||
rebalance_work_reset(c);
|
||||
|
||||
bch2_move_data(c, &c->rebalance_pd.rate,
|
||||
writepoint_ptr(&c->rebalance_write_point),
|
||||
POS_MIN, POS_MAX,
|
||||
rebalance_pred, NULL,
|
||||
&move_stats);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_rebalance_stop(struct bch_fs *c)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
c->rebalance_pd.rate.rate = UINT_MAX;
|
||||
bch2_ratelimit_reset(&c->rebalance_pd.rate);
|
||||
|
||||
p = c->rebalance_thread;
|
||||
c->rebalance_thread = NULL;
|
||||
|
||||
if (p) {
|
||||
/* for sychronizing with rebalance_wakeup() */
|
||||
synchronize_rcu();
|
||||
|
||||
kthread_stop(p);
|
||||
put_task_struct(p);
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_rebalance_start(struct bch_fs *c)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
if (c->opts.nochanges)
|
||||
return 0;
|
||||
|
||||
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
|
||||
if (IS_ERR(p))
|
||||
return PTR_ERR(p);
|
||||
|
||||
get_task_struct(p);
|
||||
|
||||
rcu_assign_pointer(c->rebalance_thread, p);
|
||||
wake_up_process(c->rebalance_thread);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_fs_rebalance_init(struct bch_fs *c)
|
||||
{
|
||||
bch2_pd_controller_init(&c->rebalance_pd);
|
||||
|
||||
atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
|
||||
}
|
@ -203,7 +203,7 @@ bool bch2_is_zero(const void *_p, size_t n)
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_quantiles_update(struct quantiles *q, u64 v)
|
||||
static void bch2_quantiles_update(struct quantiles *q, u64 v)
|
||||
{
|
||||
unsigned i = 0;
|
||||
|
||||
@ -569,6 +569,23 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
        }
}

int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
{
        int i;
        struct bio_vec *bv;

        bio_for_each_segment_all(bv, bio, i) {
                bv->bv_page = alloc_page(gfp_mask);
                if (!bv->bv_page) {
                        while (--bv >= bio->bi_io_vec)
                                __free_page(bv->bv_page);
                        return -ENOMEM;
                }
        }

        return 0;
}

size_t bch2_rand_range(size_t max)
{
        size_t rand;
@ -771,20 +788,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
        }
}

void mempool_free_vp(void *element, void *pool_data)
static void mempool_free_vp(void *element, void *pool_data)
{
        size_t size = (size_t) pool_data;

        vpfree(element, size);
}

void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
{
        size_t size = (size_t) pool_data;

        return vpmalloc(size, gfp_mask);
}

int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
        return size < PAGE_SIZE
                ? mempool_init_kmalloc_pool(pool, min_nr, size)
                : mempool_init(pool, min_nr, mempool_alloc_vp,
                               mempool_free_vp, (void *) size);
}

#if 0
void eytzinger1_test(void)
{
@ -68,9 +68,9 @@ struct closure;
#define __flatten
#endif

#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define CPU_BIG_ENDIAN 0
#else
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define CPU_BIG_ENDIAN 1
#endif

@ -113,14 +113,7 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
                : vpmalloc(size, gfp_mask);
}

void mempool_free_vp(void *element, void *pool_data);
void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);

static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
{
        return mempool_init(pool, min_nr, mempool_alloc_vp,
                            mempool_free_vp, (void *) size);
}
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);

#define HEAP(type) \
struct { \
@ -610,6 +603,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}

void bch2_bio_map(struct bio *bio, void *base);
int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);

static inline sector_t bdev_sectors(struct block_device *bdev)
{
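For context on the helper declared above: mempool_init_kvpmalloc_pool() backs a pool with kmalloc for sub-PAGE_SIZE elements and with vpmalloc otherwise. A minimal usage sketch from a hypothetical caller follows; the pool name and element size are made up for illustration, and only mempool_init_kvpmalloc_pool() plus the stock mempool_alloc()/mempool_free() calls are real interfaces.

static mempool_t example_pool;

/* Reserve one 64 KiB element; since 64 KiB >= PAGE_SIZE, allocations for
 * this pool go through vpmalloc/vpfree rather than kmalloc. */
static int example_pool_init(void)
{
        return mempool_init_kvpmalloc_pool(&example_pool, 1, 1 << 16);
}

static void *example_buf_get(void)
{
        return mempool_alloc(&example_pool, GFP_NOFS);
}

static void example_buf_put(void *buf)
{
        mempool_free(buf, &example_pool);
}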
@ -5,8 +5,8 @@
#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "rebalance.h"
#include "str_hash.h"
#include "tier.h"
#include "xattr.h"

#include <linux/dcache.h>
@ -40,14 +40,22 @@ void schedule(void)
                     v, NULL, NULL, 0);
}

static void process_timeout(unsigned long __data)
struct process_timer {
        struct timer_list timer;
        struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
        wake_up_process((struct task_struct *)__data);
        struct process_timer *timeout =
                container_of(t, struct process_timer, timer);

        wake_up_process(timeout->task);
}

long schedule_timeout(long timeout)
{
        struct timer_list timer;
        struct process_timer timer;
        unsigned long expire;

        switch (timeout)
@ -80,10 +88,11 @@ long schedule_timeout(long timeout)

        expire = timeout + jiffies;

        setup_timer(&timer, process_timeout, (unsigned long)current);
        mod_timer(&timer, expire);
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        mod_timer(&timer.timer, expire);
        schedule();
        del_timer_sync(&timer);
        del_timer_sync(&timer.timer);

        timeout = expire - jiffies;
out:
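The shim changes above (and the workqueue hunk further down) follow the standard conversion to the Linux 4.15 timer API: the callback now receives the struct timer_list pointer and recovers its enclosing object with container_of(), instead of casting an opaque unsigned long data argument. Below is a minimal sketch of that pattern with made-up names, not taken from this commit.

#include <linux/timer.h>
#include <linux/jiffies.h>

struct flush_state {
        struct timer_list timer;
        unsigned long nr_flushes;
};

/* New-style callback: the timer itself is passed in, and the containing
 * object is recovered with container_of() (from_timer() is the usual wrapper). */
static void flush_timer_fn(struct timer_list *t)
{
        struct flush_state *s = container_of(t, struct flush_state, timer);

        s->nr_flushes++;
        mod_timer(&s->timer, jiffies + HZ);     /* re-arm one second out */
}

static void flush_state_init(struct flush_state *s)
{
        s->nr_flushes = 0;
        timer_setup(&s->timer, flush_timer_fn, 0);
        mod_timer(&s->timer, jiffies + HZ);
}

timer_setup_on_stack(), used in schedule_timeout() above, is simply the on-stack variant of the timer_setup() call shown here.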
@ -273,7 +273,7 @@ static int timer_thread(void *arg)
                BUG_ON(!timer_running());

                pthread_mutex_unlock(&timer_lock);
                timer->function(timer->data);
                timer->function(timer);
                pthread_mutex_lock(&timer_lock);

                timer_seq++;
@ -55,9 +55,10 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
        return ret;
}

void delayed_work_timer_fn(unsigned long __data)
void delayed_work_timer_fn(struct timer_list *timer)
{
        struct delayed_work *dwork = (struct delayed_work *) __data;
        struct delayed_work *dwork =
                container_of(timer, struct delayed_work, timer);

        pthread_mutex_lock(&wq_lock);
        __queue_work(dwork->wq, &dwork->work);
@ -71,8 +72,7 @@ static void __queue_delayed_work(struct workqueue_struct *wq,
        struct timer_list *timer = &dwork->timer;
        struct work_struct *work = &dwork->work;

        BUG_ON(timer->function != delayed_work_timer_fn ||
               timer->data != (unsigned long)dwork);
        BUG_ON(timer->function != delayed_work_timer_fn);
        BUG_ON(timer_pending(timer));
        BUG_ON(!list_empty(&work->entry));