Update bcachefs sources to 0906b1fb49 bcachefs: fixes for 32 bit/big endian machines

Kent Overstreet 2018-05-17 01:38:57 -04:00
parent 800408be11
commit ff86d47221
65 changed files with 1237 additions and 791 deletions

View File

@ -1 +1 @@
ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
0906b1fb492e8e84f563b192fd8f458af1c1d420

View File

@ -36,10 +36,12 @@ static void usage(void)
" fsck Check an existing filesystem for errors\n"
"\n"
"Startup/shutdown, assembly of multi device filesystems:\n"
#if 0
" assemble Assemble an existing multi device filesystem\n"
" incremental Incrementally assemble an existing multi device filesystem\n"
" run Start a partially assembled filesystem\n"
" stop Stop a running filesystem\n"
#endif
"\n"
"Commands for managing a running filesystem:\n"
" fs usage Show disk usage\n"
@ -150,6 +152,7 @@ int main(int argc, char *argv[])
if (!strcmp(cmd, "fsck"))
return cmd_fsck(argc, argv);
#if 0
if (!strcmp(cmd, "assemble"))
return cmd_assemble(argc, argv);
if (!strcmp(cmd, "incremental"))
@ -158,6 +161,7 @@ int main(int argc, char *argv[])
return cmd_run(argc, argv);
if (!strcmp(cmd, "stop"))
return cmd_stop(argc, argv);
#endif
if (!strcmp(cmd, "fs"))
return fs_cmds(argc, argv);

View File

@ -11,6 +11,7 @@
#include "cmds.h"
#include "libbcachefs.h"
#if 0
int cmd_assemble(int argc, char *argv[])
{
unsigned nr_devs = argc - 1;
@ -26,7 +27,7 @@ int cmd_assemble(int argc, char *argv[])
unsigned i;
for (i = 0; i < nr_devs; i++)
assemble->devs[i] = (__u64) argv[i + 1];
assemble->devs[i] = (unsigned long) argv[i + 1];
xioctl(bcachectl_open(), BCH_IOCTL_ASSEMBLE, assemble);
return 0;
@ -38,9 +39,10 @@ int cmd_incremental(int argc, char *argv[])
die("Please supply exactly one device");
struct bch_ioctl_incremental incremental = {
.dev = (__u64) argv[1],
.dev = (unsigned long) argv[1],
};
xioctl(bcachectl_open(), BCH_IOCTL_INCREMENTAL, &incremental);
return 0;
}
#endif

View File

@ -10,6 +10,7 @@
#include "libbcachefs/bcachefs.h"
#include "libbcachefs/alloc.h"
#include "libbcachefs/bset.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/buckets.h"

View File

@ -15,6 +15,7 @@
#include "cmds.h"
#include "libbcachefs.h"
#if 0
int cmd_run(int argc, char *argv[])
{
return 0;
@ -29,3 +30,4 @@ int cmd_stop(int argc, char *argv[])
xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
return 0;
}
#endif

cmds.h
View File

@ -12,10 +12,12 @@
int cmd_format(int argc, char *argv[]);
int cmd_show_super(int argc, char *argv[]);
#if 0
int cmd_assemble(int argc, char *argv[]);
int cmd_incremental(int argc, char *argv[]);
int cmd_run(int argc, char *argv[]);
int cmd_stop(int argc, char *argv[]);
#endif
int cmd_fs_usage(int argc, char *argv[]);

View File

@ -6,27 +6,22 @@
struct timer_list {
unsigned long expires;
void (*function)(unsigned long);
unsigned long data;
void (*function)(struct timer_list *timer);
bool pending;
};
static inline void init_timer(struct timer_list *timer)
static inline void timer_setup(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags)
{
memset(timer, 0, sizeof(*timer));
timer->function = func;
}
#define __init_timer(_timer, _flags) init_timer(_timer)
#define timer_setup_on_stack(timer, callback, flags) \
timer_setup(timer, callback, flags)
#define __setup_timer(_timer, _fn, _data, _flags) \
do { \
__init_timer((_timer), (_flags)); \
(_timer)->function = (_fn); \
(_timer)->data = (_data); \
} while (0)
#define setup_timer(timer, fn, data) \
__setup_timer((timer), (fn), (data), 0)
#define destroy_timer_on_stack(timer) do {} while (0)
static inline int timer_pending(const struct timer_list *timer)
{
@ -36,8 +31,9 @@ static inline int timer_pending(const struct timer_list *timer)
int del_timer(struct timer_list * timer);
int del_timer_sync(struct timer_list *timer);
#define del_singleshot_timer_sync(timer) del_timer_sync(timer)
int mod_timer(struct timer_list *timer, unsigned long expires);
//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
static inline void add_timer(struct timer_list *timer)
{

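The shim above mirrors the mainline kernel's newer timer API: callbacks receive the struct timer_list pointer itself rather than an opaque unsigned long, and recover their containing object with container_of(). A minimal sketch of how a caller migrates, assuming the shim above; the my_obj/my_timeout names and the one-second delay are illustrative, not from this tree:

    #include <linux/kernel.h>   /* container_of() */
    #include <linux/timer.h>

    struct my_obj {
        struct timer_list timer;
        int work_pending;
    };

    /* Old style: void my_timeout(unsigned long data), with data = (unsigned long) obj.
     * New style: the timer itself is passed in, and the object is recovered from it. */
    static void my_timeout(struct timer_list *t)
    {
        struct my_obj *obj = container_of(t, struct my_obj, timer);

        obj->work_pending = 1;
    }

    static void my_obj_init(struct my_obj *obj)
    {
        obj->work_pending = 0;
        timer_setup(&obj->timer, my_timeout, 0);
        mod_timer(&obj->timer, jiffies + HZ);   /* fire roughly one second from now */
    }

The mainline kernel also provides from_timer(), a thin wrapper around container_of(); the shim here gets by without it.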
View File

@ -8,7 +8,7 @@ struct task_struct;
struct workqueue_struct;
struct work_struct;
typedef void (*work_func_t)(struct work_struct *work);
void delayed_work_timer_fn(unsigned long __data);
void delayed_work_timer_fn(struct timer_list *);
#define work_data_bits(work) ((unsigned long *)(&(work)->data))
@ -44,9 +44,7 @@ struct delayed_work {
#define INIT_DELAYED_WORK(_work, _func) \
do { \
INIT_WORK(&(_work)->work, (_func)); \
__setup_timer(&(_work)->timer, delayed_work_timer_fn, \
(unsigned long)(_work), \
TIMER_IRQSAFE); \
timer_setup(&(_work)->timer, delayed_work_timer_fn, 0); \
} while (0)
static inline struct delayed_work *to_delayed_work(struct work_struct *work)

View File

@ -1393,13 +1393,11 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
{
int i;
for (i = wp->first_ptr - 1; i >= 0; --i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
if (dev_in_target(ca, target) == in_target)
for (i = wp->first_ptr - 1; i >= 0; --i)
if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
target) == in_target)
writepoint_drop_ptr(c, wp, i);
}
}
static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
{
@ -1555,7 +1553,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
/* does writepoint have ptrs we don't want to use? */
if (target)
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
@ -1590,7 +1588,8 @@ alloc_done:
* one in the target we want:
*/
if (cache_idx >= 0) {
if (!dev_in_target(ca, target)) {
if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
target)) {
writepoint_drop_ptr(c, wp, i);
} else {
writepoint_drop_ptr(c, wp, cache_idx);
@ -1621,7 +1620,7 @@ alloc_done:
if (ca->mi.durability &&
ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
!dev_idx_in_target(c, ob->ptr.dev, target)) {
!bch2_dev_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
nr_ptrs_effective -= ca->mi.durability;
@ -1890,8 +1889,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
struct task_struct *p = ca->alloc_thread;
struct task_struct *p;
p = rcu_dereference_protected(ca->alloc_thread, 1);
ca->alloc_thread = NULL;
/*
@ -1926,7 +1926,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return PTR_ERR(p);
get_task_struct(p);
ca->alloc_thread = p;
rcu_assign_pointer(ca->alloc_thread, p);
wake_up_process(p);
return 0;
}
@ -2099,7 +2099,7 @@ again:
if (btree_node_dirty(b) && (!b->written || b->level)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;

View File

@ -103,7 +103,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
struct task_struct *p;
rcu_read_lock();
if ((p = READ_ONCE(ca->alloc_thread)))
p = rcu_dereference(ca->alloc_thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
}
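The allocator thread pointer becomes an __rcu pointer: the starter publishes it with rcu_assign_pointer() after taking a task reference, and bch2_wake_allocator() reads it under rcu_read_lock() with rcu_dereference(), so a waker never chases a stale or half-published task. A sketch of the same pattern outside bcachefs; my_dev and the my_worker_* names are illustrative:

    #include <linux/kthread.h>
    #include <linux/rcupdate.h>
    #include <linux/sched.h>
    #include <linux/sched/task.h>

    struct my_dev {
        struct task_struct __rcu *worker;
    };

    static int my_worker_fn(void *arg)
    {
        while (!kthread_should_stop())
            schedule_timeout_interruptible(HZ);   /* placeholder work loop */
        return 0;
    }

    static int my_worker_start(struct my_dev *d)
    {
        struct task_struct *p = kthread_create(my_worker_fn, d, "my_worker");

        if (IS_ERR(p))
            return PTR_ERR(p);

        get_task_struct(p);                 /* hold a reference for our pointer */
        rcu_assign_pointer(d->worker, p);   /* publish only once fully set up */
        wake_up_process(p);
        return 0;
    }

    static void my_worker_wake(struct my_dev *d)
    {
        struct task_struct *p;

        rcu_read_lock();
        p = rcu_dereference(d->worker);     /* may be NULL if not running */
        if (p)
            wake_up_process(p);
        rcu_read_unlock();
    }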

View File

@ -197,7 +197,6 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "bset.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"
@ -272,25 +271,37 @@ do { \
#endif
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc) \
BCH_TIME_STAT(btree_gc) \
BCH_TIME_STAT(btree_split) \
BCH_TIME_STAT(btree_sort) \
BCH_TIME_STAT(btree_read) \
BCH_TIME_STAT(data_write) \
BCH_TIME_STAT(data_read) \
BCH_TIME_STAT(data_promote) \
BCH_TIME_STAT(journal_write) \
BCH_TIME_STAT(journal_delay) \
BCH_TIME_STAT(journal_blocked) \
BCH_TIME_STAT(journal_flush_seq)
x(btree_node_mem_alloc) \
x(btree_gc) \
x(btree_split) \
x(btree_sort) \
x(btree_read) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
x(journal_write) \
x(journal_delay) \
x(journal_blocked) \
x(journal_flush_seq)
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
BCH_TIME_STATS()
#undef x
BCH_TIME_STAT_NR
};
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
#include "super_types.h"
/*
@ -372,7 +383,7 @@ struct bch_dev {
struct bch_dev_usage usage_cached;
/* Allocator: */
struct task_struct *alloc_thread;
struct task_struct __rcu *alloc_thread;
/*
* free: Buckets that are ready to be used
@ -447,7 +458,6 @@ enum {
/* shutdown: */
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
/* errors: */
BCH_FS_ERROR,
@ -570,12 +580,6 @@ struct bch_fs {
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
/* REBALANCE */
struct task_struct *rebalance_thread;
struct bch_pd_controller rebalance_pd;
atomic64_t rebalance_work_unknown_dev;
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
@ -664,6 +668,9 @@ struct bch_fs {
atomic64_t key_version;
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
@ -714,18 +721,13 @@ struct bch_fs {
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
unsigned rebalance_enabled:1;
unsigned rebalance_percent;
bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(name) \
struct time_stats name##_time;
BCH_TIME_STATS()
#undef BCH_TIME_STAT
struct time_stats times[BCH_TIME_STAT_NR];
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)

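The per-stat struct fields (btree_gc_time, btree_sort_time, ...) are replaced by a single times[] array indexed by enum bch_time_stats, with the list of stats kept in an x-macro so it only has to be written once. The same list can then be expanded again wherever a parallel table is needed, for example a table of names for sysfs output. A sketch of that pattern, with the list abridged and the bch2_time_stat_names table purely illustrative:

    #define BCH_TIME_STATS()            \
        x(btree_node_mem_alloc)         \
        x(btree_gc)                     \
        x(journal_write)

    /* first expansion: the indices used for struct bch_fs::times[] */
    enum bch_time_stats {
    #define x(name) BCH_TIME_##name,
        BCH_TIME_STATS()
    #undef x
        BCH_TIME_STAT_NR
    };

    /* second expansion: human-readable names, kept in sync automatically */
    static const char * const bch2_time_stat_names[] = {
    #define x(name) #name,
        BCH_TIME_STATS()
    #undef x
    };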
View File

@ -3,6 +3,72 @@
/*
* bcachefs on disk data structures
*
* OVERVIEW:
*
* There are three main types of on disk data structures in bcachefs (this is
* reduced from 5 in bcache)
*
* - superblock
* - journal
* - btree
*
* The btree is the primary structure; most metadata exists as keys in the
* various btrees. There are only a small number of btrees, they're not
* sharded - we have one btree for extents, another for inodes, et cetera.
*
* SUPERBLOCK:
*
* The superblock contains the location of the journal, the list of devices in
* the filesystem, and in general any metadata we need in order to decide
* whether we can start a filesystem or prior to reading the journal/btree
* roots.
*
* The superblock is extensible, and most of the contents of the superblock are
* in variable length, type tagged fields; see struct bch_sb_field.
*
* Backup superblocks do not reside in a fixed location; also, superblocks do
* not have a fixed size. To locate backup superblocks we have struct
* bch_sb_layout; we store a copy of this inside every superblock, and also
* before the first superblock.
*
* JOURNAL:
*
* The journal primarily records btree updates in the order they occurred;
* journal replay consists of just iterating over all the keys in the open
* journal entries and re-inserting them into the btrees.
*
* The journal also contains entry types for the btree roots, and blacklisted
* journal sequence numbers (see journal_seq_blacklist.c).
*
* BTREE:
*
* bcachefs btrees are copy on write b+ trees, where nodes are big (typically
* 128k-256k) and log structured. We use struct btree_node for writing the first
* entry in a given node (offset 0), and struct btree_node_entry for all
* subsequent writes.
*
* After the header, btree node entries contain a list of keys in sorted order.
* Values are stored inline with the keys; since values are variable length (and
* keys effectively are variable length too, due to packing) we can't do random
* access without building up additional in memory tables in the btree node read
* path.
*
* BTREE KEYS (struct bkey):
*
* The various btrees share a common format for the key - so as to avoid
* switching in fastpath lookup/comparison code - but define their own
* structures for the key values.
*
* The size of a key/value pair is stored as a u8 in units of u64s, so the max
* size is just under 2k. The common part also contains a type tag for the
* value, and a format field indicating whether the key is packed or not (and
* also meant to allow adding new key fields in the future, if desired).
*
* bkeys, when stored within a btree node, may also be packed. In that case, the
* bkey_format in that node is used to unpack it. Packed bkeys mean that we can
* be generous with field sizes in the common part of the key format (64 bit
* inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
#include <asm/types.h>
@ -44,12 +110,19 @@ struct bkey_format {
/* Btree keys - all units are in sectors */
struct bpos {
/* Word order matches machine byte order */
#if defined(__LITTLE_ENDIAN)
/*
* Word order matches machine byte order - btree code treats a bpos as a
* single large integer, for search/comparison purposes
*
* Note that wherever a bpos is embedded in another on disk data
* structure, it has to be byte swabbed when reading in metadata that
* wasn't written in native endian order:
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u32 snapshot;
__u64 offset;
__u64 inode;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u64 inode;
__u64 offset; /* Points to end of extent - sectors */
__u32 snapshot;
@ -83,10 +156,10 @@ struct bch_val {
};
struct bversion {
#if defined(__LITTLE_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u64 lo;
__u32 hi;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u32 hi;
__u64 lo;
#endif
@ -110,13 +183,13 @@ struct bkey {
/* Type of the value */
__u8 type;
#if defined(__LITTLE_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
struct bversion version;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif defined(__BIG_ENDIAN)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
struct bversion version;
@ -275,10 +348,10 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE);
*
* If an extent is not checksummed or compressed, when the extent is trimmed we
* don't have to remember the extent we originally allocated and wrote: we can
* merely adjust ptr->offset to point to the start of the start of the data that
* is currently live. The size field in struct bkey records the current (live)
* size of the extent, and is also used to mean "size of region on disk that we
* point to" in this case.
* merely adjust ptr->offset to point to the start of the data that is currently
* live. The size field in struct bkey records the current (live) size of the
* extent, and is also used to mean "size of region on disk that we point to" in
* this case.
*
* Thus an extent that is not checksummed or compressed will consist only of a
* list of bch_extent_ptrs, with none of the fields in
@ -446,11 +519,11 @@ struct bch_extent_crc128 {
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 compression_type:4,
csum_type:4,
nonce:14,
nonce:13,
offset:13,
_uncompressed_size:13,
_compressed_size:13,
type:3;
type:4;
#endif
struct bch_csum csum;
} __attribute__((packed, aligned(8)));
@ -496,7 +569,7 @@ struct bch_extent_reservation {
};
union bch_extent_entry {
#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
unsigned long type;
#elif __BITS_PER_LONG == 32
struct {
@ -551,10 +624,11 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
sizeof(struct bch_extent_ptr)) / sizeof(u64))
/* Maximum possible size of an entire extent value: */
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
(BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
/* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@ -1378,33 +1452,4 @@ struct btree_node_entry {
};
} __attribute__((packed, aligned(8)));
/* Obsolete: */
struct prio_set {
struct bch_csum csum;
__le64 magic;
__le32 nonce[3];
__le16 version;
__le16 flags;
__u8 encrypted_start[0];
__le64 next_bucket;
struct bucket_disk {
__le16 prio[2];
__u8 gen;
} __attribute__((packed)) data[];
} __attribute__((packed, aligned(8)));
LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL)
static inline __u64 __pset_magic(struct bch_sb *sb)
{
return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
}
#endif /* _BCACHEFS_FORMAT_H */
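The endianness tests above switch from #if defined(__LITTLE_ENDIAN) to comparing the compiler-provided __BYTE_ORDER__ macro. The distinction matters because this header is shared with userspace: inside the kernel only one of __LITTLE_ENDIAN/__BIG_ENDIAN is defined (by linux/byteorder/*), but glibc's <endian.h> defines both as numeric constants on every host, so the defined() test silently picks the little-endian layout even on big endian machines. __BYTE_ORDER__ and __ORDER_*_ENDIAN__ are predefined by gcc and clang to reflect the actual target. A small standalone check, for illustration only:

    #include <endian.h>   /* glibc: defines __LITTLE_ENDIAN *and* __BIG_ENDIAN */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t x = 1;
        int runtime_little = *(uint8_t *) &x == 1;

    #if defined(__LITTLE_ENDIAN)
        printf("defined(__LITTLE_ENDIAN) is true (it is on every glibc host)\n");
    #endif
    #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        printf("__BYTE_ORDER__ says little endian\n");
    #else
        printf("__BYTE_ORDER__ says big endian\n");
    #endif
        printf("runtime check says %s endian\n", runtime_little ? "little" : "big");
        return 0;
    }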

View File

@ -5,6 +5,9 @@
#include <asm/ioctl.h>
#include "bcachefs_format.h"
/*
* Flags common to multiple ioctls:
*/
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
@ -14,12 +17,23 @@
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
/*
 * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
* filesystem:
*/
#define BCH_BY_INDEX (1 << 4)
/*
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
* wide superblock:
*/
#define BCH_READ_DEV (1 << 5)
/* global control dev: */
/* These are currently broken, and probably unnecessary: */
#if 0
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
@ -35,12 +49,18 @@ struct bch_ioctl_incremental {
__u64 pad;
__u64 dev;
};
#endif
/* filesystem ioctls: */
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
/* These only make sense when we also have incremental assembly */
#if 0
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
#define BCH_IOCTL_STOP _IO(0xbc, 3)
#endif
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
@ -52,14 +72,70 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
* Returns user visible UUID, not internal UUID (which may not ever be changed);
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
* this UUID.
*/
struct bch_ioctl_query_uuid {
uuid_le uuid;
};
#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
#endif
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
* The specified device must not be open or in use. On success, the new device
* will be an online member of the filesystem just like any other member.
*
* The device must first be prepared by userspace by formatting with a bcachefs
* superblock, which is only used for passing in superblock options/parameters
* for that device (in struct bch_member). The new device's superblock should
* not claim to be a member of any existing filesystem - UUIDs on it will be
* ignored.
*/
/*
* BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
*
* Any data present on @dev will be permanently deleted, and @dev will be
* removed from its slot in the filesystem's list of member devices. The device
 * may be either offline or online.
*
 * Will fail if removing @dev would leave us with insufficient read write devices
 * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
* set.
*/
/*
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
* but is not open (e.g. because we started in degraded mode), bring it online
*
* all existing data on @dev will be available once the device is online,
* exactly as if @dev was present when the filesystem was first mounted
*/
/*
* BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
* block device, without removing it from the filesystem (so it can be brought
* back online later)
*
* Data present on @dev will be unavailable while @dev is offline (unless
* replicated), but will still be intact and untouched if @dev is brought back
* online
*
* Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
* leave us with insufficient read write devices or degraded/unavailable data,
 * unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk {
__u32 flags;
@ -67,6 +143,16 @@ struct bch_ioctl_disk {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
* @new_state - one of the bch_member_state states (rw, ro, failed,
* spare)
*
* Will refuse to change member state if we would then have insufficient devices
* to write to, or if it would result in degraded data (when @new_state is
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk_set_state {
__u32 flags;
__u8 new_state;
@ -81,6 +167,15 @@ enum bch_data_ops {
BCH_DATA_OP_NR = 3,
};
/*
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
* scrub, rereplicate, migrate).
*
* This ioctl kicks off a job in the background, and returns a file descriptor.
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
* indicating current progress, and closing the file descriptor will stop the
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
__u32 op;
__u32 flags;
@ -93,9 +188,18 @@ struct bch_ioctl_data {
__u32 dev;
__u32 pad;
} migrate;
struct {
__u64 pad[8];
};
};
} __attribute__((packed, aligned(8)));
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
/* XXX: add an event for reporting errors */
BCH_DATA_EVENT_NR = 1,
};
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
@ -106,6 +210,15 @@ struct bch_ioctl_data_progress {
__u64 sectors_total;
} __attribute__((packed, aligned(8)));
struct bch_ioctl_data_event {
__u8 type;
__u8 pad[7];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
} __attribute__((packed, aligned(8)));
struct bch_ioctl_dev_usage {
__u8 state;
__u8 alive;
@ -127,6 +240,19 @@ struct bch_ioctl_fs_usage {
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
};
/*
* BCH_IOCTL_USAGE: query filesystem disk space usage
*
* Returns disk space usage broken out by data type, number of replicas, and
* by component device
*
* @nr_devices - number of devices userspace allocated space for in @devs
*
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
* will indicate if a device was present in that slot
*
* Returns -ERANGE if @nr_devices was too small
*/
struct bch_ioctl_usage {
__u16 nr_devices;
__u16 pad[3];
@ -135,6 +261,20 @@ struct bch_ioctl_usage {
struct bch_ioctl_dev_usage devs[0];
};
/*
* BCH_IOCTL_READ_SUPER: read filesystem superblock
*
* Equivalent to reading the superblock directly from the block device, except
* avoids racing with the kernel writing the superblock or having to figure out
* which block device to read
*
* @sb - buffer to read into
* @size - size of userspace allocated buffer
* @dev - device to read superblock for, if BCH_READ_DEV flag is
* specified
*
* Returns -ERANGE if buffer provided is too small
*/
struct bch_ioctl_read_super {
__u32 flags;
__u32 pad;
@ -143,10 +283,22 @@ struct bch_ioctl_read_super {
__u64 sb;
};
/*
* BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
 * determine if disk is an (online) member - if so, returns device's index
*
* Returns -ENOENT if not found
*/
struct bch_ioctl_disk_get_idx {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
*
* @dev - member to resize
* @nbuckets - new number of buckets
*/
struct bch_ioctl_disk_resize {
__u32 flags;
__u32 pad;

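The comment blocks above describe BCH_IOCTL_DATA as starting a background job and handing back a file descriptor that streams struct bch_ioctl_data_event. A rough userspace sketch of consuming that stream; the header path, the fs_fd argument (a descriptor on which bcachefs filesystem ioctls can be issued) and the BCH_DATA_OP_REREPLICATE spelling are assumptions, and a real caller would also fill in the key range the job should cover:

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "bcachefs_ioctl.h"   /* assumed path to the header shown above */

    static int watch_data_job(int fs_fd)
    {
        struct bch_ioctl_data arg;
        struct bch_ioctl_data_event e;
        int job_fd;

        memset(&arg, 0, sizeof(arg));
        arg.op = BCH_DATA_OP_REREPLICATE;   /* assumed name, one of enum bch_data_ops */
        /* a real caller also sets the range of keys the job should walk */

        job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);   /* returns an O_CLOEXEC fd */
        if (job_fd < 0)
            return -1;

        while (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
               e.type == BCH_DATA_EVENT_PROGRESS)
            printf("%llu/%llu sectors done\n",
                   (unsigned long long) e.p.sectors_done,
                   (unsigned long long) e.p.sectors_total);

        close(job_fd);   /* per the comment above, closing the fd stops the job */
        return 0;
    }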
View File

@ -13,8 +13,6 @@
void bch2_to_binary(char *, const u64 *, unsigned);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
@ -590,25 +588,31 @@ BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
/* byte order helpers */
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
static inline unsigned high_word_offset(const struct bkey_format *f)
{
return f->key_u64s - 1;
}
#define high_bit_offset 0
#define nth_word(p, n) ((p) - (n))
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
static inline unsigned high_word_offset(const struct bkey_format *f)
{
return 0;
}
#define high_bit_offset KEY_PACKED_BITS_START
#define nth_word(p, n) ((p) + (n))
#else
#error edit for your odd byteorder.
#endif
#ifdef __LITTLE_ENDIAN
#define high_bit_offset 0
#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
#define nth_word(p, n) ((p) - (n))
#else
#define high_bit_offset KEY_PACKED_BITS_START
#define __high_word(u64s, k) ((k)->_data)
#define nth_word(p, n) ((p) + (n))
#endif
#define high_word(format, k) __high_word((format)->key_u64s, k)
#define high_word(f, k) ((k)->_data + high_word_offset(f))
#define next_word(p) nth_word(p, 1)
#define prev_word(p) nth_word(p, -1)
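high_word()/nth_word() hide which end of a packed key holds the most significant word, so code that walks key bits (packing, comparison, the bkey float code in bset.c) can be written once for both byte orders. A sketch that walks a packed key from the most to the least significant word using only the helpers defined above; the dump function itself is illustrative:

    static void dump_packed_key_words(const struct bkey_format *f,
                                      const struct bkey_packed *k)
    {
        const u64 *p = high_word(f, k);
        unsigned i;

        /* most significant word first, regardless of host endianness */
        for (i = 0; i < f->key_u64s; i++, p = next_word(p))
            pr_debug("word %u: %016llx\n", i, (unsigned long long) *p);
    }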

View File

@ -6,6 +6,7 @@
*/
#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "util.h"
@ -438,6 +439,10 @@ void bch2_btree_keys_free(struct btree *b)
b->aux_data = NULL;
}
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
{
b->page_order = page_order;
@ -672,7 +677,7 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
* (and then the bits we want are at the high end, so we shift them
* back down):
*/
#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
v >>= f->exponent & 7;
#else
v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
@ -761,7 +766,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* Then we calculate the actual shift value, from the start of the key
* (k->_data), to get the key bits starting at exponent:
*/
#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
EBUG_ON(shift + bits > b->format.key_u64s * 64);
@ -964,10 +969,14 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
void bch2_bset_init_next(struct btree *b, struct bset *i)
void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
memset(i, 0, sizeof(*i));

View File

@ -157,9 +157,6 @@ static inline bool btree_keys_expensive_checks(const struct btree *b)
#endif
}
struct btree_node_iter;
struct btree_node_iter_set;
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
@ -342,7 +339,8 @@ int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
void bch2_btree_keys_init(struct btree *, bool *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
struct bkey_packed *);
@ -420,14 +418,6 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
/* Btree key iteration */
struct btree_node_iter {
u8 is_extents;
struct btree_node_iter_set {
u16 k, end;
} data[MAX_BSETS];
};
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
bool is_extents)
{

View File

@ -554,7 +554,8 @@ out:
b->uncompacted_whiteout_u64s = 0;
bch2_btree_keys_init(b, &c->expensive_debug_checks);
bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
return b;
err:

View File

@ -27,6 +27,7 @@
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
struct range_checks {
@ -264,10 +265,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
gc_pos_set(c, gc_pos_btree_node(b));
if (max_stale > 32)
if (max_stale > 64)
bch2_btree_node_rewrite(c, &iter,
b->data->keys.seq,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
else if (!btree_gc_rewrite_disabled(c) &&
(btree_gc_always_rewrite(c) || max_stale > 16))
@ -557,7 +559,7 @@ void bch2_gc(struct bch_fs *c)
out:
up_write(&c->gc_lock);
trace_gc_end(c);
bch2_time_stats_update(&c->btree_gc_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
* Wake up allocator in case it was waiting for buckets
@ -813,6 +815,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_iter iter;
struct btree *b;
bool kthread = (current->flags & PF_KTHREAD) != 0;
unsigned i;
/* Sliding window of adjacent btree nodes */
@ -859,7 +862,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
lock_seq[0] = merge[0]->lock.state.seq;
if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
if (kthread && kthread_should_stop()) {
bch2_btree_iter_unlock(&iter);
return -ESHUTDOWN;
}
@ -958,13 +961,15 @@ static int bch2_gc_thread(void *arg)
void bch2_gc_thread_stop(struct bch_fs *c)
{
set_bit(BCH_FS_GC_STOPPING, &c->flags);
if (c->gc_thread)
kthread_stop(c->gc_thread);
struct task_struct *p;
p = c->gc_thread;
c->gc_thread = NULL;
clear_bit(BCH_FS_GC_STOPPING, &c->flags);
if (p) {
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_gc_thread_start(struct bch_fs *c)
@ -973,12 +978,13 @@ int bch2_gc_thread_start(struct bch_fs *c)
BUG_ON(c->gc_thread);
p = kthread_create(bch2_gc_thread, c, "bcache_gc");
p = kthread_create(bch2_gc_thread, c, "bch_gc");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
c->gc_thread = p;
wake_up_process(c->gc_thread);
wake_up_process(p);
return 0;
}
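The gc thread is now stopped via kthread_should_stop()/kthread_stop() instead of the BCH_FS_GC_STOPPING flag, and the starter takes a task reference with get_task_struct() so the task_struct stays valid across kthread_stop() even if the thread exits on its own first. The thread side of that contract looks roughly like this (a generic sketch, not the actual bch2_gc_thread body):

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int my_gc_thread(void *arg)
    {
        while (!kthread_should_stop()) {
            /* ... one pass of work ... */

            set_current_state(TASK_INTERRUPTIBLE);
            if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                break;
            }
            schedule();
        }
        return 0;
    }

Checking the flag again after setting TASK_INTERRUPTIBLE closes the race where kthread_stop() is called between finishing a pass and going to sleep: kthread_stop() sets the flag and wakes the task, so either the check sees it or schedule() returns immediately.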

View File

@ -627,7 +627,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
if (sorting_entire_node)
bch2_time_stats_update(&c->btree_sort_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx; t < b->set + end_idx; t++)
@ -801,7 +802,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
&dst->format,
true);
bch2_time_stats_update(&c->btree_sort_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
set_btree_bset_end(dst, dst->set);
@ -877,7 +878,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bne = want_new_bset(c, b);
if (bne)
bch2_bset_init_next(b, &bne->keys);
bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
@ -1382,7 +1383,7 @@ start:
}
}
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@ -1742,6 +1743,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON((b->will_make_reachable != 0) != !b->written);
BUG_ON(b->written >= c->opts.btree_node_size);
BUG_ON(b->written & (c->opts.block_size - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
@ -1972,7 +1974,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
bch2_bset_init_next(b, &bne->keys);
bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);

View File

@ -133,7 +133,7 @@ do { \
\
six_unlock_read(&(_b)->lock); \
btree_node_wait_on_io(_b); \
six_lock_read(&(_b)->lock); \
btree_node_lock_type(c, b, SIX_LOCK_read); \
} \
} while (0)

View File

@ -42,25 +42,17 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
six_unlock_write(&b->lock);
}
void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
struct bch_fs *c = iter->c;
struct btree_iter *linked;
unsigned readers = 0;
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
if (six_trylock_write(&b->lock))
return;
for_each_linked_btree_iter(iter, linked)
if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
readers++;
if (likely(!readers)) {
six_lock_write(&b->lock);
} else {
/*
* Must drop our read locks before calling six_lock_write() -
* six_unlock() won't do wakeups until the reader count
@ -69,11 +61,10 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
*/
atomic64_sub(__SIX_VAL(read_lock, readers),
&b->lock.state.counter);
six_lock_write(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_write);
atomic64_add(__SIX_VAL(read_lock, readers),
&b->lock.state.counter);
}
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
@ -135,6 +126,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
struct btree_iter *iter,
enum six_lock_type type)
{
struct bch_fs *c = iter->c;
struct btree_iter *linked;
/* Can't have children locked before ancestors: */
@ -206,7 +198,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
}
}
six_lock_type(&b->lock, type);
__btree_node_lock_type(c, b, type);
return true;
}

View File

@ -4,72 +4,6 @@
#include <linux/dynamic_fault.h>
#include "btree_types.h"
#include "bset.h"
#define BTREE_ITER_SLOTS (1 << 0)
#define BTREE_ITER_INTENT (1 << 1)
#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 3)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
#define BTREE_ITER_ERROR (1 << 5)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
BTREE_ITER_NEED_PEEK = 1,
BTREE_ITER_NEED_RELOCK = 2,
BTREE_ITER_NEED_TRAVERSE = 3,
BTREE_ITER_END = 4,
};
/*
* @pos - iterator's current position
* @level - current btree depth
* @locks_want - btree level below which we start taking intent locks
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
struct bch_fs *c;
struct bpos pos;
u8 flags;
unsigned uptodate:4;
enum btree_id btree_id:4;
unsigned level:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
struct btree_iter_level {
struct btree *b;
struct btree_node_iter iter;
} l[BTREE_MAX_DEPTH];
u32 lock_seq[BTREE_MAX_DEPTH];
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
/*
* Circular linked list of linked iterators: linked iterators share
* locks (e.g. two linked iterators may have the same node intent
* locked, or read and write locked, at the same time), and insertions
* through one iterator won't invalidate the other linked iterators.
*/
/* Must come last: */
struct btree_iter *next;
};
static inline void btree_iter_set_dirty(struct btree_iter *iter,
enum btree_iter_uptodate u)

View File

@ -98,6 +98,39 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
mark_btree_node_unlocked(iter, level);
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
{
switch (type) {
case SIX_LOCK_read:
return BCH_TIME_btree_lock_contended_read;
case SIX_LOCK_intent:
return BCH_TIME_btree_lock_contended_intent;
case SIX_LOCK_write:
return BCH_TIME_btree_lock_contended_write;
default:
BUG();
}
}
/*
* wrapper around six locks that just traces lock contended time
*/
static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
u64 start_time = local_clock();
six_lock_type(&b->lock, type);
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
}
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
if (!six_trylock_type(&b->lock, type))
__btree_node_lock_type(c, b, type);
}
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type);
@ -125,7 +158,17 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
if (!six_trylock_write(&b->lock))
__bch2_btree_node_lock_write(b, iter);
}
#endif /* _BCACHEFS_BTREE_LOCKING_H */

View File

@ -176,6 +176,79 @@ struct btree_cache {
struct closure_waitlist alloc_wait;
};
struct btree_node_iter {
u8 is_extents;
struct btree_node_iter_set {
u16 k, end;
} data[MAX_BSETS];
};
#define BTREE_ITER_SLOTS (1 << 0)
#define BTREE_ITER_INTENT (1 << 1)
#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 3)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
#define BTREE_ITER_ERROR (1 << 5)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
BTREE_ITER_NEED_PEEK = 1,
BTREE_ITER_NEED_RELOCK = 2,
BTREE_ITER_NEED_TRAVERSE = 3,
BTREE_ITER_END = 4,
};
/*
* @pos - iterator's current position
* @level - current btree depth
* @locks_want - btree level below which we start taking intent locks
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
struct bch_fs *c;
struct bpos pos;
u8 flags;
unsigned uptodate:4;
enum btree_id btree_id:4;
unsigned level:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
struct btree_iter_level {
struct btree *b;
struct btree_node_iter iter;
} l[BTREE_MAX_DEPTH];
u32 lock_seq[BTREE_MAX_DEPTH];
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
/*
* Circular linked list of linked iterators: linked iterators share
* locks (e.g. two linked iterators may have the same node intent
* locked, or read and write locked, at the same time), and insertions
* through one iterator won't invalidate the other linked iterators.
*/
/* Must come last: */
struct btree_iter *next;
};
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \

View File

@ -237,7 +237,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
clear_btree_node_noevict(b);
six_lock_write(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_write);
bch2_btree_node_hash_remove(&c->btree_cache, b);
@ -622,7 +622,7 @@ static void btree_update_nodes_reachable(struct closure *cl)
* b->will_make_reachable prevented it from being written, so
* write it now if it needs to be written:
*/
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
six_unlock_read(&b->lock);
mutex_lock(&c->btree_interior_update_lock);
@ -647,8 +647,10 @@ static void btree_update_wait_on_journal(struct closure *cl)
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
if (!ret)
if (!ret) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:
@ -679,7 +681,7 @@ retry:
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto retry;
}
@ -720,7 +722,7 @@ retry:
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto retry;
}
@ -1456,7 +1458,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_iter_node_replace(iter, n2);
bch2_btree_iter_node_replace(iter, n1);
bch2_time_stats_update(&c->btree_split_time, start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
}
static void
@ -1795,8 +1797,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_node_write(c, n, SIX_LOCK_intent);
if (parent) {
bch2_btree_insert_node(as, parent, iter,
&keylist_single(&n->key));
bch2_keylist_add(&as->parent_keys, &n->key);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
} else {
bch2_btree_set_root(as, n, iter);
}

View File

@ -226,11 +226,30 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
return (void *) i > write_block(b);
}
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
struct bset *i)
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
struct btree *b,
void *end)
{
return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
return total - used;
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
if (bset_written(b, btree_bset_last(b)))
return 0;
return remaining;
}
static inline unsigned btree_write_set_buffer(struct btree *b)
@ -246,20 +265,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) &&
remaining_space > block_bytes(c)) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
remaining_space > btree_write_set_buffer(b)))
return (void *) b->data + offset;
if (unlikely(bset_written(b, i))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
return bne;
} else {
if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
return bne;
}
return NULL;
}
@ -285,23 +303,6 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
}
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->opts.btree_node_size << 6;
EBUG_ON(used > total);
if (bset_written(b, i))
return 0;
return total - used;
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)

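In __bch_btree_u64s_remaining() the node's capacity is computed as c->opts.btree_node_size << 6. Assuming btree_node_size is expressed in 512-byte sectors, that shift is just the unit conversion from sectors to u64s: sectors * 512 bytes / 8 bytes per u64 = sectors * 64. A worked sketch:

    /* sketch: unit conversion behind "btree_node_size << 6", assuming the option
     * is in 512-byte sectors */
    static size_t btree_node_u64s(unsigned btree_node_size_sectors)
    {
        /* e.g. 512 sectors -> 512 * 512 = 262144 bytes -> 262144 / 8 = 32768 u64s */
        return (size_t) btree_node_size_sectors << 6;   /* * 512 / 8 == * 64 */
    }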
View File

@ -108,7 +108,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
six_lock_read(&b->lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w &&
w->journal.pin_list == journal_seq_pin(j, seq)));

View File

@ -555,9 +555,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
return;
}
v = READ_ONCE(g->_mark.counter);
v = atomic64_read(&g->_mark.v);
do {
new.counter = old.counter = v;
new.v.counter = old.v.counter = v;
saturated = 0;
/*
@ -600,9 +600,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
g->_mark = new;
break;
}
} while ((v = cmpxchg(&g->_mark.counter,
old.counter,
new.counter)) != old.counter);
} while ((v = atomic64_cmpxchg(&g->_mark.v,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, old, new);
@ -957,7 +957,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(ca->buckets, sizeof(struct bucket_array) +
kvpfree(rcu_dereference_protected(ca->buckets, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);

View File

@ -16,15 +16,15 @@
#define bucket_cmpxchg(g, new, expr) \
({ \
u64 _v = READ_ONCE((g)->_mark.counter); \
u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
(new).counter = _old.counter = _v; \
(new).v.counter = _old.v.counter = _v; \
expr; \
} while ((_v = cmpxchg(&(g)->_mark.counter, \
_old.counter, \
(new).counter)) != _old.counter);\
} while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
_old.v.counter, \
(new).v.counter)) != _old.v.counter);\
_old; \
})

View File

@ -6,7 +6,7 @@
struct bucket_mark {
union {
struct {
u64 counter;
atomic64_t v;
};
struct {

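bucket_mark's backing word changes from a plain u64 (updated with READ_ONCE()/cmpxchg()) to an atomic64_t updated with atomic64_read()/atomic64_cmpxchg(). On 32-bit machines a bare 64-bit load or cmpxchg is not guaranteed to be atomic, while atomic64_t always is, falling back to a generic spinlock-based implementation where the hardware lacks 64-bit primitives. The same retry-loop shape, written against portable C11 <stdatomic.h> purely as a userspace analogue of the kernel API:

    #include <stdatomic.h>
    #include <stdint.h>

    struct mark {
        _Atomic uint64_t v;   /* several small counters packed into one 64-bit word */
    };

    /* atomically increment the low 16 bits, leaving the other fields untouched */
    static void mark_inc_low16(struct mark *m)
    {
        uint64_t old = atomic_load(&m->v);
        uint64_t new;

        do {
            new = (old & ~0xffffULL) | ((old + 1) & 0xffff);
        } while (!atomic_compare_exchange_weak(&m->v, &old, new));
        /* on failure, atomic_compare_exchange_weak reloads old for the next try */
    }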
View File

@ -54,6 +54,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ca;
}
#if 0
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
{
struct bch_ioctl_assemble arg;
@ -127,14 +128,17 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
return 0;
}
#endif
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
return bch2_ioctl_assemble(arg);
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
default:
return -ENOTTY;
}
@ -148,6 +152,7 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c,
sizeof(c->sb.user_uuid));
}
#if 0
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
{
if (arg.flags || arg.pad)
@ -161,6 +166,7 @@ static long bch2_ioctl_stop(struct bch_fs *c)
bch2_fs_stop(c);
return 0;
}
#endif
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
{
@ -294,18 +300,19 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
{
struct bch_data_ctx *ctx = file->private_data;
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_progress p = {
.data_type = ctx->stats.data_type,
.btree_id = ctx->stats.iter.btree_id,
.pos = ctx->stats.iter.pos,
.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
.p.btree_id = ctx->stats.iter.btree_id,
.p.pos = ctx->stats.iter.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
};
if (len != sizeof(p))
if (len < sizeof(e))
return -EINVAL;
return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
@ -419,7 +426,7 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (ca->dev_idx >= arg.nr_devices) {
percpu_ref_put(&ca->ref);
return -ENOSPC;
return -ERANGE;
}
if (percpu_ref_tryget(&ca->io_ref)) {
@ -539,10 +546,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return -EPERM;
switch (cmd) {
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
#endif
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:

View File

@ -421,7 +421,7 @@ static struct bch_csum bch2_checksum_merge(unsigned type,
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
unsigned b = min(b_len, PAGE_SIZE);
unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
a.lo = bch2_checksum_update(type, a.lo,
page_address(ZERO_PAGE(0)), b);

View File

@ -42,7 +42,8 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
}
struct io_clock_wait {
struct io_timer timer;
struct io_timer io_timer;
struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
@ -50,7 +51,16 @@ struct io_clock_wait {
static void io_clock_wait_fn(struct io_timer *timer)
{
struct io_clock_wait *wait = container_of(timer,
struct io_clock_wait, timer);
struct io_clock_wait, io_timer);
wait->expired = 1;
wake_up_process(wait->task);
}
static void io_clock_cpu_timeout(struct timer_list *timer)
{
struct io_clock_wait *wait = container_of(timer,
struct io_clock_wait, cpu_timer);
wait->expired = 1;
wake_up_process(wait->task);
@ -61,35 +71,38 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
struct io_clock_wait wait;
/* XXX: calculate sleep time rigorously */
wait.timer.expire = until;
wait.timer.fn = io_clock_wait_fn;
wait.io_timer.expire = until;
wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
bch2_io_timer_add(clock, &wait.timer);
bch2_io_timer_add(clock, &wait.io_timer);
schedule();
bch2_io_timer_del(clock, &wait.timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
/*
* _only_ to be used from a kthread
*/
void bch2_kthread_io_clock_wait(struct io_clock *clock,
unsigned long until)
unsigned long io_until,
unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
/* XXX: calculate sleep time rigorously */
wait.timer.expire = until;
wait.timer.fn = io_clock_wait_fn;
wait.io_timer.expire = io_until;
wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
bch2_io_timer_add(clock, &wait.timer);
bch2_io_timer_add(clock, &wait.io_timer);
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
if (kthread && kthread_should_stop())
break;
if (wait.expired)
@ -100,7 +113,9 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
}
__set_current_state(TASK_RUNNING);
bch2_io_timer_del(clock, &wait.timer);
del_singleshot_timer_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
static struct io_timer *get_expired_timer(struct io_clock *clock,

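bch2_kthread_io_clock_wait() now takes two deadlines: one in IO-clock time (advanced by bch2_increment_clock() as IO is issued) and one in wall-clock jiffies, waking on whichever comes first or when the kthread is asked to stop; MAX_SCHEDULE_TIMEOUT leaves the CPU-side timer unarmed. A call-site sketch; the io_clock `now` field and c->io_clock[WRITE] are assumptions about code not shown in this diff:

    /* wait until roughly 1024 more sectors of writes have been charged to the
     * IO clock, but give up after at most 10 seconds of wall-clock time */
    bch2_kthread_io_clock_wait(&c->io_clock[WRITE],
                               atomic_long_read(&c->io_clock[WRITE].now) + 1024,
                               jiffies + 10 * HZ);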
View File

@ -3,7 +3,8 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
void bch2_increment_clock(struct bch_fs *, unsigned, int);
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);

View File

@ -480,7 +480,7 @@ static const unsigned bch2_compression_opt_to_feature[] = {
#undef BCH_FEATURE_NONE
int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
int ret = 0;
@ -529,26 +529,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
mempool_exit(&c->compression_bounce[READ]);
}
static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
return kvpmalloc(size, gfp_mask);
}
void mempool_kvpfree(void *element, void *pool_data)
{
size_t size = (size_t)pool_data;
kvpfree(element, size);
}
static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return !mempool_initialized(pool)
? mempool_init(pool, min_nr, mempool_kvpmalloc,
mempool_kvpfree, (void *) size)
: 0;
}
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t max_extent = c->sb.encoded_extent_max << 9;
@ -611,6 +591,9 @@ have_compressed:
if (i->decompress_workspace)
decompress_workspace_needed = true;
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
ret = mempool_init_kvpmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace);

View File

@ -16,7 +16,7 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
@ -162,7 +162,8 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
}
}
old_g = c->disk_groups;
old_g = rcu_dereference_protected(c->disk_groups,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->disk_groups, cpu_g);
if (old_g)
kfree_rcu(old_g, rcu);
@ -193,6 +194,36 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
}
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g;
const struct bch_devs_mask *m;
bool ret;
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
m = t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
ret = m ? test_bit(dev, m->d) : false;
rcu_read_unlock();
return ret;
}
default:
BUG();
}
}
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
unsigned parent,
const char *name, unsigned namelen)

View File

@ -53,34 +53,8 @@ static inline struct target target_decode(unsigned target)
return (struct target) { .type = TARGET_NULL };
}
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
return ca->mi.group && ca->mi.group - 1 == t.group;
default:
BUG();
}
}
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);

View File

@ -144,7 +144,7 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.group &&
ca->mi.group - 1 == group)
@ -159,13 +159,11 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
{
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (dev_in_target(ca, target) &&
(!ptr->cached || !ptr_stale(ca, ptr)))
extent_for_each_ptr(e, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
(!ptr->cached ||
!ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
return ptr;
}
return NULL;
}
@ -732,7 +730,7 @@ err:
bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
"gen %i mark %08x",
err, buf, PTR_BUCKET_NR(ca, ptr),
mark.gen, (unsigned) mark.counter);
mark.gen, (unsigned) mark.v.counter);
}
void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
@ -2024,7 +2022,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
int n = bch2_extent_ptr_durability(c, ptr);
if (n && n <= extra &&
!dev_in_target(c->devs[ptr->dev], target)) {
!bch2_dev_in_target(c, ptr->dev, target)) {
ptr->cached = true;
extra -= n;
}

View File

@ -278,24 +278,38 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
.uncompressed_size = k->size,
.live_size = k->size,
};
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
case BCH_EXTENT_CRC32: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
.csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
*((__le32 *) &ret.csum.lo) = crc->crc32.csum;
memcpy(&ret.csum.lo, &crc->crc32.csum,
sizeof(crc->crc32.csum));
return ret;
}
case BCH_EXTENT_CRC64: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
.csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
*((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
return ret;
}
case BCH_EXTENT_CRC128: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
.csum = crc->crc128.csum,
};
return ret;
}
default:
BUG();
}
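The reworked switch above is part of this commit's big/little-endian fixes: assigning a narrow checksum into a 64-bit on-disk field by integer widening lands the meaningful bytes in a different half of the field depending on host byte order, so the unpack now places the bytes explicitly. The following standalone sketch (plain C with a made-up value, not the actual bcachefs types) shows the general difference between the two approaches:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t csum32 = 0x11223344;		/* hypothetical 32-bit checksum */
	uint64_t lo_assign = 0, lo_copy = 0;

	lo_assign = csum32;			/* widening assignment: value-preserving */
	memcpy(&lo_copy, &csum32, sizeof(csum32)); /* byte copy into the first 4 bytes */

	/*
	 * On a little-endian host both prints match; on a big-endian host they
	 * differ, because the widened value's bytes sit in the last four bytes
	 * of lo_assign instead of the first four.
	 */
	printf("assign %016llx  copy %016llx\n",
	       (unsigned long long) lo_assign, (unsigned long long) lo_copy);
	return 0;
}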

View File

@ -678,7 +678,7 @@ static void bch2_clear_page_bits(struct page *page)
if (!PagePrivate(page))
return;
s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
s.v = xchg(&page_state(page)->v, 0);
ClearPagePrivate(page);
if (s.dirty_sectors)
@ -1020,12 +1020,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
extent_for_each_ptr_crc(e, ptr, crc)
want_full_extent |= !!crc.csum_type |
!!crc.compression_type;
extent_for_each_crc(e, crc, i)
want_full_extent |= ((crc.csum_type != 0) |
(crc.compression_type != 0));
}
readpage_bio_extend(readpages_iter,
@ -1850,8 +1850,7 @@ err_wait_io:
dio->loop = true;
if (!dio->sync) {
continue_at_noreturn(&dio->cl,
bch2_dio_write_loop_async, NULL);
continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
return -EIOCBQUEUED;
}

View File

@ -610,7 +610,8 @@ static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
{
if (nr >= b->size) {
size_t new_size = max(max(PAGE_SIZE * 8,
size_t new_size = max_t(size_t, max_t(size_t,
PAGE_SIZE * 8,
b->size * 2),
nr + 1);
void *n;
@ -642,7 +643,7 @@ struct pathbuf {
static int path_down(struct pathbuf *p, u64 inum)
{
if (p->nr == p->size) {
size_t new_size = max(256UL, p->size * 2);
size_t new_size = max_t(size_t, 256UL, p->size * 2);
void *n = krealloc(p->entries,
new_size * sizeof(p->entries[0]),
GFP_KERNEL);
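The max() to max_t() changes above are part of the 32-bit fixes: the kernel's max() insists both operands have the same type, and on 32-bit builds an expression like PAGE_SIZE * 8 (unsigned long) and a size_t field need not agree. max_t() simply casts both operands to one named type first. A rough userspace approximation, with all names and numbers as stand-ins:

#include <stddef.h>
#include <stdio.h>

/* simplified stand-in for the kernel macro; double evaluation ignored here */
#define max_t(type, a, b)	((type) (a) > (type) (b) ? (type) (a) : (type) (b))

int main(void)
{
	unsigned long page_bits	= 4096UL * 8;	/* stand-in for PAGE_SIZE * 8 */
	size_t cur_size		= 100000;

	size_t new_size = max_t(size_t, max_t(size_t, page_bits, cur_size * 2),
				42 + 1);	/* 42 + 1 stands in for nr + 1 */
	printf("new size: %zu\n", new_size);	/* prints 200000 */
	return 0;
}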

View File

@ -21,10 +21,10 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "tier.h"
#include <linux/blkdev.h>
#include <linux/random.h>
@ -269,7 +269,7 @@ static void bch2_write_done(struct closure *cl)
percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->data_write_time, op->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
closure_return(cl);
}
@ -842,20 +842,24 @@ again:
} while (ret);
continue_at(cl, bch2_write_index, index_update_wq(op));
return;
err:
op->error = ret;
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
? bch2_write_index
: bch2_write_done, index_update_wq(op));
return;
flush_io:
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
if (op->error)
if (op->error) {
continue_at_nobarrier(cl, bch2_write_done, NULL);
return;
}
}
goto again;
@ -901,6 +905,7 @@ void bch2_write(struct closure *cl)
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
return;
}
bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
@ -974,7 +979,8 @@ static void promote_done(struct closure *cl)
container_of(cl, struct promote_op, cl);
struct bch_fs *c = op->write.op.c;
bch2_time_stats_update(&c->data_promote_time, op->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
promote_free(c, op);
@ -1048,7 +1054,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
(*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
bch2_bio_map(&(*rbio)->bio, NULL);
if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
goto err;
(*rbio)->bounce = true;
@ -1174,7 +1180,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
bio_endio(&rbio->bio);
}
@ -1486,7 +1493,7 @@ csum_err:
}
bch2_dev_io_error(ca,
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);

View File

@ -365,6 +365,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);

View File

@ -324,7 +324,7 @@ struct jset_entry_ops {
struct jset_entry *, int);
};
const struct jset_entry_ops bch2_jset_entry_ops[] = {
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr) \
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
.validate = journal_entry_validate_##f, \
@ -696,6 +696,7 @@ out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
err:
mutex_lock(&jlist->lock);
jlist->ret = ret;
@ -716,19 +717,6 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
static inline bool journal_has_keys(struct list_head *list)
{
struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
list_for_each_entry(i, list, list)
for_each_jset_key(k, _n, entry, &i->j)
return true;
return false;
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
@ -737,8 +725,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq, seq;
unsigned iter, keys = 0, entries = 0;
size_t nr;
unsigned iter;
size_t entries = 0;
u64 nr, keys = 0;
bool degraded = false;
int ret = 0;
@ -772,9 +761,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
list_for_each_entry(i, list, list) {
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
@ -797,15 +783,27 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
}
i = list_last_entry(list, struct journal_replay, list);
nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
fsck_err_on(c->sb.clean && (keys || nr > 1), c,
"filesystem marked clean but journal not empty (%llu keys in %llu entries)",
keys, nr);
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
@ -844,8 +842,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
@ -867,13 +863,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
entries++;
}
bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
fsck_err:
return ret;
@ -1361,6 +1354,7 @@ void bch2_journal_write(struct closure *cl)
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
return;
}
/*
@ -1417,6 +1411,7 @@ no_io:
ptr->offset += sectors;
continue_at(cl, journal_write_done, system_highpri_wq);
return;
err:
bch2_inconsistent_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);

View File

@ -247,7 +247,7 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
max(bl->nr_entries * 2, 8UL) * sizeof(*n),
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
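The hunk above keeps the existing growth strategy, reallocating only when the entry count is zero or a power of two, and just fixes the mixed-type max(). A standalone sketch of that growth pattern, with hypothetical names (need_grow, append_u64):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static bool need_grow(size_t nr)
{
	return !nr || !(nr & (nr - 1));		/* zero or a power of two */
}

static int append_u64(uint64_t **arr, size_t *nr, uint64_t v)
{
	if (need_grow(*nr)) {
		size_t new_nr = *nr * 2 > 8 ? *nr * 2 : 8;
		uint64_t *n = realloc(*arr, new_nr * sizeof(**arr));

		if (!n)
			return -1;		/* caller keeps the old array */
		*arr = n;
	}
	(*arr)[(*nr)++] = v;
	return 0;
}

int main(void)
{
	uint64_t *entries = NULL;
	size_t nr = 0;

	for (uint64_t seq = 0; seq < 100; seq++)
		if (append_u64(&entries, &nr, seq))
			return 1;
	/* reallocations happened only at counts 0, 1, 2, 4, ..., 64 */
	printf("%zu entries appended\n", nr);
	free(entries);
	return 0;
}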

View File

@ -55,9 +55,6 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
_k != (_keylist)->top; \
_k = bkey_next(_k))
#define keylist_single(k) \
((struct keylist) { .keys = k, .top = bkey_next(k) })
static inline u64 keylist_sectors(struct keylist *keys)
{
struct bkey_i *k;

View File

@ -306,8 +306,11 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_status &&
!io->rbio.hole)) {
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
closure_return_with_destructor(cl, move_free);
return;
}
bch2_migrate_read_done(&io->write, &io->rbio);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
@ -315,9 +318,6 @@ static void move_write(struct closure *cl)
continue_at(cl, move_write_done, NULL);
}
closure_return_with_destructor(cl, move_free);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
struct moving_io *io =
@ -411,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
bch2_bio_map(&io->write.op.wbio.bio, NULL);
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
goto err_free;
io->rbio.opts = io_opts;

View File

@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
#include "move_types.h"
struct bch_read_bio;
struct moving_context;
@ -48,16 +49,6 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
enum bkey_type, struct bkey_s_c_extent,
struct bch_io_opts *, struct data_opts *);
struct bch_move_stats {
enum bch_data_type data_type;
struct btree_iter iter;
atomic64_t keys_moved;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_raced;
};
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
struct write_point_specifier,
struct bpos, struct bpos,

libbcachefs/move_types.h (new file, 14 lines)
View File

@ -0,0 +1,14 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
struct bch_move_stats {
enum bch_data_type data_type;
struct btree_iter iter;
atomic64_t keys_moved;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_raced;
};
#endif /* _BCACHEFS_MOVE_TYPES_H */

View File

@ -241,7 +241,8 @@ static int bch2_copygc_thread(void *arg)
ca->mi.bucket_size;
if (available > reserve) {
next = last + available - reserve;
bch2_kthread_io_clock_wait(clock, next);
bch2_kthread_io_clock_wait(clock, next,
MAX_SCHEDULE_TIMEOUT);
continue;
}
@ -252,7 +253,8 @@ static int bch2_copygc_thread(void *arg)
fragmented = usage.sectors_fragmented;
if (fragmented < reserve) {
next = last + reserve - fragmented;
bch2_kthread_io_clock_wait(clock, next);
bch2_kthread_io_clock_wait(clock, next,
MAX_SCHEDULE_TIMEOUT);
continue;
}

libbcachefs/rebalance.c (new file, 341 lines)
View File

@ -0,0 +1,341 @@
#include "bcachefs.h"
#include "alloc.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"
#include "rebalance.h"
#include "super-io.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
static inline bool rebalance_ptr_pred(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
struct bch_io_opts *io_opts)
{
if (io_opts->background_target &&
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
!ptr->cached)
return true;
if (io_opts->background_compression &&
crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
return true;
return false;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bkey_s_c_extent e;
if (!bkey_extent_is_data(k.k))
return;
if (!io_opts->background_target &&
!io_opts->background_compression)
return;
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (atomic64_add_return(crc.compressed_size,
&ca->rebalance_work) ==
crc.compressed_size)
rebalance_wakeup(c);
}
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
sectors)
rebalance_wakeup(c);
}
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return DATA_SKIP;
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
goto found;
return DATA_SKIP;
found:
data_opts->target = io_opts->background_target;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
struct rebalance_work {
int dev_most_full_idx;
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static void rebalance_work_accumulate(struct rebalance_work *w,
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
{
unsigned percent_full;
u64 work = dev_work + unknown_dev;
if (work < dev_work || work < unknown_dev)
work = U64_MAX;
work = min(work, capacity);
percent_full = div_u64(work * 100, capacity);
if (percent_full >= w->dev_most_full_percent) {
w->dev_most_full_idx = idx;
w->dev_most_full_percent = percent_full;
w->dev_most_full_work = work;
w->dev_most_full_capacity = capacity;
}
if (w->total_work + dev_work >= w->total_work &&
w->total_work + dev_work >= dev_work)
w->total_work += dev_work;
}
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { .dev_most_full_idx = -1 };
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
unsigned i;
for_each_online_member(ca, c, i)
rebalance_work_accumulate(&ret,
atomic64_read(&ca->rebalance_work),
unknown_dev,
bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket),
i);
rebalance_work_accumulate(&ret,
unknown_dev, 0, c->capacity, -1);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
atomic64_set(&c->rebalance.work_unknown_dev, 0);
}
static unsigned long curr_cputime(void)
{
u64 utime, stime;
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
unsigned long io_start;
long throttle;
set_freezable();
io_start = atomic_long_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(r->enabled)) {
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
w = rebalance_work(c);
BUG_ON(!w.dev_most_full_capacity);
if (!w.total_work) {
r->state = REBALANCE_WAITING;
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
/*
* If there isn't much work to do, throttle cpu usage:
*/
throttle = prev_run_cputime * 100 /
max(1U, w.dev_most_full_percent) -
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
r->state = REBALANCE_THROTTLED;
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
r->throttled_until_cputime = start + throttle;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
}
/* minimum 1 mb/sec: */
r->pd.rate.rate =
max_t(u64, 1 << 11,
r->pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
memset(&r->move_stats, 0, sizeof(r->move_stats));
rebalance_work_reset(c);
bch2_move_data(c,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
POS_MIN, POS_MAX,
rebalance_pred, NULL,
&r->move_stats);
}
return 0;
}
ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
{
char *out = buf, *end = out + PAGE_SIZE;
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
char h1[21], h2[21];
bch2_hprint(h1, w.dev_most_full_work << 9);
bch2_hprint(h2, w.dev_most_full_capacity << 9);
out += scnprintf(out, end - out,
"fullest_dev (%i):\t%s/%s\n",
w.dev_most_full_idx, h1, h2);
bch2_hprint(h1, w.total_work << 9);
bch2_hprint(h2, c->capacity << 9);
out += scnprintf(out, end - out,
"total work:\t\t%s/%s\n",
h1, h2);
out += scnprintf(out, end - out,
"rate:\t\t\t%u\n",
r->pd.rate.rate);
switch (r->state) {
case REBALANCE_WAITING:
out += scnprintf(out, end - out, "waiting\n");
break;
case REBALANCE_THROTTLED:
bch2_hprint(h1,
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
out += scnprintf(out, end - out,
"throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
break;
case REBALANCE_RUNNING:
out += scnprintf(out, end - out, "running\n");
out += scnprintf(out, end - out, "pos %llu:%llu\n",
r->move_stats.iter.pos.inode,
r->move_stats.iter.pos.offset);
break;
}
return out - buf;
}
void bch2_rebalance_stop(struct bch_fs *c)
{
struct task_struct *p;
c->rebalance.pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&c->rebalance.pd.rate);
p = rcu_dereference_protected(c->rebalance.thread, 1);
c->rebalance.thread = NULL;
if (p) {
/* for synchronizing with rebalance_wakeup() */

synchronize_rcu();
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
if (c->opts.nochanges)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
wake_up_process(p);
return 0;
}
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
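The throttling logic in bch2_rebalance_thread() above roughly matches the thread's CPU duty cycle to the fullest device's work percentage while work is light (under 20%). A hedged worked example of the same formula, with made-up jiffies values:

#include <stdio.h>

int main(void)
{
	long prev_run_time	= 1000;	/* wall-clock jiffies spent last iteration */
	long prev_run_cputime	= 300;	/* CPU jiffies consumed last iteration */
	unsigned most_full_pct	= 10;	/* fullest device is 10% "full" of work */

	/*
	 * Same shape as the thread's computation: a positive result means the
	 * thread ran hotter than the ~10% duty cycle it is aiming for, so it
	 * sleeps for that long (or until enough IO has happened).
	 */
	long throttle = prev_run_cputime * 100 / most_full_pct - prev_run_time;

	printf("throttle = %ld jiffies\n", throttle);	/* 300*100/10 - 1000 = 2000 */
	return 0;
}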

View File

@ -1,12 +1,14 @@
#ifndef _BCACHEFS_TIER_H
#define _BCACHEFS_TIER_H
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H
#include "rebalance_types.h"
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
rcu_read_lock();
p = rcu_dereference(c->rebalance_thread);
p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
@ -16,8 +18,10 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);
#endif /* _BCACHEFS_TIER_H */
#endif /* _BCACHEFS_REBALANCE_H */

View File

@ -0,0 +1,26 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
#include "move_types.h"
enum rebalance_state {
REBALANCE_WAITING,
REBALANCE_THROTTLED,
REBALANCE_RUNNING,
};
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
struct bch_pd_controller pd;
atomic64_t work_unknown_dev;
enum rebalance_state state;
unsigned long throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */

View File

@ -146,6 +146,8 @@ struct six_lock_waiter {
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
static inline int six_can_spin_on_owner(struct six_lock *lock)
{
struct task_struct *owner;
@ -257,6 +259,15 @@ fail:
return false;
}
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
{
return false;
}
#endif
noinline
static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
{

View File

@ -624,7 +624,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_iter.bi_size =
roundup(vstruct_bytes(sb),
roundup((size_t) vstruct_bytes(sb),
bdev_logical_block_size(ca->disk_sb.bdev));
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;

View File

@ -73,11 +73,6 @@ static inline __u64 jset_magic(struct bch_fs *c)
return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
}
static inline __u64 pset_magic(struct bch_fs *c)
{
return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC);
}
static inline __u64 bset_magic(struct bch_fs *c)
{
return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
@ -136,4 +131,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
struct bch_sb_field *);
#endif /* _BCACHEFS_SUPER_IO_H */

View File

@ -33,11 +33,11 @@
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "tier.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
@ -398,10 +398,10 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
#define BCH_TIME_STAT(name) \
bch2_time_stats_exit(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
unsigned i;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
@ -565,10 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
#define BCH_TIME_STAT(name) \
bch2_time_stats_init(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
bch2_fs_allocator_init(c);
bch2_fs_rebalance_init(c);
@ -592,14 +590,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
seqcount_init(&c->gc_pos_lock);
c->copy_gc_enabled = 1;
c->rebalance_enabled = 1;
c->rebalance_percent = 10;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
c->journal.blocked_time = &c->journal_blocked_time;
c->journal.flush_seq_time = &c->journal_flush_seq_time;
c->journal.write_time = &c->times[BCH_TIME_journal_write];
c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
@ -647,7 +644,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||

View File

@ -24,9 +24,9 @@
#include "keylist.h"
#include "move.h"
#include "opts.h"
#include "rebalance.h"
#include "replicas.h"
#include "super-io.h"
#include "tier.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@ -183,8 +183,8 @@ rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
rw_attribute(rebalance_enabled);
rw_attribute(rebalance_percent);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_work);
rw_attribute(promote_whole_extents);
rw_attribute(pd_controllers_update_seconds);
@ -198,11 +198,11 @@ read_attribute(data_replicas_have);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(_name) \
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
BCH_TIME_STATS()
#undef BCH_TIME_STAT
#undef x
static struct attribute sysfs_state_rw = {
.name = "state",
@ -340,9 +340,11 @@ SHOW(bch2_fs)
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
sysfs_print(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
if (attr == &sysfs_rebalance_work)
return bch2_rebalance_work_show(c, buf);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@ -404,7 +406,7 @@ STORE(__bch2_fs)
}
if (attr == &sysfs_rebalance_enabled) {
ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
?: (ssize_t) size;
rebalance_wakeup(c);
@ -413,9 +415,7 @@ STORE(__bch2_fs)
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
@ -474,7 +474,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_rebalance_percent,
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
@ -513,8 +512,11 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_prune_cache,
&sysfs_copy_gc_enabled,
&sysfs_rebalance_enabled,
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@ -613,11 +615,12 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define BCH_TIME_STAT(name) \
#define x(name) \
if (attr == &sysfs_time_stat_##name) \
return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
buf, PAGE_SIZE);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
#undef x
return 0;
}
@ -629,10 +632,10 @@ STORE(bch2_fs_time_stats)
SYSFS_OPS(bch2_fs_time_stats);
struct attribute *bch2_fs_time_stats_files[] = {
#define BCH_TIME_STAT(name) \
#define x(name) \
&sysfs_time_stat_##name,
BCH_TIME_STATS()
#undef BCH_TIME_STAT
#undef x
NULL
};
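The sysfs conversion above switches the per-filesystem time statistics from individually named struct members to a c->times[] array indexed by a BCH_TIME_* enum, with both the enum and the attribute list generated from a single x-macro list. A minimal standalone sketch of that pattern; the real BCH_TIME_STATS() list is defined elsewhere in the tree and is longer, and the time_stat_names[] array is only for illustration:

#include <stdio.h>

#define BCH_TIME_STATS()	\
	x(journal_write)	\
	x(data_read)

enum {
#define x(name)	BCH_TIME_##name,
	BCH_TIME_STATS()
#undef x
	BCH_TIME_STAT_NR
};

static const char *time_stat_names[] = {
#define x(name)	#name,
	BCH_TIME_STATS()
#undef x
};

int main(void)
{
	/* the enum indices and the name strings can never drift apart */
	for (int i = 0; i < BCH_TIME_STAT_NR; i++)
		printf("%d: %s\n", i, time_stat_names[i]);
	return 0;
}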

View File

@ -1,259 +0,0 @@
#include "bcachefs.h"
#include "alloc.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
static inline bool rebalance_ptr_pred(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
struct bch_io_opts *io_opts)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (io_opts->background_target &&
!dev_in_target(ca, io_opts->background_target) &&
!ptr->cached)
return true;
if (io_opts->background_compression &&
crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
return true;
return false;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bkey_s_c_extent e;
if (!bkey_extent_is_data(k.k))
return;
if (!io_opts->background_target &&
!io_opts->background_compression)
return;
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (!atomic64_add_return(crc.compressed_size,
&ca->rebalance_work))
rebalance_wakeup(c);
}
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
rebalance_wakeup(c);
}
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return DATA_SKIP;
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
goto found;
return DATA_SKIP;
found:
data_opts->target = io_opts->background_target;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
struct rebalance_work {
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { 0 };
unsigned i;
for_each_online_member(ca, c, i) {
u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket);
u64 work = atomic64_read(&ca->rebalance_work) +
atomic64_read(&c->rebalance_work_unknown_dev);
unsigned percent_full = div_u64(work * 100, capacity);
if (percent_full > ret.dev_most_full_percent) {
ret.dev_most_full_percent = percent_full;
ret.dev_most_full_work = work;
ret.dev_most_full_capacity = capacity;
}
ret.total_work += atomic64_read(&ca->rebalance_work);
}
ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
atomic64_set(&c->rebalance_work_unknown_dev, 0);
}
static unsigned long curr_cputime(void)
{
u64 utime, stime;
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
set_freezable();
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(c->rebalance_enabled)) {
struct bch_move_stats move_stats = { 0 };
w = rebalance_work(c);
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
if (!w.total_work) {
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
if (w.dev_most_full_percent < 20 &&
prev_run_cputime * 5 > prev_run_time) {
if (w.dev_most_full_capacity) {
bch2_kthread_io_clock_wait(clock,
atomic_long_read(&clock->now) +
div_u64(w.dev_most_full_capacity, 5));
} else {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
break;
schedule_timeout(prev_run_cputime * 5 -
prev_run_time);
continue;
}
}
/* minimum 1 mb/sec: */
c->rebalance_pd.rate.rate =
max_t(u64, 1 << 11,
c->rebalance_pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
rebalance_work_reset(c);
bch2_move_data(c, &c->rebalance_pd.rate,
writepoint_ptr(&c->rebalance_write_point),
POS_MIN, POS_MAX,
rebalance_pred, NULL,
&move_stats);
}
return 0;
}
void bch2_rebalance_stop(struct bch_fs *c)
{
struct task_struct *p;
c->rebalance_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&c->rebalance_pd.rate);
p = c->rebalance_thread;
c->rebalance_thread = NULL;
if (p) {
/* for synchronizing with rebalance_wakeup() */
synchronize_rcu();
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
if (c->opts.nochanges)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
rcu_assign_pointer(c->rebalance_thread, p);
wake_up_process(c->rebalance_thread);
return 0;
}
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance_pd);
atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
}

View File

@ -203,7 +203,7 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
void bch2_quantiles_update(struct quantiles *q, u64 v)
static void bch2_quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
@ -569,6 +569,23 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
{
int i;
struct bio_vec *bv;
bio_for_each_segment_all(bv, bio, i) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
__free_page(bv->bv_page);
return -ENOMEM;
}
}
return 0;
}
size_t bch2_rand_range(size_t max)
{
size_t rand;
@ -771,20 +788,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
}
}
void mempool_free_vp(void *element, void *pool_data)
static void mempool_free_vp(void *element, void *pool_data)
{
size_t size = (size_t) pool_data;
vpfree(element, size);
}
void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t) pool_data;
return vpmalloc(size, gfp_mask);
}
int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return size < PAGE_SIZE
? mempool_init_kmalloc_pool(pool, min_nr, size)
: mempool_init(pool, min_nr, mempool_alloc_vp,
mempool_free_vp, (void *) size);
}
#if 0
void eytzinger1_test(void)
{

View File

@ -68,9 +68,9 @@ struct closure;
#define __flatten
#endif
#ifdef __LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define CPU_BIG_ENDIAN 0
#else
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define CPU_BIG_ENDIAN 1
#endif
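The endianness check above is another big-endian fix. In userspace, <endian.h> defines both __LITTLE_ENDIAN and __BIG_ENDIAN as plain numeric constants on every host, so an #ifdef on __LITTLE_ENDIAN does not actually detect byte order; that is presumably why the check now uses the compiler-predefined __BYTE_ORDER__ macro (GCC/Clang). A standalone check along the same lines:

#include <stdio.h>

int main(void)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	puts("little-endian build");
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	puts("big-endian build");
#else
#error "unknown byte order"
#endif
	return 0;
}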
@ -113,14 +113,7 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
: vpmalloc(size, gfp_mask);
}
void mempool_free_vp(void *element, void *pool_data);
void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);
static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
{
return mempool_init(pool, min_nr, mempool_alloc_vp,
mempool_free_vp, (void *) size);
}
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
#define HEAP(type) \
struct { \
@ -610,6 +603,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch2_bio_map(struct bio *bio, void *base);
int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{

View File

@ -5,8 +5,8 @@
#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "rebalance.h"
#include "str_hash.h"
#include "tier.h"
#include "xattr.h"
#include <linux/dcache.h>

View File

@ -40,14 +40,22 @@ void schedule(void)
v, NULL, NULL, 0);
}
static void process_timeout(unsigned long __data)
struct process_timer {
struct timer_list timer;
struct task_struct *task;
};
static void process_timeout(struct timer_list *t)
{
wake_up_process((struct task_struct *)__data);
struct process_timer *timeout =
container_of(t, struct process_timer, timer);
wake_up_process(timeout->task);
}
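This conversion follows the newer kernel timer API (timer_setup(), introduced around v4.14/v4.15): the callback receives a pointer to the timer itself and recovers its containing structure with container_of(), instead of being handed an opaque unsigned long data value. A standalone illustration of the container_of() pattern, using hypothetical types rather than the shim's struct timer_list:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct timer {
	int pending;
};

struct process_timer {
	struct timer	timer;		/* embedded, like timer_list in the patch */
	const char	*task_name;
};

static void timeout_cb(struct timer *t)
{
	/* recover the enclosing structure from the embedded member */
	struct process_timer *p = container_of(t, struct process_timer, timer);

	printf("waking %s\n", p->task_name);
}

int main(void)
{
	struct process_timer pt = { .task_name = "worker" };

	timeout_cb(&pt.timer);		/* simulate the timer firing */
	return 0;
}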
long schedule_timeout(long timeout)
{
struct timer_list timer;
struct process_timer timer;
unsigned long expire;
switch (timeout)
@ -80,10 +88,11 @@ long schedule_timeout(long timeout)
expire = timeout + jiffies;
setup_timer(&timer, process_timeout, (unsigned long)current);
mod_timer(&timer, expire);
timer.task = current;
timer_setup_on_stack(&timer.timer, process_timeout, 0);
mod_timer(&timer.timer, expire);
schedule();
del_timer_sync(&timer);
del_timer_sync(&timer.timer);
timeout = expire - jiffies;
out:

View File

@ -273,7 +273,7 @@ static int timer_thread(void *arg)
BUG_ON(!timer_running());
pthread_mutex_unlock(&timer_lock);
timer->function(timer->data);
timer->function(timer);
pthread_mutex_lock(&timer_lock);
timer_seq++;

View File

@ -55,9 +55,10 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
return ret;
}
void delayed_work_timer_fn(unsigned long __data)
void delayed_work_timer_fn(struct timer_list *timer)
{
struct delayed_work *dwork = (struct delayed_work *) __data;
struct delayed_work *dwork =
container_of(timer, struct delayed_work, timer);
pthread_mutex_lock(&wq_lock);
__queue_work(dwork->wq, &dwork->work);
@ -71,8 +72,7 @@ static void __queue_delayed_work(struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
BUG_ON(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
BUG_ON(timer->function != delayed_work_timer_fn);
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));