diff --git a/.bcachefs_revision b/.bcachefs_revision index 37d51b2f..e267faa6 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -ed4aea2ad4fa1b3891684cbd071d1a1ae9094342 +0906b1fb492e8e84f563b192fd8f458af1c1d420 diff --git a/bcachefs.c b/bcachefs.c index 53806f39..1c56ead7 100644 --- a/bcachefs.c +++ b/bcachefs.c @@ -36,10 +36,12 @@ static void usage(void) " fsck Check an existing filesystem for errors\n" "\n" "Startup/shutdown, assembly of multi device filesystems:\n" +#if 0 " assemble Assemble an existing multi device filesystem\n" " incremental Incrementally assemble an existing multi device filesystem\n" " run Start a partially assembled filesystem\n" " stop Stop a running filesystem\n" +#endif "\n" "Commands for managing a running filesystem:\n" " fs usage Show disk usage\n" @@ -150,6 +152,7 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "fsck")) return cmd_fsck(argc, argv); +#if 0 if (!strcmp(cmd, "assemble")) return cmd_assemble(argc, argv); if (!strcmp(cmd, "incremental")) @@ -158,6 +161,7 @@ int main(int argc, char *argv[]) return cmd_run(argc, argv); if (!strcmp(cmd, "stop")) return cmd_stop(argc, argv); +#endif if (!strcmp(cmd, "fs")) return fs_cmds(argc, argv); diff --git a/cmd_assemble.c b/cmd_assemble.c index 57b28026..a997e1e1 100644 --- a/cmd_assemble.c +++ b/cmd_assemble.c @@ -11,6 +11,7 @@ #include "cmds.h" #include "libbcachefs.h" +#if 0 int cmd_assemble(int argc, char *argv[]) { unsigned nr_devs = argc - 1; @@ -26,7 +27,7 @@ int cmd_assemble(int argc, char *argv[]) unsigned i; for (i = 0; i < nr_devs; i++) - assemble->devs[i] = (__u64) argv[i + 1]; + assemble->devs[i] = (unsigned long) argv[i + 1]; xioctl(bcachectl_open(), BCH_IOCTL_ASSEMBLE, assemble); return 0; @@ -38,9 +39,10 @@ int cmd_incremental(int argc, char *argv[]) die("Please supply exactly one device"); struct bch_ioctl_incremental incremental = { - .dev = (__u64) argv[1], + .dev = (unsigned long) argv[1], }; xioctl(bcachectl_open(), BCH_IOCTL_INCREMENTAL, &incremental); return 0; } +#endif diff --git a/cmd_debug.c b/cmd_debug.c index 6c2b3184..11d73b35 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -10,6 +10,7 @@ #include "libbcachefs/bcachefs.h" #include "libbcachefs/alloc.h" +#include "libbcachefs/bset.h" #include "libbcachefs/btree_cache.h" #include "libbcachefs/btree_iter.h" #include "libbcachefs/buckets.h" diff --git a/cmd_run.c b/cmd_run.c index 673d519a..1bf84e5c 100644 --- a/cmd_run.c +++ b/cmd_run.c @@ -15,6 +15,7 @@ #include "cmds.h" #include "libbcachefs.h" +#if 0 int cmd_run(int argc, char *argv[]) { return 0; @@ -29,3 +30,4 @@ int cmd_stop(int argc, char *argv[]) xioctl(fs.ioctl_fd, BCH_IOCTL_STOP); return 0; } +#endif diff --git a/cmds.h b/cmds.h index 6d21db6f..258a823d 100644 --- a/cmds.h +++ b/cmds.h @@ -12,10 +12,12 @@ int cmd_format(int argc, char *argv[]); int cmd_show_super(int argc, char *argv[]); +#if 0 int cmd_assemble(int argc, char *argv[]); int cmd_incremental(int argc, char *argv[]); int cmd_run(int argc, char *argv[]); int cmd_stop(int argc, char *argv[]); +#endif int cmd_fs_usage(int argc, char *argv[]); diff --git a/include/linux/timer.h b/include/linux/timer.h index 363f26a4..9667acf9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -6,27 +6,22 @@ struct timer_list { unsigned long expires; - void (*function)(unsigned long); - unsigned long data; + void (*function)(struct timer_list *timer); bool pending; }; -static inline void init_timer(struct timer_list *timer) +static inline void timer_setup(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags) { memset(timer, 0, sizeof(*timer)); + timer->function = func; } -#define __init_timer(_timer, _flags) init_timer(_timer) +#define timer_setup_on_stack(timer, callback, flags) \ + timer_setup(timer, callback, flags) -#define __setup_timer(_timer, _fn, _data, _flags) \ - do { \ - __init_timer((_timer), (_flags)); \ - (_timer)->function = (_fn); \ - (_timer)->data = (_data); \ - } while (0) - -#define setup_timer(timer, fn, data) \ - __setup_timer((timer), (fn), (data), 0) +#define destroy_timer_on_stack(timer) do {} while (0) static inline int timer_pending(const struct timer_list *timer) { @@ -36,8 +31,9 @@ static inline int timer_pending(const struct timer_list *timer) int del_timer(struct timer_list * timer); int del_timer_sync(struct timer_list *timer); +#define del_singleshot_timer_sync(timer) del_timer_sync(timer) + int mod_timer(struct timer_list *timer, unsigned long expires); -//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); static inline void add_timer(struct timer_list *timer) {
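The timer.h shim above tracks the kernel's timer_setup() conversion: the callback now receives the timer_list itself, and per-object state is recovered with container_of() instead of being smuggled through the old unsigned long data field. A minimal sketch of the resulting pattern (struct foo and its functions are illustrative only, not from this patch; container_of() is assumed from the tools' shim headers):

    struct foo {
            struct timer_list       timer;
            unsigned long           seq;
    };

    /* New-style callback: receives the timer, not an unsigned long */
    static void foo_timeout(struct timer_list *t)
    {
            /* Recover the object the timer is embedded in */
            struct foo *f = container_of(t, struct foo, timer);

            f->seq++;
    }

    static void foo_init(struct foo *f)
    {
            f->seq = 0;
            /* Replaces setup_timer(&f->timer, fn, (unsigned long) f) */
            timer_setup(&f->timer, foo_timeout, 0);
    }

The same conversion shows up in the workqueue.h hunk below, where INIT_DELAYED_WORK() recovers its delayed_work from the timer rather than passing it as data.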
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 213562f2..1406c958 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -8,7 +8,7 @@ struct task_struct; struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); -void delayed_work_timer_fn(unsigned long __data); +void delayed_work_timer_fn(struct timer_list *); #define work_data_bits(work) ((unsigned long *)(&(work)->data)) @@ -44,9 +44,7 @@ struct delayed_work { #define INIT_DELAYED_WORK(_work, _func) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ - __setup_timer(&(_work)->timer, delayed_work_timer_fn, \ - (unsigned long)(_work), \ - TIMER_IRQSAFE); \ + timer_setup(&(_work)->timer, delayed_work_timer_fn, 0); \ } while (0) static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 256adb51..44f9479e 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -1393,12 +1393,10 @@ static void writepoint_drop_ptrs(struct bch_fs *c, { int i; - for (i = wp->first_ptr - 1; i >= 0; --i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - if (dev_in_target(ca, target) == in_target) + for (i = wp->first_ptr - 1; i >= 0; --i) + if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, + target) == in_target) writepoint_drop_ptr(c, wp, i); - } } static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) @@ -1555,7 +1553,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, /* does writepoint have ptrs we don't want to use? 
*/ if (target) writepoint_for_each_ptr(wp, ob, i) - if (!dev_idx_in_target(c, ob->ptr.dev, target)) { + if (!bch2_dev_in_target(c, ob->ptr.dev, target)) { swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); wp->first_ptr++; } @@ -1590,7 +1588,8 @@ alloc_done: * one in the target we want: */ if (cache_idx >= 0) { - if (!dev_in_target(ca, target)) { + if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, + target)) { writepoint_drop_ptr(c, wp, i); } else { writepoint_drop_ptr(c, wp, cache_idx); @@ -1621,7 +1620,7 @@ alloc_done: if (ca->mi.durability && ca->mi.durability <= nr_ptrs_effective - nr_replicas && - !dev_idx_in_target(c, ob->ptr.dev, target)) { + !bch2_dev_in_target(c, ob->ptr.dev, target)) { swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); wp->first_ptr++; nr_ptrs_effective -= ca->mi.durability; @@ -1890,8 +1889,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) /* stop allocator thread: */ void bch2_dev_allocator_stop(struct bch_dev *ca) { - struct task_struct *p = ca->alloc_thread; + struct task_struct *p; + p = rcu_dereference_protected(ca->alloc_thread, 1); ca->alloc_thread = NULL; /* @@ -1926,7 +1926,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return PTR_ERR(p); get_task_struct(p); - ca->alloc_thread = p; + rcu_assign_pointer(ca->alloc_thread, p); wake_up_process(p); return 0; } @@ -2099,7 +2099,7 @@ again: if (btree_node_dirty(b) && (!b->written || b->level)) { if (btree_node_may_write(b)) { rcu_read_unlock(); - six_lock_read(&b->lock); + btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write(c, b, SIX_LOCK_read); six_unlock_read(&b->lock); goto again; diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 372cc047..00d01f46 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -103,7 +103,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) struct task_struct *p; rcu_read_lock(); - if ((p = READ_ONCE(ca->alloc_thread))) + p = rcu_dereference(ca->alloc_thread); + if (p) wake_up_process(p); rcu_read_unlock(); } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 206c30f4..879bde20 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -197,7 +197,6 @@ #include #include "bcachefs_format.h" -#include "bset.h" #include "fifo.h" #include "opts.h" #include "util.h" @@ -271,26 +270,38 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -#define BCH_TIME_STATS() \ - BCH_TIME_STAT(btree_node_mem_alloc) \ - BCH_TIME_STAT(btree_gc) \ - BCH_TIME_STAT(btree_split) \ - BCH_TIME_STAT(btree_sort) \ - BCH_TIME_STAT(btree_read) \ - BCH_TIME_STAT(data_write) \ - BCH_TIME_STAT(data_read) \ - BCH_TIME_STAT(data_promote) \ - BCH_TIME_STAT(journal_write) \ - BCH_TIME_STAT(journal_delay) \ - BCH_TIME_STAT(journal_blocked) \ - BCH_TIME_STAT(journal_flush_seq) +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_gc) \ + x(btree_split) \ + x(btree_sort) \ + x(btree_read) \ + x(btree_lock_contended_read) \ + x(btree_lock_contended_intent) \ + x(btree_lock_contended_write) \ + x(data_write) \ + x(data_read) \ + x(data_promote) \ + x(journal_write) \ + x(journal_delay) \ + x(journal_blocked) \ + x(journal_flush_seq) + +enum bch_time_stats { +#define x(name) BCH_TIME_##name, + BCH_TIME_STATS() +#undef x + BCH_TIME_STAT_NR +};
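The switch to an x-macro here lets the same BCH_TIME_STATS() list expand once into the enum above and again, later in this header, into the struct time_stats times[BCH_TIME_STAT_NR] array that the enum indexes. A small illustration of the idiom with a made-up list (not from this patch):

    #define COLORS()        \
            x(red)          \
            x(green)        \
            x(blue)

    enum color {
    #define x(n)    COLOR_##n,
            COLORS()
    #undef x
            COLOR_NR
    };

    /* Expand the same list again to get a matching name table: */
    static const char * const color_names[] = {
    #define x(n)    #n,
            COLORS()
    #undef x
    };

The payoff is that adding one x(...) entry updates every expansion site at once, which is why the new lock-contention stats below could be added without touching each consumer.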
#include "alloc_types.h" +#include "btree_types.h" #include "buckets_types.h" #include "clock_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" +#include "rebalance_types.h" #include "super_types.h" /* @@ -372,7 +383,7 @@ struct bch_dev { struct bch_dev_usage usage_cached; /* Allocator: */ - struct task_struct *alloc_thread; + struct task_struct __rcu *alloc_thread; /* * free: Buckets that are ready to be used @@ -447,7 +458,6 @@ enum { /* shutdown: */ BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, - BCH_FS_GC_STOPPING, /* errors: */ BCH_FS_ERROR, @@ -570,12 +580,6 @@ struct bch_fs { struct delayed_work pd_controllers_update; unsigned pd_controllers_update_seconds; - /* REBALANCE */ - struct task_struct *rebalance_thread; - struct bch_pd_controller rebalance_pd; - - atomic64_t rebalance_work_unknown_dev; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; u64 capacity; /* sectors */ @@ -664,6 +668,9 @@ struct bch_fs { atomic64_t key_version; + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; @@ -714,18 +721,13 @@ struct bch_fs { unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; - unsigned rebalance_enabled:1; - unsigned rebalance_percent; bool promote_whole_extents; #define BCH_DEBUG_PARAM(name, description) bool name; BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#define BCH_TIME_STAT(name) \ - struct time_stats name##_time; - BCH_TIME_STATS() -#undef BCH_TIME_STAT + struct time_stats times[BCH_TIME_STAT_NR]; }; static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 48d14a30..ab8b9446 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -3,6 +3,72 @@ /* * bcachefs on disk data structures + * + * OVERVIEW: + * + * There are three main types of on disk data structures in bcachefs (this is + * reduced from 5 in bcache) + * + * - superblock + * - journal + * - btree + * + * The btree is the primary structure; most metadata exists as keys in the + * various btrees. There are only a small number of btrees, they're not + * sharded - we have one btree for extents, another for inodes, et cetera. + * + * SUPERBLOCK: + * + * The superblock contains the location of the journal, the list of devices in + * the filesystem, and in general any metadata we need in order to decide + * whether we can start a filesystem, or that we need prior to reading the + * journal/btree roots. + * + * The superblock is extensible, and most of the contents of the superblock are + * in variable length, type tagged fields; see struct bch_sb_field. + * + * Backup superblocks do not reside in a fixed location; also, superblocks do + * not have a fixed size. To locate backup superblocks we have struct + * bch_sb_layout; we store a copy of this inside every superblock, and also + * before the first superblock. + * + * JOURNAL: + * + * The journal primarily records btree updates in the order they occurred; + * journal replay consists of just iterating over all the keys in the open + * journal entries and re-inserting them into the btrees. + * + * The journal also contains entry types for the btree roots, and blacklisted + * journal sequence numbers (see journal_seq_blacklist.c). + * + * BTREE: + * + * bcachefs btrees are copy on write b+ trees, where nodes are big (typically + * 128k-256k) and log structured. We use struct btree_node for writing the first + * entry in a given node (offset 0), and struct btree_node_entry for all + * subsequent writes. + * + * After the header, btree node entries contain a list of keys in sorted order. 
+ * Values are stored inline with the keys; since values are variable length (and + * keys effectively are variable length too, due to packing) we can't do random + * access without building up additional in memory tables in the btree node read + * path. + * + * BTREE KEYS (struct bkey): + * + * The various btrees share a common format for the key - so as to avoid + * switching in fastpath lookup/comparison code - but define their own + * structures for the key values. + * + * The size of a key/value pair is stored as a u8 in units of u64s, so the max + * size is just under 2k. The common part also contains a type tag for the + * value, and a format field indicating whether the key is packed or not (and + * also meant to allow adding new key fields in the future, if desired). + * + * bkeys, when stored within a btree node, may also be packed. In that case, the + * bkey_format in that node is used to unpack it. Packed bkeys mean that we can + * be generous with field sizes in the common part of the key format (64 bit + * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. */ #include @@ -44,12 +110,19 @@ struct bkey_format { /* Btree keys - all units are in sectors */ struct bpos { - /* Word order matches machine byte order */ -#if defined(__LITTLE_ENDIAN) + /* + * Word order matches machine byte order - btree code treats a bpos as a + * single large integer, for search/comparison purposes + * + * Note that wherever a bpos is embedded in another on disk data + * structure, it has to be byte swabbed when reading in metadata that + * wasn't written in native endian order: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ __u32 snapshot; __u64 offset; __u64 inode; -#elif defined(__BIG_ENDIAN) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ __u64 inode; __u64 offset; /* Points to end of extent - sectors */ __u32 snapshot; @@ -83,10 +156,10 @@ struct bch_val { }; struct bversion { -#if defined(__LITTLE_ENDIAN) +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ __u64 lo; __u32 hi; -#elif defined(__BIG_ENDIAN) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ __u32 hi; __u64 lo; #endif @@ -110,13 +183,13 @@ struct bkey { /* Type of the value */ __u8 type; -#if defined(__LITTLE_ENDIAN) +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ __u8 pad[1]; struct bversion version; __u32 size; /* extent size, in sectors */ struct bpos p; -#elif defined(__BIG_ENDIAN) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ struct bpos p; __u32 size; /* extent size, in sectors */ struct bversion version; @@ -275,10 +348,10 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); * * If an extent is not checksummed or compressed, when the extent is trimmed we * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the start of the data that - * is currently live. The size field in struct bkey records the current (live) - * size of the extent, and is also used to mean "size of region on disk that we - * point to" in this case. + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. 
* Thus an extent that is not checksummed or compressed will consist only of a * list of bch_extent_ptrs, with none of the fields in @@ -446,11 +519,11 @@ struct bch_extent_crc128 { #elif defined (__BIG_ENDIAN_BITFIELD) __u64 compression_type:4, csum_type:4, - nonce:14, + nonce:13, offset:13, _uncompressed_size:13, _compressed_size:13, - type:3; + type:4; #endif struct bch_csum csum; } __attribute__((packed, aligned(8))); @@ -496,7 +569,7 @@ struct bch_extent_reservation { }; union bch_extent_entry { -#if defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 unsigned long type; #elif __BITS_PER_LONG == 32 struct { @@ -551,10 +624,11 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION); sizeof(struct bch_extent_ptr)) / sizeof(u64)) /* Maximum possible size of an entire extent value: */ -/* There's a hack in the keylist code that needs to be fixed.. */ #define BKEY_EXTENT_VAL_U64s_MAX \ (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) +#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) + /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) @@ -1378,33 +1452,4 @@ struct btree_node_entry { }; } __attribute__((packed, aligned(8))); -/* Obsolete: */ - -struct prio_set { - struct bch_csum csum; - - __le64 magic; - __le32 nonce[3]; - __le16 version; - __le16 flags; - - __u8 encrypted_start[0]; - - __le64 next_bucket; - - struct bucket_disk { - __le16 prio[2]; - __u8 gen; - } __attribute__((packed)) data[]; -} __attribute__((packed, aligned(8))); - -LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); - -#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) - -static inline __u64 __pset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC); -} - #endif /* _BCACHEFS_FORMAT_H */ diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 6578847b..73e5d887 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -5,6 +5,9 @@ #include #include "bcachefs_format.h" +/* + * Flags common to multiple ioctls: + */ #define BCH_FORCE_IF_DATA_LOST (1 << 0) #define BCH_FORCE_IF_METADATA_LOST (1 << 1) #define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) @@ -14,12 +17,23 @@ (BCH_FORCE_IF_DATA_DEGRADED| \ BCH_FORCE_IF_METADATA_DEGRADED) +/* + * If cleared, ioctls that refer to a device pass it as a pointer to a pathname + * (e.g. /dev/sda1); if set, the dev field is the device's index within the + * filesystem: + */ #define BCH_BY_INDEX (1 << 4) +/* + * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem + * wide superblock: + */ #define BCH_READ_DEV (1 << 5)
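These two flags, together with the BCH_FORCE_IF_* set above, are consumed by most of the disk ioctls below. A hedged sketch of the two ways the dev field can be filled in (the index and path are made up; struct bch_ioctl_disk and BCH_IOCTL_DISK_OFFLINE are defined further down, and xioctl()/fs.ioctl_fd follow the tools' usage seen in cmd_run.c above):

    /* By index: BCH_BY_INDEX set, dev is the slot within the filesystem */
    struct bch_ioctl_disk disk = {
            .flags  = BCH_BY_INDEX|BCH_FORCE_IF_DEGRADED,
            .dev    = 2,
    };

    xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_OFFLINE, &disk);

    /* By path: BCH_BY_INDEX clear, dev carries a pointer to a pathname,
       the same (unsigned long) cast cmd_assemble.c uses above */
    disk.flags      = 0;
    disk.dev        = (unsigned long) "/dev/sdb";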
/* global control dev: */ +/* These are currently broken, and probably unnecessary: */ +#if 0 #define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) #define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) @@ -35,12 +49,18 @@ struct bch_ioctl_incremental { __u64 pad; __u64 dev; }; +#endif /* filesystem ioctls: */ #define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) + +/* These only make sense when we also have incremental assembly */ +#if 0 #define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) #define BCH_IOCTL_STOP _IO(0xbc, 3) +#endif + #define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) #define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) #define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) @@ -52,14 +72,70 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize) +/* + * BCH_IOCTL_QUERY_UUID: get filesystem UUID + * + * Returns user visible UUID, not internal UUID (which may never be changed); + * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with + * this UUID. + */ struct bch_ioctl_query_uuid { uuid_le uuid; }; +#if 0 struct bch_ioctl_start { __u32 flags; __u32 pad; }; +#endif + +/* + * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem + * + * The specified device must not be open or in use. On success, the new device + * will be an online member of the filesystem just like any other member. + * + * The device must first be prepared by userspace by formatting it with a bcachefs + * superblock, which is only used for passing in superblock options/parameters + * for that device (in struct bch_member). The new device's superblock should + * not claim to be a member of any existing filesystem - UUIDs on it will be + * ignored. + */ + +/* + * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem + * + * Any data present on @dev will be permanently deleted, and @dev will be + * removed from its slot in the filesystem's list of member devices. The device + * may be either online or offline. + * + * Will fail if removing @dev would leave us with insufficient read write devices + * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are + * set. + */ + +/* + * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem + * but is not open (e.g. because we started in degraded mode), bring it online + * + * all existing data on @dev will be available once the device is online, + * exactly as if @dev was present when the filesystem was first mounted + */ + +/* + * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that + * block device, without removing it from the filesystem (so it can be brought + * back online later) + * + * Data present on @dev will be unavailable while @dev is offline (unless + * replicated), but will still be intact and untouched if @dev is brought back + * online + * + * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would + * leave us with insufficient read write devices or degraded/unavailable data, + * unless the appropriate BCH_FORCE_IF_* flags are set. 
+ */ struct bch_ioctl_disk { __u32 flags; @@ -67,6 +143,16 @@ __u64 dev; }; +/* + * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem + * + * @new_state - one of the bch_member_state states (rw, ro, failed, + * spare) + * + * Will refuse to change member state if we would then have insufficient devices + * to write to, or if it would result in degraded data (when @new_state is + * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. + */ struct bch_ioctl_disk_set_state { __u32 flags; __u8 new_state; @@ -81,6 +167,15 @@ enum bch_data_ops { BCH_DATA_OP_NR = 3, }; +/* + * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. + * scrub, rereplicate, migrate). + * + * This ioctl kicks off a job in the background, and returns a file descriptor. + * Reading from the file descriptor returns a struct bch_ioctl_data_event, + * indicating current progress, and closing the file descriptor will stop the + * job. The file descriptor is O_CLOEXEC. + */ struct bch_ioctl_data { __u32 op; __u32 flags; @@ -93,9 +188,18 @@ __u32 dev; __u32 pad; } migrate; + struct { + __u64 pad[8]; + }; }; } __attribute__((packed, aligned(8))); +enum bch_data_event { + BCH_DATA_EVENT_PROGRESS = 0, + /* XXX: add an event for reporting errors */ + BCH_DATA_EVENT_NR = 1, +}; + struct bch_ioctl_data_progress { __u8 data_type; __u8 btree_id; @@ -106,6 +210,15 @@ __u64 sectors_total; } __attribute__((packed, aligned(8))); +struct bch_ioctl_data_event { + __u8 type; + __u8 pad[7]; + union { + struct bch_ioctl_data_progress p; + __u64 pad2[15]; + }; +} __attribute__((packed, aligned(8))); + struct bch_ioctl_dev_usage { __u8 state; __u8 alive; @@ -127,6 +240,19 @@ struct bch_ioctl_fs_usage { __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX]; }; +/* + * BCH_IOCTL_USAGE: query filesystem disk space usage + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @nr_devices - number of devices userspace allocated space for in @devs + * + * On success, @fs and @devs will be filled out appropriately and devs[i].alive + * will indicate if a device was present in that slot + * + * Returns -ERANGE if @nr_devices was too small + */ struct bch_ioctl_usage { __u16 nr_devices; __u16 pad[3]; @@ -135,6 +261,20 @@ struct bch_ioctl_usage { struct bch_ioctl_dev_usage devs[0]; }; +/* + * BCH_IOCTL_READ_SUPER: read filesystem superblock + * + * Equivalent to reading the superblock directly from the block device, except + * avoids racing with the kernel writing the superblock or having to figure out + * which block device to read + * + * @sb - buffer to read into + * @size - size of userspace allocated buffer + * @dev - device to read superblock for, if BCH_READ_DEV flag is + * specified + * + * Returns -ERANGE if buffer provided is too small + */ struct bch_ioctl_read_super { __u32 flags; __u32 pad; @@ -143,10 +283,22 @@ struct bch_ioctl_read_super { __u64 sb; }; +/* + * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query filesystem to + * determine if disk is an (online) member - if so, returns device's index + * + * Returns -ENOENT if not found + */ struct bch_ioctl_disk_get_idx { __u64 dev; }; +/* + * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ struct bch_ioctl_disk_resize { __u32 flags; __u32 pad; 
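Because BCH_IOCTL_DATA hands back a file descriptor rather than blocking, monitoring a job is just a read loop over struct bch_ioctl_data_event records, mirroring the bch2_data_job_read() side in chardev.c further down. A rough userspace sketch (error handling elided; xioctl() and the job setup are assumed from the tools):

    #include <stdio.h>
    #include <unistd.h>

    static void watch_data_job(int fs_fd, struct bch_ioctl_data *job)
    {
            /* Kicks off the background job; returns an O_CLOEXEC event fd */
            int fd = xioctl(fs_fd, BCH_IOCTL_DATA, job);
            struct bch_ioctl_data_event e;

            while (read(fd, &e, sizeof(e)) == sizeof(e)) {
                    if (e.type != BCH_DATA_EVENT_PROGRESS)
                            continue;

                    printf("\r%llu/%llu sectors",
                           (unsigned long long) e.p.sectors_done,
                           (unsigned long long) e.p.sectors_total);
            }

            /* Closing the fd stops the job if it hasn't finished */
            close(fd);
    }

Note the read buffer must be at least sizeof(struct bch_ioctl_data_event); the chardev.c hunk below rejects shorter reads with -EINVAL, and the padded union leaves room for future event types.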
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index f665e2e1..2d6c8a23 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -13,8 +13,6 @@ void bch2_to_binary(char *, const u64 *, unsigned); -#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) - /* bkey with split value, const */ struct bkey_s_c { const struct bkey *k; @@ -590,25 +588,31 @@ BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); /* byte order helpers */ -#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return f->key_u64s - 1; +} + +#define high_bit_offset 0 +#define nth_word(p, n) ((p) - (n)) + +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return 0; +} + +#define high_bit_offset KEY_PACKED_BITS_START +#define nth_word(p, n) ((p) + (n)) + +#else #error edit for your odd byteorder. #endif -#ifdef __LITTLE_ENDIAN - -#define high_bit_offset 0 -#define __high_word(u64s, k) ((k)->_data + (u64s) - 1) -#define nth_word(p, n) ((p) - (n)) - -#else - -#define high_bit_offset KEY_PACKED_BITS_START -#define __high_word(u64s, k) ((k)->_data) -#define nth_word(p, n) ((p) + (n)) - -#endif - -#define high_word(format, k) __high_word((format)->key_u64s, k) +#define high_word(f, k) ((k)->_data + high_word_offset(f)) #define next_word(p) nth_word(p, 1) #define prev_word(p) nth_word(p, -1) diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 92046ae4..9a274774 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -6,6 +6,7 @@ */ #include "bcachefs.h" +#include "btree_cache.h" #include "bset.h" #include "eytzinger.h" #include "util.h" @@ -438,6 +439,10 @@ void bch2_btree_keys_free(struct btree *b) b->aux_data = NULL; } +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) { b->page_order = page_order; @@ -672,7 +677,7 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, * (and then the bits we want are at the high end, so we shift them * back down): */ -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ v >>= f->exponent & 7; #else v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 
32 : 16); @@ -761,7 +766,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * Then we calculate the actual shift value, from the start of the key * (k->_data), to get the key bits starting at exponent: */ -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; EBUG_ON(shift + bits > b->format.key_u64s * 64); @@ -964,10 +969,14 @@ void bch2_bset_init_first(struct btree *b, struct bset *i) set_btree_bset(b, t, i); } -void bch2_bset_init_next(struct btree *b, struct bset *i) +void bch2_bset_init_next(struct bch_fs *c, struct btree *b, + struct btree_node_entry *bne) { + struct bset *i = &bne->keys; struct bset_tree *t; + BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); BUG_ON(b->nsets >= MAX_BSETS); memset(i, 0, sizeof(*i)); diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index cc4ea5d8..153e2b3f 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -157,9 +157,6 @@ static inline bool btree_keys_expensive_checks(const struct btree *b) #endif } -struct btree_node_iter; -struct btree_node_iter_set; - enum bset_aux_tree_type { BSET_NO_AUX_TREE, BSET_RO_AUX_TREE, @@ -342,7 +339,8 @@ int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); void bch2_btree_keys_init(struct btree *, bool *); void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct btree *, struct bset *); +void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *, struct bkey_packed *); @@ -420,14 +418,6 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, /* Btree key iteration */ -struct btree_node_iter { - u8 is_extents; - - struct btree_node_iter_set { - u16 k, end; - } data[MAX_BSETS]; -}; - static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter, bool is_extents) { diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 469f8565..c950f256 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -554,7 +554,8 @@ out: b->uncompacted_whiteout_u64s = 0; bch2_btree_keys_init(b, &c->expensive_debug_checks); - bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); return b; err: diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index ad51f29c..cd5ebfbe 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include struct range_checks { @@ -264,10 +265,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) gc_pos_set(c, gc_pos_btree_node(b)); - if (max_stale > 32) + if (max_stale > 64) bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!btree_gc_rewrite_disabled(c) && (btree_gc_always_rewrite(c) || max_stale > 16)) @@ -557,7 +559,7 @@ void bch2_gc(struct bch_fs *c) out: up_write(&c->gc_lock); trace_gc_end(c); - bch2_time_stats_update(&c->btree_gc_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); /* * Wake up allocator in case it was waiting for buckets @@ -813,6 +815,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) { struct 
btree_iter iter; struct btree *b; + bool kthread = (current->flags & PF_KTHREAD) != 0; unsigned i; /* Sliding window of adjacent btree nodes */ @@ -859,7 +862,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) lock_seq[0] = merge[0]->lock.state.seq; - if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) { + if (kthread && kthread_should_stop()) { bch2_btree_iter_unlock(&iter); return -ESHUTDOWN; } @@ -958,13 +961,15 @@ static int bch2_gc_thread(void *arg) void bch2_gc_thread_stop(struct bch_fs *c) { - set_bit(BCH_FS_GC_STOPPING, &c->flags); - - if (c->gc_thread) - kthread_stop(c->gc_thread); + struct task_struct *p; + p = c->gc_thread; c->gc_thread = NULL; - clear_bit(BCH_FS_GC_STOPPING, &c->flags); + + if (p) { + kthread_stop(p); + put_task_struct(p); + } } int bch2_gc_thread_start(struct bch_fs *c) @@ -973,12 +978,13 @@ int bch2_gc_thread_start(struct bch_fs *c) BUG_ON(c->gc_thread); - p = kthread_create(bch2_gc_thread, c, "bcache_gc"); + p = kthread_create(bch2_gc_thread, c, "bch_gc"); if (IS_ERR(p)) return PTR_ERR(p); + get_task_struct(p); c->gc_thread = p; - wake_up_process(c->gc_thread); + wake_up_process(p); return 0; } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 1aa94229..74ffad4c 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -627,7 +627,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); if (sorting_entire_node) - bch2_time_stats_update(&c->btree_sort_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], + start_time); /* Make sure we preserve bset journal_seq: */ for (t = b->set + start_idx; t < b->set + end_idx; t++) @@ -801,7 +802,7 @@ void bch2_btree_sort_into(struct bch_fs *c, &dst->format, true); - bch2_time_stats_update(&c->btree_sort_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time); set_btree_bset_end(dst, dst->set); @@ -877,7 +878,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(b, &bne->keys); + bch2_bset_init_next(c, b, bne); bch2_btree_build_aux_trees(b); @@ -1382,7 +1383,7 @@ start: } } - bch2_time_stats_update(&c->btree_read_time, rb->start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time); bio_put(&rb->bio); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1742,6 +1743,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON((b->will_make_reachable != 0) != !b->written); BUG_ON(b->written >= c->opts.btree_node_size); + BUG_ON(b->written & (c->opts.block_size - 1)); BUG_ON(bset_written(b, btree_bset_last(b))); BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); @@ -1972,7 +1974,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(b, &bne->keys); + bch2_bset_init_next(c, b, bne); bch2_btree_build_aux_trees(b); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 947685f9..fa154642 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -133,7 +133,7 @@ do { \ \ six_unlock_read(&(_b)->lock); \ btree_node_wait_on_io(_b); \ - six_lock_read(&(_b)->lock); \ + btree_node_lock_type(c, b, SIX_LOCK_read); \ } \ } while (0) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 69cad3bb..70c3132e 100644 --- 
a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -42,37 +42,28 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) six_unlock_write(&b->lock); } -void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { + struct bch_fs *c = iter->c; struct btree_iter *linked; unsigned readers = 0; - EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); - - if (six_trylock_write(&b->lock)) - return; - for_each_linked_btree_iter(iter, linked) if (linked->l[b->level].b == b && btree_node_read_locked(linked, b->level)) readers++; - if (likely(!readers)) { - six_lock_write(&b->lock); - } else { - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->lock.state.counter); - six_lock_write(&b->lock); - atomic64_add(__SIX_VAL(read_lock, readers), - &b->lock.state.counter); - } + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + atomic64_sub(__SIX_VAL(read_lock, readers), + &b->lock.state.counter); + btree_node_lock_type(c, b, SIX_LOCK_write); + atomic64_add(__SIX_VAL(read_lock, readers), + &b->lock.state.counter); } bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) @@ -135,6 +126,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *iter, enum six_lock_type type) { + struct bch_fs *c = iter->c; struct btree_iter *linked; /* Can't have children locked before ancestors: */ @@ -206,7 +198,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } - six_lock_type(&b->lock, type); + __btree_node_lock_type(c, b, type); return true; } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 95191ba2..0097a2a2 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -4,72 +4,6 @@ #include #include "btree_types.h" -#include "bset.h" - -#define BTREE_ITER_SLOTS (1 << 0) -#define BTREE_ITER_INTENT (1 << 1) -#define BTREE_ITER_PREFETCH (1 << 2) -/* - * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for - * @pos or the first key strictly greater than @pos - */ -#define BTREE_ITER_IS_EXTENTS (1 << 3) -/* - * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: - */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) -#define BTREE_ITER_ERROR (1 << 5) - -enum btree_iter_uptodate { - BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_PEEK = 1, - BTREE_ITER_NEED_RELOCK = 2, - BTREE_ITER_NEED_TRAVERSE = 3, - BTREE_ITER_END = 4, -}; - -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - struct bch_fs *c; - struct bpos pos; - - u8 flags; - unsigned uptodate:4; - enum btree_id btree_id:4; - unsigned level:4, - locks_want:4, - nodes_locked:4, - nodes_intent_locked:4; - - struct btree_iter_level { - struct btree *b; - struct btree_node_iter iter; - } l[BTREE_MAX_DEPTH]; - - u32 lock_seq[BTREE_MAX_DEPTH]; - - /* - * Current unpacked 
key - so that bch2_btree_iter_next()/ - * bch2_btree_iter_next_slot() can correctly advance pos. - */ - struct bkey k; - - /* - * Circular linked list of linked iterators: linked iterators share - * locks (e.g. two linked iterators may have the same node intent - * locked, or read and write locked, at the same time), and insertions - * through one iterator won't invalidate the other linked iterators. - */ - - /* Must come last: */ - struct btree_iter *next; -}; static inline void btree_iter_set_dirty(struct btree_iter *iter, enum btree_iter_uptodate u) diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 0581f44a..f48084bc 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -98,6 +98,39 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } +static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +{ + switch (type) { + case SIX_LOCK_read: + return BCH_TIME_btree_lock_contended_read; + case SIX_LOCK_intent: + return BCH_TIME_btree_lock_contended_intent; + case SIX_LOCK_write: + return BCH_TIME_btree_lock_contended_write; + default: + BUG(); + } +} + +/* + * wrapper around six locks that just traces lock contended time + */ +static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, + enum six_lock_type type) +{ + u64 start_time = local_clock(); + + six_lock_type(&b->lock, type); + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); +} + +static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, + enum six_lock_type type) +{ + if (!six_trylock_type(&b->lock, type)) + __btree_node_lock_type(c, b, type); +} + bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type); @@ -125,7 +158,17 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, bool bch2_btree_iter_relock(struct btree_iter *); void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); -void bch2_btree_node_lock_write(struct btree *, struct btree_iter *); + +void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); + +static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +{ + EBUG_ON(iter->l[b->level].b != b); + EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); + + if (!six_trylock_write(&b->lock)) + __bch2_btree_node_lock_write(b, iter); +} #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 8854305d..f62c96d9 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -176,6 +176,79 @@ struct btree_cache { struct closure_waitlist alloc_wait; }; +struct btree_node_iter { + u8 is_extents; + + struct btree_node_iter_set { + u16 k, end; + } data[MAX_BSETS]; +}; + +#define BTREE_ITER_SLOTS (1 << 0) +#define BTREE_ITER_INTENT (1 << 1) +#define BTREE_ITER_PREFETCH (1 << 2) +/* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for + * @pos or the first key strictly greater than @pos + */ +#define BTREE_ITER_IS_EXTENTS (1 << 3) +/* + * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: + */ +#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) +#define BTREE_ITER_ERROR (1 << 5) + +enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_PEEK = 1, + BTREE_ITER_NEED_RELOCK = 2, + BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_END = 4, +}; + +/* + * @pos - iterator's current position + * @level - 
current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct bch_fs *c; + struct bpos pos; + + u8 flags; + unsigned uptodate:4; + enum btree_id btree_id:4; + unsigned level:4, + locks_want:4, + nodes_locked:4, + nodes_intent_locked:4; + + struct btree_iter_level { + struct btree *b; + struct btree_node_iter iter; + } l[BTREE_MAX_DEPTH]; + + u32 lock_seq[BTREE_MAX_DEPTH]; + + /* + * Current unpacked key - so that bch2_btree_iter_next()/ + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; + + /* + * Circular linked list of linked iterators: linked iterators share + * locks (e.g. two linked iterators may have the same node intent + * locked, or read and write locked, at the same time), and insertions + * through one iterator won't invalidate the other linked iterators. + */ + + /* Must come last: */ + struct btree_iter *next; +}; + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index adba3092..c3ecc1e9 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -237,7 +237,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, clear_btree_node_noevict(b); - six_lock_write(&b->lock); + btree_node_lock_type(c, b, SIX_LOCK_write); bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -622,7 +622,7 @@ static void btree_update_nodes_reachable(struct closure *cl) * b->will_make_reachable prevented it from being written, so * write it now if it needs to be written: */ - six_lock_read(&b->lock); + btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); six_unlock_read(&b->lock); mutex_lock(&c->btree_interior_update_lock); @@ -647,8 +647,10 @@ static void btree_update_wait_on_journal(struct closure *cl) ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); if (ret < 0) goto err; - if (!ret) + if (!ret) { continue_at(cl, btree_update_wait_on_journal, system_wq); + return; + } bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); err: @@ -679,7 +681,7 @@ retry: if (!six_trylock_read(&b->lock)) { mutex_unlock(&c->btree_interior_update_lock); - six_lock_read(&b->lock); + btree_node_lock_type(c, b, SIX_LOCK_read); six_unlock_read(&b->lock); goto retry; } @@ -720,7 +722,7 @@ retry: if (!six_trylock_read(&b->lock)) { mutex_unlock(&c->btree_interior_update_lock); - six_lock_read(&b->lock); + btree_node_lock_type(c, b, SIX_LOCK_read); six_unlock_read(&b->lock); goto retry; } @@ -1456,7 +1458,7 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_iter_node_replace(iter, n2); bch2_btree_iter_node_replace(iter, n1); - bch2_time_stats_update(&c->btree_split_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time); } static void @@ -1795,8 +1797,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_write(c, n, SIX_LOCK_intent); if (parent) { - bch2_btree_insert_node(as, parent, iter, - &keylist_single(&n->key)); + bch2_keylist_add(&as->parent_keys, &n->key); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys); } else { bch2_btree_set_root(as, n, iter); } diff --git 
a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 3e66d69e..25bfc7ab 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -226,11 +226,30 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i) return (void *) i > write_block(b); } -static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b, - struct bset *i) +static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + struct btree *b, + void *end) { - return round_up(bset_byte_offset(b, vstruct_end(i)), - block_bytes(c)) >> 9; + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s + + b->uncompacted_whiteout_u64s; + ssize_t total = c->opts.btree_node_size << 6; + + return total - used; +} + +static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + struct btree *b) +{ + ssize_t remaining = __bch_btree_u64s_remaining(c, b, + btree_bkey_last(b, bset_tree_last(b))); + + BUG_ON(remaining < 0); + + if (bset_written(b, btree_bset_last(b))) + return 0; + + return remaining; } static inline unsigned btree_write_set_buffer(struct btree *b) @@ -246,20 +265,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { struct bset *i = btree_bset_last(b); - unsigned offset = max_t(unsigned, b->written << 9, - bset_byte_offset(b, vstruct_end(i))); - ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t) - (offset + sizeof(struct btree_node_entry) + - b->whiteout_u64s * sizeof(u64) + - b->uncompacted_whiteout_u64s * sizeof(u64)); + struct btree_node_entry *bne = max(write_block(b), + (void *) btree_bkey_last(b, bset_tree_last(b))); + ssize_t remaining_space = + __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); - EBUG_ON(offset > btree_bytes(c)); - - if ((unlikely(bset_written(b, i)) && - remaining_space > block_bytes(c)) || - (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && - remaining_space > btree_write_set_buffer(b))) - return (void *) b->data + offset; + if (unlikely(bset_written(b, i))) { + if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + return bne; + } else { + if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) + return bne; + } return NULL; } @@ -285,23 +303,6 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, } } -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) -{ - struct bset *i = btree_bset_last(b); - unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) + - b->whiteout_u64s + - b->uncompacted_whiteout_u64s; - unsigned total = c->opts.btree_node_size << 6; - - EBUG_ON(used > total); - - if (bset_written(b, i)) - return 0; - - return total - used; -} - /* * write lock must be held on @b (else the dirty bset that we were going to * insert into could be written out from under us) diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 92fb5f61..cc41140f 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -108,7 +108,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); - six_lock_read(&b->lock); + btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.pin_list == journal_seq_pin(j, seq))); diff --git 
a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 1f944cb8..5dda22c7 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -555,9 +555,9 @@ static void bch2_mark_pointer(struct bch_fs *c, return; } - v = READ_ONCE(g->_mark.counter); + v = atomic64_read(&g->_mark.v); do { - new.counter = old.counter = v; + new.v.counter = old.v.counter = v; saturated = 0; /* @@ -600,9 +600,9 @@ static void bch2_mark_pointer(struct bch_fs *c, g->_mark = new; break; } - } while ((v = cmpxchg(&g->_mark.counter, - old.counter, - new.counter)) != old.counter); + } while ((v = atomic64_cmpxchg(&g->_mark.v, + old.v.counter, + new.v.counter)) != old.v.counter); bch2_dev_usage_update(c, ca, old, new); @@ -957,7 +957,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca) kvpfree(ca->buckets_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); - kvpfree(ca->buckets, sizeof(struct bucket_array) + + kvpfree(rcu_dereference_protected(ca->buckets, 1), + sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); free_percpu(ca->usage_percpu); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 01f0b314..aefe6027 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -16,15 +16,15 @@ #define bucket_cmpxchg(g, new, expr) \ ({ \ - u64 _v = READ_ONCE((g)->_mark.counter); \ + u64 _v = atomic64_read(&(g)->_mark.v); \ struct bucket_mark _old; \ \ do { \ - (new).counter = _old.counter = _v; \ + (new).v.counter = _old.v.counter = _v; \ expr; \ - } while ((_v = cmpxchg(&(g)->_mark.counter, \ - _old.counter, \ - (new).counter)) != _old.counter);\ + } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \ + _old.v.counter, \ + (new).v.counter)) != _old.v.counter);\ _old; \ }) diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 28bd2c59..10f00861 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -6,7 +6,7 @@ struct bucket_mark { union { struct { - u64 counter; + atomic64_t v; }; struct { diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 8403bae6..5593b9a1 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -54,6 +54,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, return ca; } +#if 0 static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) { struct bch_ioctl_assemble arg; @@ -127,14 +128,17 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg return 0; } +#endif static long bch2_global_ioctl(unsigned cmd, void __user *arg) { switch (cmd) { +#if 0 case BCH_IOCTL_ASSEMBLE: return bch2_ioctl_assemble(arg); case BCH_IOCTL_INCREMENTAL: return bch2_ioctl_incremental(arg); +#endif default: return -ENOTTY; } @@ -148,6 +152,7 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c, sizeof(c->sb.user_uuid)); } +#if 0 static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) { if (arg.flags || arg.pad) @@ -161,6 +166,7 @@ static long bch2_ioctl_stop(struct bch_fs *c) bch2_fs_stop(c); return 0; } +#endif static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) { @@ -294,18 +300,19 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, { struct bch_data_ctx *ctx = file->private_data; struct bch_fs *c = ctx->c; - struct bch_ioctl_data_progress p = { - .data_type = ctx->stats.data_type, - .btree_id = ctx->stats.iter.btree_id, - .pos = ctx->stats.iter.pos, - .sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .sectors_total = 
bch2_fs_sectors_used(c, bch2_fs_usage_read(c)), + struct bch_ioctl_data_event e = { + .type = BCH_DATA_EVENT_PROGRESS, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.iter.btree_id, + .p.pos = ctx->stats.iter.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)), }; - if (len != sizeof(p)) + if (len < sizeof(e)) return -EINVAL; - return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p); + return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@ -419,7 +426,7 @@ static long bch2_ioctl_usage(struct bch_fs *c, if (ca->dev_idx >= arg.nr_devices) { percpu_ref_put(&ca->ref); - return -ENOSPC; + return -ERANGE; } if (percpu_ref_tryget(&ca->io_ref)) { @@ -539,10 +546,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) return -EPERM; switch (cmd) { +#if 0 case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); case BCH_IOCTL_STOP: return bch2_ioctl_stop(c); +#endif case BCH_IOCTL_READ_SUPER: BCH_IOCTL(read_super, struct bch_ioctl_read_super); case BCH_IOCTL_DISK_GET_IDX: diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 6d8543eb..28d086bc 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -421,7 +421,7 @@ static struct bch_csum bch2_checksum_merge(unsigned type, BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { - unsigned b = min(b_len, PAGE_SIZE); + unsigned b = min_t(unsigned, b_len, PAGE_SIZE); a.lo = bch2_checksum_update(type, a.lo, page_address(ZERO_PAGE(0)), b); diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 650be8ce..c67376f9 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -42,7 +42,8 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) } struct io_clock_wait { - struct io_timer timer; + struct io_timer io_timer; + struct timer_list cpu_timer; struct task_struct *task; int expired; }; @@ -50,7 +51,16 @@ struct io_clock_wait { static void io_clock_wait_fn(struct io_timer *timer) { struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, timer); + struct io_clock_wait, io_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +static void io_clock_cpu_timeout(struct timer_list *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, cpu_timer); wait->expired = 1; wake_up_process(wait->task); @@ -61,35 +71,38 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) struct io_clock_wait wait; /* XXX: calculate sleep time rigorously */ - wait.timer.expire = until; - wait.timer.fn = io_clock_wait_fn; + wait.io_timer.expire = until; + wait.io_timer.fn = io_clock_wait_fn; wait.task = current; wait.expired = 0; - bch2_io_timer_add(clock, &wait.timer); + bch2_io_timer_add(clock, &wait.io_timer); schedule(); - bch2_io_timer_del(clock, &wait.timer); + bch2_io_timer_del(clock, &wait.io_timer); } -/* - * _only_ to be used from a kthread - */ void bch2_kthread_io_clock_wait(struct io_clock *clock, - unsigned long until) + unsigned long io_until, + unsigned long cpu_timeout) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait; - /* XXX: calculate sleep time rigorously */ - wait.timer.expire = until; - wait.timer.fn = io_clock_wait_fn; + wait.io_timer.expire = io_until; + wait.io_timer.fn = io_clock_wait_fn; wait.task = current; wait.expired = 0; - bch2_io_timer_add(clock, &wait.timer); + bch2_io_timer_add(clock, 
&wait.io_timer); + + timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); + + if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) + mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); while (1) { set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) break; if (wait.expired) @@ -100,7 +113,9 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, } __set_current_state(TASK_RUNNING); - bch2_io_timer_del(clock, &wait.timer); + del_singleshot_timer_sync(&wait.cpu_timer); + destroy_timer_on_stack(&wait.cpu_timer); + bch2_io_timer_del(clock, &wait.io_timer); } static struct io_timer *get_expired_timer(struct io_clock *clock, diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h index af6b2b39..1e2a7dea 100644 --- a/libbcachefs/clock.h +++ b/libbcachefs/clock.h @@ -3,7 +3,8 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); -void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long); +void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, + unsigned long); void bch2_increment_clock(struct bch_fs *, unsigned, int); void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 1af62621..6379905b 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -480,7 +480,7 @@ static const unsigned bch2_compression_opt_to_feature[] = { #undef BCH_FEATURE_NONE -int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) +static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) { int ret = 0; @@ -529,26 +529,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) mempool_exit(&c->compression_bounce[READ]); } -static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kvpmalloc(size, gfp_mask); -} - -void mempool_kvpfree(void *element, void *pool_data) -{ - size_t size = (size_t)pool_data; - kvpfree(element, size); -} - -static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return !mempool_initialized(pool) - ? 
mempool_init(pool, min_nr, mempool_kvpmalloc, - mempool_kvpfree, (void *) size) - : 0; -} - static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t max_extent = c->sb.encoded_extent_max << 9; @@ -611,6 +591,9 @@ have_compressed: if (i->decompress_workspace) decompress_workspace_needed = true; + if (mempool_initialized(&c->compress_workspace[i->type])) + continue; + ret = mempool_init_kvpmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace); diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index c129a33e..cd200cbe 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -16,8 +16,8 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); @@ -162,7 +162,8 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) } } - old_g = c->disk_groups; + old_g = rcu_dereference_protected(c->disk_groups, + lockdep_is_held(&c->sb_lock)); rcu_assign_pointer(c->disk_groups, cpu_g); if (old_g) kfree_rcu(old_g, rcu); @@ -193,6 +194,36 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe } } +bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_NULL: + return false; + case TARGET_DEV: + return dev == t.dev; + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g; + const struct bch_devs_mask *m; + bool ret; + + rcu_read_lock(); + g = rcu_dereference(c->disk_groups); + m = t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + + ret = m ? 
test_bit(dev, m->d) : false; + rcu_read_unlock(); + + return ret; + } + default: + BUG(); + } +} + static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, unsigned parent, const char *name, unsigned namelen) diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index 9da9805a..e92c0dc5 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -53,34 +53,8 @@ static inline struct target target_decode(unsigned target) return (struct target) { .type = TARGET_NULL }; } -static inline bool dev_in_target(struct bch_dev *ca, unsigned target) -{ - struct target t = target_decode(target); - - switch (t.type) { - case TARGET_NULL: - return false; - case TARGET_DEV: - return ca->dev_idx == t.dev; - case TARGET_GROUP: - return ca->mi.group && ca->mi.group - 1 == t.group; - default: - BUG(); - } -} - -static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target) -{ - bool ret; - - rcu_read_lock(); - ret = dev_in_target(rcu_dereference(c->devs[dev]), target); - rcu_read_unlock(); - - return ret; -} - const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); +bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 9efaa1ff..b85af711 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -144,7 +144,7 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group const struct bch_extent_ptr *ptr; extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); if (ca->mi.group && ca->mi.group - 1 == group) @@ -159,13 +159,11 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ { const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - - if (dev_in_target(ca, target) && - (!ptr->cached || !ptr_stale(ca, ptr))) + extent_for_each_ptr(e, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) return ptr; - } return NULL; } @@ -732,7 +730,7 @@ err: bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " "gen %i mark %08x", err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.counter); + mark.gen, (unsigned) mark.v.counter); } void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, @@ -2024,7 +2022,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, int n = bch2_extent_ptr_durability(c, ptr); if (n && n <= extra && - !dev_in_target(c->devs[ptr->dev], target)) { + !bch2_dev_in_target(c, ptr->dev, target)) { ptr->cached = true; extra -= n; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 338e9e01..08ad9647 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -278,24 +278,38 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .uncompressed_size = k->size, .live_size = k->size, }; - case BCH_EXTENT_CRC32: - return (struct bch_extent_crc_unpacked) { + case BCH_EXTENT_CRC32: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc32), - .csum.lo = (__force __le64) crc->crc32.csum, }; - case BCH_EXTENT_CRC64: - return (struct bch_extent_crc_unpacked) { + + *((__le32 *) &ret.csum.lo) = crc->crc32.csum; + + memcpy(&ret.csum.lo, &crc->crc32.csum, + 
sizeof(crc->crc32.csum)); + + return ret; + } + case BCH_EXTENT_CRC64: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc64), .nonce = crc->crc64.nonce, .csum.lo = (__force __le64) crc->crc64.csum_lo, - .csum.hi = (__force __le64) crc->crc64.csum_hi, }; - case BCH_EXTENT_CRC128: - return (struct bch_extent_crc_unpacked) { + + *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; + + return ret; + } + case BCH_EXTENT_CRC128: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc128), .nonce = crc->crc128.nonce, .csum = crc->crc128.csum, }; + + return ret; + } default: BUG(); } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index a2455b42..1d9464af 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -678,7 +678,7 @@ static void bch2_clear_page_bits(struct page *page) if (!PagePrivate(page)) return; - s = xchg(page_state(page), (struct bch_page_state) { .v = 0 }); + s.v = xchg(&page_state(page)->v, 0); ClearPagePrivate(page); if (s.dirty_sectors) @@ -1020,12 +1020,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; - extent_for_each_ptr_crc(e, ptr, crc) - want_full_extent |= !!crc.csum_type | - !!crc.compression_type; + extent_for_each_crc(e, crc, i) + want_full_extent |= ((crc.csum_type != 0) | + (crc.compression_type != 0)); } readpage_bio_extend(readpages_iter, @@ -1850,8 +1850,7 @@ err_wait_io: dio->loop = true; if (!dio->sync) { - continue_at_noreturn(&dio->cl, - bch2_dio_write_loop_async, NULL); + continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); return -EIOCBQUEUED; } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 2991a0dd..c554a987 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -610,9 +610,10 @@ static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) { if (nr >= b->size) { - size_t new_size = max(max(PAGE_SIZE * 8, - b->size * 2), - nr + 1); + size_t new_size = max_t(size_t, max_t(size_t, + PAGE_SIZE * 8, + b->size * 2), + nr + 1); void *n; new_size = roundup_pow_of_two(new_size); @@ -642,7 +643,7 @@ struct pathbuf { static int path_down(struct pathbuf *p, u64 inum) { if (p->nr == p->size) { - size_t new_size = max(256UL, p->size * 2); + size_t new_size = max_t(size_t, 256UL, p->size * 2); void *n = krealloc(p->entries, new_size * sizeof(p->entries[0]), GFP_KERNEL); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index bb656522..3762fb92 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -21,10 +21,10 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "rebalance.h" #include "replicas.h" #include "super.h" #include "super-io.h" -#include "tier.h" #include #include @@ -269,7 +269,7 @@ static void bch2_write_done(struct closure *cl) percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); - bch2_time_stats_update(&c->data_write_time, op->start_time); + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); closure_return(cl); } @@ -842,20 +842,24 @@ again: } while (ret); continue_at(cl, bch2_write_index, index_update_wq(op)); + return; err: op->error = ret; continue_at(cl, !bch2_keylist_empty(&op->insert_keys) ? 
bch2_write_index : bch2_write_done, index_update_wq(op)); + return; flush_io: closure_sync(cl); if (!bch2_keylist_empty(&op->insert_keys)) { __bch2_write_index(op); - if (op->error) + if (op->error) { continue_at_nobarrier(cl, bch2_write_done, NULL); + return; + } } goto again; @@ -901,6 +905,7 @@ void bch2_write(struct closure *cl) if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) bch2_disk_reservation_put(c, &op->res); closure_return(cl); + return; } bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); @@ -974,7 +979,8 @@ static void promote_done(struct closure *cl) container_of(cl, struct promote_op, cl); struct bch_fs *c = op->write.op.c; - bch2_time_stats_update(&c->data_promote_time, op->start_time); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); promote_free(c, op); @@ -1048,7 +1054,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9; bch2_bio_map(&(*rbio)->bio, NULL); - if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO)) + if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO)) goto err; (*rbio)->bounce = true; @@ -1174,7 +1180,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) static void bch2_rbio_done(struct bch_read_bio *rbio) { - bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time); + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); bio_endio(&rbio->bio); } @@ -1486,7 +1493,7 @@ csum_err: } bch2_dev_io_error(ca, - "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", + "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 4cec7bb5..6759810b 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -365,6 +365,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j) ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct journal *, char *); +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 2fd0d646..36ba6a4d 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -324,7 +324,7 @@ struct jset_entry_ops { struct jset_entry *, int); }; -const struct jset_entry_ops bch2_jset_entry_ops[] = { +static const struct jset_entry_ops bch2_jset_entry_ops[] = { #define x(f, nr) \ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ .validate = journal_entry_validate_##f, \ @@ -696,6 +696,7 @@ out: kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); closure_return(cl); + return; err: mutex_lock(&jlist->lock); jlist->ret = ret; @@ -716,19 +717,6 @@ void bch2_journal_entries_free(struct list_head *list) } } -static inline bool journal_has_keys(struct list_head *list) -{ - struct journal_replay *i; - struct jset_entry *entry; - struct bkey_i *k, *_n; - - list_for_each_entry(i, list, list) - for_each_jset_key(k, _n, entry, &i->j) - return true; - - return false; -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; @@ -737,8 +725,9 @@ int bch2_journal_read(struct 
bch_fs *c, struct list_head *list) struct journal_entry_pin_list *p; struct bch_dev *ca; u64 cur_seq, end_seq, seq; - unsigned iter, keys = 0, entries = 0; - size_t nr; + unsigned iter; + size_t entries = 0; + u64 nr, keys = 0; bool degraded = false; int ret = 0; @@ -772,9 +761,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) return BCH_FSCK_REPAIR_IMPOSSIBLE; } - fsck_err_on(c->sb.clean && journal_has_keys(list), c, - "filesystem marked clean but journal has keys to replay"); - list_for_each_entry(i, list, list) { ret = jset_validate_entries(c, &i->j, READ); if (ret) @@ -797,15 +783,27 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) } } + list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; + } + i = list_last_entry(list, struct journal_replay, list); nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; + fsck_err_on(c->sb.clean && (keys || nr > 1), c, + "filesystem marked clean but journal not empty (%llu keys in %llu entries)", + keys, nr); + if (nr > j->pin.size) { free_fifo(&j->pin); init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); return -ENOMEM; } } @@ -844,8 +842,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct journal_replay, list)->j.seq); list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; bool blacklisted; mutex_lock(&j->blacklist_lock); @@ -867,13 +863,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) journal_last_seq(j), end_seq); cur_seq = le64_to_cpu(i->j.seq) + 1; - - for_each_jset_key(k, _n, entry, &i->j) - keys++; entries++; } - bch_info(c, "journal read done, %i keys in %i entries, seq %llu", + bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu", keys, entries, journal_cur_seq(j)); fsck_err: return ret; @@ -1361,6 +1354,7 @@ void bch2_journal_write(struct closure *cl) bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); + return; } /* @@ -1417,6 +1411,7 @@ no_io: ptr->offset += sectors; continue_at(cl, journal_write_done, system_highpri_wq); + return; err: bch2_inconsistent_error(c); continue_at(cl, journal_write_done, system_highpri_wq); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index b5301d96..567289e2 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -247,7 +247,7 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) if (!bl->nr_entries || is_power_of_2(bl->nr_entries)) { n = krealloc(bl->entries, - max(bl->nr_entries * 2, 8UL) * sizeof(*n), + max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n), GFP_KERNEL); if (!n) { ret = -ENOMEM; diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index a8c8883b..3106759e 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -55,9 +55,6 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l) _k != (_keylist)->top; \ _k = bkey_next(_k)) -#define keylist_single(k) \ - ((struct keylist) { .keys = k, .top = bkey_next(k) }) - static inline u64 keylist_sectors(struct keylist *keys) { struct bkey_i *k; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 0431fb81..3e52b7a2 100644 --- a/libbcachefs/move.c +++ 
b/libbcachefs/move.c @@ -306,16 +306,16 @@ static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - if (likely(!io->rbio.bio.bi_status && - !io->rbio.hole)) { - bch2_migrate_read_done(&io->write, &io->rbio); - - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - closure_call(&io->write.op.cl, bch2_write, NULL, cl); - continue_at(cl, move_write_done, NULL); + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + closure_return_with_destructor(cl, move_free); + return; } - closure_return_with_destructor(cl, move_free); + bch2_migrate_read_done(&io->write, &io->rbio); + + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + closure_call(&io->write.op.cl, bch2_write, NULL, cl); + continue_at(cl, move_write_done, NULL); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -411,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c, io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9; bch2_bio_map(&io->write.op.wbio.bio, NULL); - if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) + if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) goto err_free; io->rbio.opts = io_opts; diff --git a/libbcachefs/move.h b/libbcachefs/move.h index bc98f94b..bc87e067 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "buckets.h" #include "io_types.h" +#include "move_types.h" struct bch_read_bio; struct moving_context; @@ -48,16 +49,6 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, enum bkey_type, struct bkey_s_c_extent, struct bch_io_opts *, struct data_opts *); -struct bch_move_stats { - enum bch_data_type data_type; - struct btree_iter iter; - - atomic64_t keys_moved; - atomic64_t sectors_moved; - atomic64_t sectors_seen; - atomic64_t sectors_raced; -}; - int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, struct write_point_specifier, struct bpos, struct bpos, diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h new file mode 100644 index 00000000..832542a8 --- /dev/null +++ b/libbcachefs/move_types.h @@ -0,0 +1,14 @@ +#ifndef _BCACHEFS_MOVE_TYPES_H +#define _BCACHEFS_MOVE_TYPES_H + +struct bch_move_stats { + enum bch_data_type data_type; + struct btree_iter iter; + + atomic64_t keys_moved; + atomic64_t sectors_moved; + atomic64_t sectors_seen; + atomic64_t sectors_raced; +}; + +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 28dabca7..7bef4561 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -241,7 +241,8 @@ static int bch2_copygc_thread(void *arg) ca->mi.bucket_size; if (available > reserve) { next = last + available - reserve; - bch2_kthread_io_clock_wait(clock, next); + bch2_kthread_io_clock_wait(clock, next, + MAX_SCHEDULE_TIMEOUT); continue; } @@ -252,7 +253,8 @@ static int bch2_copygc_thread(void *arg) fragmented = usage.sectors_fragmented; if (fragmented < reserve) { next = last + reserve - fragmented; - bch2_kthread_io_clock_wait(clock, next); + bch2_kthread_io_clock_wait(clock, next, + MAX_SCHEDULE_TIMEOUT); continue; } diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c new file mode 100644 index 00000000..4154b1e9 --- /dev/null +++ b/libbcachefs/rebalance.c @@ -0,0 +1,341 @@ + +#include "bcachefs.h" +#include "alloc.h" +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" +#include "extents.h" +#include "io.h" +#include "move.h" +#include "rebalance.h" 
+#include "super-io.h" + +#include +#include +#include +#include + +static inline bool rebalance_ptr_pred(struct bch_fs *c, + const struct bch_extent_ptr *ptr, + struct bch_extent_crc_unpacked crc, + struct bch_io_opts *io_opts) +{ + if (io_opts->background_target && + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && + !ptr->cached) + return true; + + if (io_opts->background_compression && + crc.compression_type != + bch2_compression_opt_to_type[io_opts->background_compression]) + return true; + + return false; +} + +void bch2_rebalance_add_key(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + struct bkey_s_c_extent e; + + if (!bkey_extent_is_data(k.k)) + return; + + if (!io_opts->background_target && + !io_opts->background_compression) + return; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (atomic64_add_return(crc.compressed_size, + &ca->rebalance_work) == + crc.compressed_size) + rebalance_wakeup(c); + } +} + +void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +{ + if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == + sectors) + rebalance_wakeup(c); +} + +static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return DATA_SKIP; + + extent_for_each_ptr_crc(e, ptr, crc) + if (rebalance_ptr_pred(c, ptr, crc, io_opts)) + goto found; + + return DATA_SKIP; +found: + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; +} + +struct rebalance_work { + int dev_most_full_idx; + unsigned dev_most_full_percent; + u64 dev_most_full_work; + u64 dev_most_full_capacity; + u64 total_work; +}; + +static void rebalance_work_accumulate(struct rebalance_work *w, + u64 dev_work, u64 unknown_dev, u64 capacity, int idx) +{ + unsigned percent_full; + u64 work = dev_work + unknown_dev; + + if (work < dev_work || work < unknown_dev) + work = U64_MAX; + work = min(work, capacity); + + percent_full = div_u64(work * 100, capacity); + + if (percent_full >= w->dev_most_full_percent) { + w->dev_most_full_idx = idx; + w->dev_most_full_percent = percent_full; + w->dev_most_full_work = work; + w->dev_most_full_capacity = capacity; + } + + if (w->total_work + dev_work >= w->total_work && + w->total_work + dev_work >= dev_work) + w->total_work += dev_work; +} + +static struct rebalance_work rebalance_work(struct bch_fs *c) +{ + struct bch_dev *ca; + struct rebalance_work ret = { .dev_most_full_idx = -1 }; + u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); + unsigned i; + + for_each_online_member(ca, c, i) + rebalance_work_accumulate(&ret, + atomic64_read(&ca->rebalance_work), + unknown_dev, + bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket), + i); + + rebalance_work_accumulate(&ret, + unknown_dev, 0, c->capacity, -1); + + return ret; +} + +static void rebalance_work_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) + atomic64_set(&ca->rebalance_work, 0); + + 
atomic64_set(&c->rebalance.work_unknown_dev, 0); +} + +static unsigned long curr_cputime(void) +{ + u64 utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return nsecs_to_jiffies(utime + stime); +} + +static int bch2_rebalance_thread(void *arg) +{ + struct bch_fs *c = arg; + struct bch_fs_rebalance *r = &c->rebalance; + struct io_clock *clock = &c->io_clock[WRITE]; + struct rebalance_work w, p; + unsigned long start, prev_start; + unsigned long prev_run_time, prev_run_cputime; + unsigned long cputime, prev_cputime; + unsigned long io_start; + long throttle; + + set_freezable(); + + io_start = atomic_long_read(&clock->now); + p = rebalance_work(c); + prev_start = jiffies; + prev_cputime = curr_cputime(); + + while (!kthread_wait_freezable(r->enabled)) { + start = jiffies; + cputime = curr_cputime(); + + prev_run_time = start - prev_start; + prev_run_cputime = cputime - prev_cputime; + + w = rebalance_work(c); + BUG_ON(!w.dev_most_full_capacity); + + if (!w.total_work) { + r->state = REBALANCE_WAITING; + kthread_wait_freezable(rebalance_work(c).total_work); + continue; + } + + /* + * If there isn't much work to do, throttle cpu usage: + */ + throttle = prev_run_cputime * 100 / + max(1U, w.dev_most_full_percent) - + prev_run_time; + + if (w.dev_most_full_percent < 20 && throttle > 0) { + r->state = REBALANCE_THROTTLED; + r->throttled_until_iotime = io_start + + div_u64(w.dev_most_full_capacity * + (20 - w.dev_most_full_percent), + 50); + r->throttled_until_cputime = start + throttle; + + bch2_kthread_io_clock_wait(clock, + r->throttled_until_iotime, + throttle); + continue; + } + + /* minimum 1 mb/sec: */ + r->pd.rate.rate = + max_t(u64, 1 << 11, + r->pd.rate.rate * + max(p.dev_most_full_percent, 1U) / + max(w.dev_most_full_percent, 1U)); + + io_start = atomic_long_read(&clock->now); + p = w; + prev_start = start; + prev_cputime = cputime; + + r->state = REBALANCE_RUNNING; + memset(&r->move_stats, 0, sizeof(r->move_stats)); + rebalance_work_reset(c); + + bch2_move_data(c, + /* ratelimiting disabled for now */ + NULL, /* &r->pd.rate, */ + writepoint_ptr(&c->rebalance_write_point), + POS_MIN, POS_MAX, + rebalance_pred, NULL, + &r->move_stats); + } + + return 0; +} + +ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = out + PAGE_SIZE; + struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); + char h1[21], h2[21]; + + bch2_hprint(h1, w.dev_most_full_work << 9); + bch2_hprint(h2, w.dev_most_full_capacity << 9); + out += scnprintf(out, end - out, + "fullest_dev (%i):\t%s/%s\n", + w.dev_most_full_idx, h1, h2); + + bch2_hprint(h1, w.total_work << 9); + bch2_hprint(h2, c->capacity << 9); + out += scnprintf(out, end - out, + "total work:\t\t%s/%s\n", + h1, h2); + + out += scnprintf(out, end - out, + "rate:\t\t\t%u\n", + r->pd.rate.rate); + + switch (r->state) { + case REBALANCE_WAITING: + out += scnprintf(out, end - out, "waiting\n"); + break; + case REBALANCE_THROTTLED: + bch2_hprint(h1, + (r->throttled_until_iotime - + atomic_long_read(&c->io_clock[WRITE].now)) << 9); + out += scnprintf(out, end - out, + "throttled for %lu sec or %s io\n", + (r->throttled_until_cputime - jiffies) / HZ, + h1); + break; + case REBALANCE_RUNNING: + out += scnprintf(out, end - out, "running\n"); + out += scnprintf(out, end - out, "pos %llu:%llu\n", + r->move_stats.iter.pos.inode, + r->move_stats.iter.pos.offset); + break; + } + + return out - buf; +} + +void bch2_rebalance_stop(struct bch_fs *c) +{ + struct task_struct *p; + + 
c->rebalance.pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->rebalance.pd.rate); + + p = rcu_dereference_protected(c->rebalance.thread, 1); + c->rebalance.thread = NULL; + + if (p) { + /* for synchronizing with rebalance_wakeup() */ + synchronize_rcu(); + + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_rebalance_start(struct bch_fs *c) +{ + struct task_struct *p; + + if (c->opts.nochanges) + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + rcu_assign_pointer(c->rebalance.thread, p); + wake_up_process(p); + return 0; +} + +void bch2_fs_rebalance_init(struct bch_fs *c) +{ + bch2_pd_controller_init(&c->rebalance.pd); + + atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); +} diff --git a/libbcachefs/tier.h b/libbcachefs/rebalance.h similarity index 65% rename from libbcachefs/tier.h rename to libbcachefs/rebalance.h index 0c66dfea..2e6aa677 100644 --- a/libbcachefs/tier.h +++ b/libbcachefs/rebalance.h @@ -1,12 +1,14 @@ -#ifndef _BCACHEFS_TIER_H -#define _BCACHEFS_TIER_H +#ifndef _BCACHEFS_REBALANCE_H +#define _BCACHEFS_REBALANCE_H + +#include "rebalance_types.h" static inline void rebalance_wakeup(struct bch_fs *c) { struct task_struct *p; rcu_read_lock(); - p = rcu_dereference(c->rebalance_thread); + p = rcu_dereference(c->rebalance.thread); if (p) wake_up_process(p); rcu_read_unlock(); @@ -16,8 +18,10 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, struct bch_io_opts *); void bch2_rebalance_add_work(struct bch_fs *, u64); +ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); + void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); void bch2_fs_rebalance_init(struct bch_fs *); -#endif /* _BCACHEFS_TIER_H */ +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h new file mode 100644 index 00000000..aaf5b9ca --- /dev/null +++ b/libbcachefs/rebalance_types.h @@ -0,0 +1,26 @@ +#ifndef _BCACHEFS_REBALANCE_TYPES_H +#define _BCACHEFS_REBALANCE_TYPES_H + +#include "move_types.h" + +enum rebalance_state { + REBALANCE_WAITING, + REBALANCE_THROTTLED, + REBALANCE_RUNNING, +}; + +struct bch_fs_rebalance { + struct task_struct __rcu *thread; + struct bch_pd_controller pd; + + atomic64_t work_unknown_dev; + + enum rebalance_state state; + unsigned long throttled_until_iotime; + unsigned long throttled_until_cputime; + struct bch_move_stats move_stats; + + unsigned enabled:1; +}; + +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/libbcachefs/six.c b/libbcachefs/six.c index f0ff8d41..afa59a47 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -146,6 +146,8 @@ struct six_lock_waiter { /* This is probably up there with the more evil things I've done */ #define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) +#ifdef CONFIG_LOCK_SPIN_ON_OWNER + static inline int six_can_spin_on_owner(struct six_lock *lock) { struct task_struct *owner; @@ -257,6 +259,15 @@ fail: return false; } +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + return false; +} + +#endif + noinline static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type) { diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index a2b981a3..9772d597 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -624,7 +624,7 @@ static void write_one_super(struct bch_fs *c, struct 
bch_dev *ca, unsigned idx) bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); bio->bi_iter.bi_size = - roundup(vstruct_bytes(sb), + roundup((size_t) vstruct_bytes(sb), bdev_logical_block_size(ca->disk_sb.bdev)); bio->bi_end_io = write_super_endio; bio->bi_private = ca; diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index f407c205..995b1c90 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -73,11 +73,6 @@ static inline __u64 jset_magic(struct bch_fs *c) return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); } -static inline __u64 pset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC); -} - static inline __u64 bset_magic(struct bch_fs *c) { return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); @@ -136,4 +131,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } +size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *, + struct bch_sb_field *); + #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 16b8cbfc..55da242c 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -33,11 +33,11 @@ #include "migrate.h" #include "movinggc.h" #include "quota.h" +#include "rebalance.h" #include "replicas.h" #include "super.h" #include "super-io.h" #include "sysfs.h" -#include "tier.h" #include #include @@ -398,10 +398,10 @@ err: static void bch2_fs_free(struct bch_fs *c) { -#define BCH_TIME_STAT(name) \ - bch2_time_stats_exit(&c->name##_time); - BCH_TIME_STATS() -#undef BCH_TIME_STAT + unsigned i; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); @@ -565,10 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); -#define BCH_TIME_STAT(name) \ - bch2_time_stats_init(&c->name##_time); - BCH_TIME_STATS() -#undef BCH_TIME_STAT + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); bch2_fs_allocator_init(c); bch2_fs_rebalance_init(c); @@ -592,14 +590,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->gc_pos_lock); c->copy_gc_enabled = 1; - c->rebalance_enabled = 1; - c->rebalance_percent = 10; + c->rebalance.enabled = 1; c->promote_whole_extents = true; - c->journal.write_time = &c->journal_write_time; - c->journal.delay_time = &c->journal_delay_time; - c->journal.blocked_time = &c->journal_blocked_time; - c->journal.flush_seq_time = &c->journal_flush_seq_time; + c->journal.write_time = &c->times[BCH_TIME_journal_write]; + c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; + c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); @@ -647,7 +644,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || lg_lock_init(&c->usage_lock) || - mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 65345d80..5e341a71 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -24,9 +24,9 @@ #include "keylist.h" #include "move.h" #include 
"opts.h" +#include "rebalance.h" #include "replicas.h" #include "super-io.h" -#include "tier.h" #include #include @@ -183,8 +183,8 @@ rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); rw_attribute(rebalance_enabled); -rw_attribute(rebalance_percent); sysfs_pd_controller_attribute(rebalance); +read_attribute(rebalance_work); rw_attribute(promote_whole_extents); rw_attribute(pd_controllers_update_seconds); @@ -198,11 +198,11 @@ read_attribute(data_replicas_have); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM -#define BCH_TIME_STAT(_name) \ +#define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ { .name = #_name, .mode = S_IRUGO }; BCH_TIME_STATS() -#undef BCH_TIME_STAT +#undef x static struct attribute sysfs_state_rw = { .name = "state", @@ -340,9 +340,11 @@ SHOW(bch2_fs) sysfs_print(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled); - sysfs_print(rebalance_percent, c->rebalance_percent); - sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */ + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + + if (attr == &sysfs_rebalance_work) + return bch2_rebalance_work_show(c, buf); sysfs_print(promote_whole_extents, c->promote_whole_extents); @@ -404,7 +406,7 @@ STORE(__bch2_fs) } if (attr == &sysfs_rebalance_enabled) { - ssize_t ret = strtoul_safe(buf, c->rebalance_enabled) + ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ?: (ssize_t) size; rebalance_wakeup(c); @@ -413,9 +415,7 @@ STORE(__bch2_fs) sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - - sysfs_strtoul(rebalance_percent, c->rebalance_percent); - sysfs_pd_controller_store(rebalance, &c->rebalance_pd); + sysfs_pd_controller_store(rebalance, &c->rebalance.pd); sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); @@ -474,7 +474,6 @@ struct attribute *bch2_fs_files[] = { &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, - &sysfs_rebalance_percent, &sysfs_promote_whole_extents, &sysfs_compression_stats, @@ -513,8 +512,11 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_prune_cache, &sysfs_copy_gc_enabled, + &sysfs_rebalance_enabled, + &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), + &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -613,11 +615,12 @@ SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -#define BCH_TIME_STAT(name) \ +#define x(name) \ if (attr == &sysfs_time_stat_##name) \ - return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE); + return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ + buf, PAGE_SIZE); BCH_TIME_STATS() -#undef BCH_TIME_STAT +#undef x return 0; } @@ -629,10 +632,10 @@ STORE(bch2_fs_time_stats) SYSFS_OPS(bch2_fs_time_stats); struct attribute *bch2_fs_time_stats_files[] = { -#define BCH_TIME_STAT(name) \ +#define x(name) \ &sysfs_time_stat_##name, BCH_TIME_STATS() -#undef BCH_TIME_STAT +#undef x NULL }; diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c deleted file mode 100644 index a15a0fa9..00000000 --- a/libbcachefs/tier.c +++ /dev/null @@ -1,259 +0,0 @@ - -#include "bcachefs.h" -#include "alloc.h" -#include "btree_iter.h" -#include "buckets.h" -#include "clock.h" -#include "disk_groups.h" -#include "extents.h" -#include "io.h" -#include "move.h" -#include "super-io.h" -#include "tier.h" - -#include -#include -#include -#include - -static 
inline bool rebalance_ptr_pred(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, - struct bch_io_opts *io_opts) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - - if (io_opts->background_target && - !dev_in_target(ca, io_opts->background_target) && - !ptr->cached) - return true; - - if (io_opts->background_compression && - crc.compression_type != - bch2_compression_opt_to_type[io_opts->background_compression]) - return true; - - return false; -} - -void bch2_rebalance_add_key(struct bch_fs *c, - struct bkey_s_c k, - struct bch_io_opts *io_opts) -{ - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - struct bkey_s_c_extent e; - - if (!bkey_extent_is_data(k.k)) - return; - - if (!io_opts->background_target && - !io_opts->background_compression) - return; - - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - - if (!atomic64_add_return(crc.compressed_size, - &ca->rebalance_work)) - rebalance_wakeup(c); - } -} - -void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -{ - if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev)) - rebalance_wakeup(c); -} - -static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) -{ - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return DATA_SKIP; - - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) - goto found; - - return DATA_SKIP; -found: - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; -} - -struct rebalance_work { - unsigned dev_most_full_percent; - u64 dev_most_full_work; - u64 dev_most_full_capacity; - u64 total_work; -}; - -static struct rebalance_work rebalance_work(struct bch_fs *c) -{ - struct bch_dev *ca; - struct rebalance_work ret = { 0 }; - unsigned i; - - for_each_online_member(ca, c, i) { - u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); - u64 work = atomic64_read(&ca->rebalance_work) + - atomic64_read(&c->rebalance_work_unknown_dev); - unsigned percent_full = div_u64(work * 100, capacity); - - if (percent_full > ret.dev_most_full_percent) { - ret.dev_most_full_percent = percent_full; - ret.dev_most_full_work = work; - ret.dev_most_full_capacity = capacity; - } - - ret.total_work += atomic64_read(&ca->rebalance_work); - } - - ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev); - - return ret; -} - -static void rebalance_work_reset(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) - atomic64_set(&ca->rebalance_work, 0); - - atomic64_set(&c->rebalance_work_unknown_dev, 0); -} - -static unsigned long curr_cputime(void) -{ - u64 utime, stime; - - task_cputime_adjusted(current, &utime, &stime); - return nsecs_to_jiffies(utime + stime); -} - -static int bch2_rebalance_thread(void *arg) -{ - struct bch_fs *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - struct rebalance_work w, p; - unsigned long start, prev_start; - unsigned long prev_run_time, prev_run_cputime; - unsigned long cputime, prev_cputime; - - set_freezable(); - - p = rebalance_work(c); - 
prev_start = jiffies; - prev_cputime = curr_cputime(); - - while (!kthread_wait_freezable(c->rebalance_enabled)) { - struct bch_move_stats move_stats = { 0 }; - - w = rebalance_work(c); - start = jiffies; - cputime = curr_cputime(); - - prev_run_time = start - prev_start; - prev_run_cputime = cputime - prev_cputime; - - if (!w.total_work) { - kthread_wait_freezable(rebalance_work(c).total_work); - continue; - } - - if (w.dev_most_full_percent < 20 && - prev_run_cputime * 5 > prev_run_time) { - if (w.dev_most_full_capacity) { - bch2_kthread_io_clock_wait(clock, - atomic_long_read(&clock->now) + - div_u64(w.dev_most_full_capacity, 5)); - } else { - - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - - schedule_timeout(prev_run_cputime * 5 - - prev_run_time); - continue; - } - } - - /* minimum 1 mb/sec: */ - c->rebalance_pd.rate.rate = - max_t(u64, 1 << 11, - c->rebalance_pd.rate.rate * - max(p.dev_most_full_percent, 1U) / - max(w.dev_most_full_percent, 1U)); - - rebalance_work_reset(c); - - bch2_move_data(c, &c->rebalance_pd.rate, - writepoint_ptr(&c->rebalance_write_point), - POS_MIN, POS_MAX, - rebalance_pred, NULL, - &move_stats); - } - - return 0; -} - -void bch2_rebalance_stop(struct bch_fs *c) -{ - struct task_struct *p; - - c->rebalance_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&c->rebalance_pd.rate); - - p = c->rebalance_thread; - c->rebalance_thread = NULL; - - if (p) { - /* for sychronizing with rebalance_wakeup() */ - synchronize_rcu(); - - kthread_stop(p); - put_task_struct(p); - } -} - -int bch2_rebalance_start(struct bch_fs *c) -{ - struct task_struct *p; - - if (c->opts.nochanges) - return 0; - - p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); - if (IS_ERR(p)) - return PTR_ERR(p); - - get_task_struct(p); - - rcu_assign_pointer(c->rebalance_thread, p); - wake_up_process(c->rebalance_thread); - return 0; -} - -void bch2_fs_rebalance_init(struct bch_fs *c) -{ - bch2_pd_controller_init(&c->rebalance_pd); - - atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX); -} diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 1f2c23b9..60e1f1ff 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -203,7 +203,7 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_quantiles_update(struct quantiles *q, u64 v) +static void bch2_quantiles_update(struct quantiles *q, u64 v) { unsigned i = 0; @@ -569,6 +569,23 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } +int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + size_t bch2_rand_range(size_t max) { size_t rand; @@ -771,20 +788,28 @@ void sort_cmp_size(void *base, size_t num, size_t size, } } -void mempool_free_vp(void *element, void *pool_data) +static void mempool_free_vp(void *element, void *pool_data) { size_t size = (size_t) pool_data; vpfree(element, size); } -void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) +static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t) pool_data; return vpmalloc(size, gfp_mask); } +int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return size < PAGE_SIZE + ? 
mempool_init_kmalloc_pool(pool, min_nr, size) + : mempool_init(pool, min_nr, mempool_alloc_vp, + mempool_free_vp, (void *) size); +} + #if 0 void eytzinger1_test(void) { diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 7c7264f4..18491559 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -68,9 +68,9 @@ struct closure; #define __flatten #endif -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define CPU_BIG_ENDIAN 0 -#else +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define CPU_BIG_ENDIAN 1 #endif @@ -113,14 +113,7 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) : vpmalloc(size, gfp_mask); } -void mempool_free_vp(void *element, void *pool_data); -void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data); - -static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size) -{ - return mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); #define HEAP(type) \ struct { \ @@ -610,6 +603,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch2_bio_map(struct bio *bio, void *base); +int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); static inline sector_t bdev_sectors(struct block_device *bdev) { diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 79a98f75..c89c7200 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -5,8 +5,8 @@ #include "compress.h" #include "extents.h" #include "fs.h" +#include "rebalance.h" #include "str_hash.h" -#include "tier.h" #include "xattr.h" #include diff --git a/linux/sched.c b/linux/sched.c index 2d61c480..de6eb142 100644 --- a/linux/sched.c +++ b/linux/sched.c @@ -40,14 +40,22 @@ void schedule(void) v, NULL, NULL, 0); } -static void process_timeout(unsigned long __data) +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) { - wake_up_process((struct task_struct *)__data); + struct process_timer *timeout = + container_of(t, struct process_timer, timer); + + wake_up_process(timeout->task); } long schedule_timeout(long timeout) { - struct timer_list timer; + struct process_timer timer; unsigned long expire; switch (timeout) @@ -80,10 +88,11 @@ long schedule_timeout(long timeout) expire = timeout + jiffies; - setup_timer(&timer, process_timeout, (unsigned long)current); - mod_timer(&timer, expire); + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + mod_timer(&timer.timer, expire); schedule(); - del_timer_sync(&timer); + del_timer_sync(&timer.timer); timeout = expire - jiffies; out: diff --git a/linux/timer.c b/linux/timer.c index b67a54ac..dd5aba18 100644 --- a/linux/timer.c +++ b/linux/timer.c @@ -273,7 +273,7 @@ static int timer_thread(void *arg) BUG_ON(!timer_running()); pthread_mutex_unlock(&timer_lock); - timer->function(timer->data); + timer->function(timer); pthread_mutex_lock(&timer_lock); timer_seq++; diff --git a/linux/workqueue.c b/linux/workqueue.c index f5942772..4dfd6cd9 100644 --- a/linux/workqueue.c +++ b/linux/workqueue.c @@ -55,9 +55,10 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work) return ret; } -void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(struct timer_list *timer) { - struct delayed_work *dwork = (struct delayed_work *) __data; + struct delayed_work *dwork = + container_of(timer, struct delayed_work, timer); pthread_mutex_lock(&wq_lock); __queue_work(dwork->wq, 
&dwork->work); @@ -71,8 +72,7 @@ static void __queue_delayed_work(struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - BUG_ON(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + BUG_ON(timer->function != delayed_work_timer_fn); BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry));
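
Note on the clock.c hunks above: bch2_kthread_io_clock_wait() now arms two timers at once, the io_timer on the filesystem's IO clock and an ordinary timer_list on the CPU clock, and the wait ends when either fires (or, for kthreads, when kthread_should_stop() becomes true; the PF_KTHREAD check lets non-kthread callers use it as well). Callers that only care about the IO-clock deadline pass MAX_SCHEDULE_TIMEOUT as the CPU timeout, as the movinggc hunks do; the rebalance thread is the one caller that uses both. A condensed sketch of that call, with field names as in rebalance_types.h:

	/* Wake when the write clock reaches the iotime deadline, or
	 * after `throttle' jiffies of wall-clock time, whichever
	 * comes first: */
	bch2_kthread_io_clock_wait(&c->io_clock[WRITE],
				   r->throttled_until_iotime,
				   throttle);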
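The new rebalance_work_accumulate() in rebalance.c does its u64 sums with explicit overflow guards: if a sum comes out smaller than either operand it must have wrapped, so it saturates to U64_MAX and is then clamped to the device capacity before percent_full is computed; the total_work update uses the same test. The idiom in isolation (a minimal sketch, not itself part of the patch):

	static u64 saturating_add_u64(u64 a, u64 b)
	{
		u64 sum = a + b;

		/* unsigned addition wrapped iff the result is smaller
		 * than an operand: */
		return sum < a ? U64_MAX : sum;
	}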
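The duplicated mempool helpers from compress.c and util.c are merged into a single mempool_init_kvpmalloc_pool() in util.c, matching kvpmalloc(): element sizes under PAGE_SIZE get a plain kmalloc-backed pool, anything larger falls back to the vpmalloc-backed one. A hypothetical usage sketch mirroring the btree_bounce_pool call in super.c (the local pool variable here is illustrative, not in the patch):

	mempool_t pool;
	void *buf;

	if (mempool_init_kvpmalloc_pool(&pool, 1, btree_bytes(c)))
		return -ENOMEM;

	buf = mempool_alloc(&pool, GFP_NOIO);	/* may wait for the preallocated element */
	mempool_free(buf, &pool);
	mempool_exit(&pool);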
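Finally, the timer shims now follow the newer kernel API (timer_setup()): a callback receives the timer_list itself and recovers its enclosing object with container_of(), instead of being handed an unsigned long data value, which is exactly what the process_timer and delayed_work_timer_fn conversions above do. The pattern in miniature (struct and function names here are illustrative only):

	struct ping_ctx {
		struct timer_list timer;
		struct task_struct *task;
	};

	static void ping_timer_fn(struct timer_list *t)
	{
		struct ping_ctx *ctx = container_of(t, struct ping_ctx, timer);

		wake_up_process(ctx->task);
	}

	static void ping_sleep(struct ping_ctx *ctx, unsigned long delay)
	{
		ctx->task = current;
		timer_setup_on_stack(&ctx->timer, ping_timer_fn, 0);

		/* set the task state before arming the timer so the
		 * wakeup can't be lost: */
		set_current_state(TASK_UNINTERRUPTIBLE);
		mod_timer(&ctx->timer, jiffies + delay);
		schedule();

		del_timer_sync(&ctx->timer);
		destroy_timer_on_stack(&ctx->timer);
	}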