From 4de98a2712764bceb9e0f67b1ac2f2c7862feb77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jan 2018 06:41:59 -0500 Subject: [PATCH] Update bcachefs sources to 02ae70070a bcachefs: Allocate new btree roots lazily --- .bcachefs_revision | 2 +- cmd_debug.c | 16 +- cmd_device.c | 8 +- cmd_format.c | 8 +- cmd_fsck.c | 8 +- cmd_key.c | 26 +- cmd_migrate.c | 18 +- include/linux/generic-radix-tree.h | 10 +- libbcachefs.c | 5 + libbcachefs/alloc.c | 409 ++++++++++----- libbcachefs/alloc.h | 1 + libbcachefs/bcachefs.h | 26 +- libbcachefs/bcachefs_format.h | 64 ++- libbcachefs/bkey.h | 9 +- libbcachefs/bkey_methods.c | 2 + libbcachefs/bset.c | 3 - libbcachefs/btree_cache.h | 4 +- libbcachefs/btree_gc.c | 28 +- libbcachefs/btree_io.c | 110 ++-- libbcachefs/btree_io.h | 8 + libbcachefs/btree_iter.c | 25 +- libbcachefs/btree_iter.h | 4 +- libbcachefs/btree_locking.h | 3 + libbcachefs/btree_types.h | 2 + libbcachefs/btree_update_interior.c | 65 +-- libbcachefs/btree_update_interior.h | 5 +- libbcachefs/buckets.c | 48 +- libbcachefs/buckets.h | 8 +- libbcachefs/buckets_types.h | 3 +- libbcachefs/chardev.c | 14 +- libbcachefs/debug.c | 2 +- libbcachefs/error.h | 3 - libbcachefs/extents.c | 4 +- libbcachefs/extents.h | 11 + libbcachefs/fifo.h | 1 + libbcachefs/fs-io.c | 298 +++++------ libbcachefs/fs-ioctl.c | 28 + libbcachefs/fs.c | 236 ++++++--- libbcachefs/fs.h | 2 + libbcachefs/fsck.c | 46 +- libbcachefs/io.c | 3 +- libbcachefs/io_types.h | 6 +- libbcachefs/journal.c | 321 ++++++------ libbcachefs/journal.h | 4 +- libbcachefs/journal_types.h | 6 - libbcachefs/migrate.c | 197 ++----- libbcachefs/move.c | 81 ++- libbcachefs/move.h | 12 +- libbcachefs/movinggc.c | 7 +- libbcachefs/opts.c | 32 +- libbcachefs/opts.h | 11 +- libbcachefs/quota.c | 786 ++++++++++++++++++++++++++++ libbcachefs/quota.h | 48 ++ libbcachefs/quota_types.h | 36 ++ libbcachefs/super-io.c | 219 ++++---- libbcachefs/super-io.h | 15 +- libbcachefs/super.c | 218 ++++---- libbcachefs/super.h | 3 +- libbcachefs/tier.c | 6 +- linux/kthread.c | 2 + 60 files changed, 2353 insertions(+), 1233 deletions(-) create mode 100644 libbcachefs/quota.c create mode 100644 libbcachefs/quota.h create mode 100644 libbcachefs/quota_types.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 699d6f22..92bf9ad4 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -0b8c5d0fb7b5de6fb99030565cd2d0411da37f2b +02ae70070acc3bc4740d221efa5ff5425cf6fce5 diff --git a/cmd_debug.c b/cmd_debug.c index 1a2c1dbd..6e395bab 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -80,9 +80,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) int cmd_dump(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; struct bch_dev *ca; - const char *err; char *out = NULL; unsigned i, nr_devices = 0; bool force = false; @@ -112,9 +110,9 @@ int cmd_dump(int argc, char *argv[]) if (!out) die("Please supply output filename"); - err = bch2_fs_open(argv + optind, argc - optind, opts, &c); - if (err) - die("error opening %s: %s", argv[optind], err); + struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c))); down_read(&c->gc_lock); @@ -258,10 +256,8 @@ static const char * const list_modes[] = { int cmd_list(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; enum btree_id btree_id = BTREE_ID_EXTENTS; struct bpos start = POS_MIN, end = POS_MAX; - const char *err; 
u64 inum; int mode = 0, opt; @@ -307,9 +303,9 @@ int cmd_list(int argc, char *argv[]) if (optind >= argc) die("Please supply device(s) to check"); - err = bch2_fs_open(argv + optind, argc - optind, opts, &c); - if (err) - die("error opening %s: %s", argv[optind], err); + struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c))); switch (mode) { case 0: diff --git a/cmd_device.c b/cmd_device.c index 22ab016f..390c48ad 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -528,11 +528,9 @@ int cmd_device_resize(int argc, char *argv[]) } else { printf("Doing offline resize of %s\n", dev); - struct bch_fs *c = NULL; - struct bch_opts opts = bch2_opts_empty(); - const char *err = bch2_fs_open(&dev, 1, opts, &c); - if (err) - die("error opening %s: %s", dev, err); + struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty()); + if (IS_ERR(c)) + die("error opening %s: %s", dev, strerror(-PTR_ERR(c))); struct bch_dev *ca, *resize = NULL; unsigned i; diff --git a/cmd_format.c b/cmd_format.c index 47617660..42e8d1a6 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -328,11 +328,11 @@ int cmd_show_super(int argc, char *argv[]) if (argc) die("too many arguments"); - const char *err; + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; - err = bch2_read_super(dev, bch2_opts_empty(), &sb); - if (err) - die("Error opening %s: %s", dev, err); + int ret = bch2_read_super(dev, &opts, &sb); + if (ret) + die("Error opening %s: %s", dev, strerror(-ret)); bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE); bch2_free_super(&sb); diff --git a/cmd_fsck.c b/cmd_fsck.c index 556a4e1b..6f873b1f 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -23,8 +23,6 @@ static void usage(void) int cmd_fsck(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; - const char *err; int opt; opt_set(opts, degraded, true); @@ -56,9 +54,9 @@ int cmd_fsck(int argc, char *argv[]) if (optind >= argc) die("Please supply device(s) to check"); - err = bch2_fs_open(argv + optind, argc - optind, opts, &c); - if (err) - die("error opening %s: %s", argv[optind], err); + struct bch_fs *c = bch2_fs_open(argv + optind, argc - optind, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[optind], strerror(-PTR_ERR(c))); bch2_fs_stop(c); return 0; diff --git a/cmd_key.c b/cmd_key.c index 879163f1..e670b508 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -9,16 +9,16 @@ int cmd_unlock(int argc, char *argv[]) { + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; - const char *err; char *passphrase; if (argc != 2) die("Please supply a single device"); - err = bch2_read_super(argv[1], bch2_opts_empty(), &sb); - if (err) - die("Error opening %s: %s", argv[1], err); + int ret = bch2_read_super(argv[1], &opts, &sb); + if (ret) + die("Error opening %s: %s", argv[1], strerror(-ret)); passphrase = read_passphrase("Enter passphrase: "); @@ -32,16 +32,15 @@ int cmd_unlock(int argc, char *argv[]) int cmd_set_passphrase(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; - const char *err; + struct bch_fs *c; if (argc < 2) die("Please supply one or more devices"); opt_set(opts, nostart, true); - err = bch2_fs_open(argv + 1, argc - 1, opts, &c); - if (err) - die("Error opening %s: %s", argv[1], err); + c = bch2_fs_open(argv + 1, argc - 1, opts); + if (IS_ERR(c)) + die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); struct bch_sb_field_crypt *crypt = 
bch2_sb_get_crypt(c->disk_sb); if (!crypt) @@ -70,16 +69,15 @@ int cmd_set_passphrase(int argc, char *argv[]) int cmd_remove_passphrase(int argc, char *argv[]) { struct bch_opts opts = bch2_opts_empty(); - struct bch_fs *c = NULL; - const char *err; + struct bch_fs *c; if (argc < 2) die("Please supply one or more devices"); opt_set(opts, nostart, true); - err = bch2_fs_open(argv + 1, argc - 1, opts, &c); - if (err) - die("Error opening %s: %s", argv[1], err); + c = bch2_fs_open(argv + 1, argc - 1, opts); + if (IS_ERR(c)) + die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb); if (!crypt) diff --git a/cmd_migrate.c b/cmd_migrate.c index d82fee6d..1c449554 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -334,7 +334,8 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_check_mark_super(c, extent_i_to_s_c(e), false); + bch2_check_mark_super(c, BCH_DATA_USER, + bch2_bkey_devs(extent_i_to_s_c(e).s_c)); ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i, &res, NULL, NULL, 0); @@ -734,19 +735,18 @@ int cmd_migrate(int argc, char *argv[]) struct bch_opts opts = bch2_opts_empty(); struct bch_fs *c = NULL; char *path[1] = { dev.path }; - const char *err; opt_set(opts, sb, sb_offset); opt_set(opts, nostart, true); opt_set(opts, noexcl, true); - err = bch2_fs_open(path, 1, opts, &c); - if (err) - die("Error opening new filesystem: %s", err); + c = bch2_fs_open(path, 1, opts); + if (IS_ERR(c)) + die("Error opening new filesystem: %s", strerror(-PTR_ERR(c))); mark_unreserved_space(c, extents); - err = bch2_fs_start(c); + const char *err = bch2_fs_start(c); if (err) die("Error starting new filesystem: %s", err); @@ -758,9 +758,9 @@ int cmd_migrate(int argc, char *argv[]) opt_set(opts, nostart, false); opt_set(opts, nochanges, true); - err = bch2_fs_open(path, 1, opts, &c); - if (err) - die("Error opening new filesystem: %s", err); + c = bch2_fs_open(path, 1, opts); + if (IS_ERR(c)) + die("Error opening new filesystem: %s", strerror(-PTR_ERR(c))); bch2_fs_stop(c); printf("fsck complete\n"); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 6ea2deb2..7f637e17 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -99,11 +99,11 @@ struct genradix_iter { size_t pos; }; -static inline void genradix_iter_init(struct genradix_iter *iter) -{ - iter->offset = 0; - iter->pos = 0; -} +#define genradix_iter_init(_radix, _idx) \ + ((struct genradix_iter) { \ + .pos = (_idx), \ + .offset = __genradix_idx_to_offset((_radix), (_idx)),\ + }) void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); diff --git a/libbcachefs.c b/libbcachefs.c index 1481ef38..3632e30d 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -454,6 +454,11 @@ static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f, } } +static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f, + enum units units) +{ +} + typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units); struct bch_sb_field_ops { diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index ec02adc0..f7ff8027 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -55,6 +55,8 @@ #include "bcachefs.h" #include "alloc.h" +#include "btree_cache.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_gc.h" #include "buckets.h" @@ -290,9 +292,6 
@@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) unsigned i; int ret; - if (!c->btree_roots[BTREE_ID_ALLOC].b) - return 0; - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { bch2_alloc_read_key(c, k); bch2_btree_iter_cond_resched(&iter); @@ -401,7 +400,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) return ret; } -static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq) +static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca) { struct btree_iter iter; unsigned long bucket; @@ -412,7 +411,7 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_s down_read(&ca->bucket_lock); for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, &iter, journal_seq); + ret = __bch2_alloc_write_key(c, ca, bucket, &iter, NULL); if (ret) break; @@ -537,7 +536,8 @@ static void bch2_prio_timer_init(struct bch_fs *c, int rw) static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c)) { + if (expensive_debug_checks(c) && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { size_t iter; long i; unsigned j; @@ -692,7 +692,7 @@ static inline int bucket_alloc_cmp(alloc_heap *h, return (l.key > r.key) - (l.key < r.key); } -static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e; @@ -740,7 +740,7 @@ static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) bch2_invalidate_one_bucket(c, ca, e.bucket); } -static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; @@ -762,7 +762,7 @@ static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) } } -static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; @@ -782,21 +782,21 @@ static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) } } -static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(c, ca); - break; - case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(c, ca); - break; - case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(c, ca); - break; + case CACHE_REPLACEMENT_LRU: + find_reclaimable_buckets_lru(c, ca); + break; + case CACHE_REPLACEMENT_FIFO: + find_reclaimable_buckets_fifo(c, ca); + break; + case CACHE_REPLACEMENT_RANDOM: + find_reclaimable_buckets_random(c, ca); + break; } } @@ -807,79 +807,119 @@ static int size_t_cmp(const void *_l, const void *_r) return (*l > *r) - (*l < *r); } +static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) +{ + BUG_ON(ca->free_inc.front); + + spin_lock(&c->freelist_lock); + sort(ca->free_inc.data, + ca->free_inc.back, + sizeof(ca->free_inc.data[0]), + size_t_cmp, NULL); + spin_unlock(&c->freelist_lock); +} + static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq) + u64 
*journal_seq, size_t nr) { struct btree_iter iter; - unsigned nr_invalidated = 0; - size_t b, i; int ret = 0; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_INTENT); - fifo_for_each_entry(b, &ca->free_inc, i) { + /* + * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the + * btree update or journal_res_get + */ + while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { + size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated); + ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq); if (ret) break; - nr_invalidated++; + ca->nr_invalidated++; } bch2_btree_iter_unlock(&iter); - return nr_invalidated ?: ret; + return ret; } -/* - * Given an invalidated, ready to use bucket: issue a discard to it if enabled, - * then add it to the freelist, waiting until there's room if necessary: - */ -static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, - long bucket) +static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); + unsigned i; + + /* + * Don't remove from free_inc until after it's added to + * freelist, so gc can find it: + */ + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + --ca->nr_invalidated; + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + return true; + } + spin_unlock(&c->freelist_lock); + + return false; +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +{ + int ret = 0; while (1) { - bool pushed = false; - unsigned i; - set_current_state(TASK_INTERRUPTIBLE); - /* - * Don't remove from free_inc until after it's added to - * freelist, so gc can find it: - */ - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - closure_wake_up(&c->freelist_wait); - pushed = true; - break; - } - spin_unlock(&c->freelist_lock); - - if (pushed) + if (__push_invalidated_bucket(c, ca, bucket)) break; - if (kthread_should_stop()) + if ((current->flags & PF_KTHREAD) && + kthread_should_stop()) { + ret = -1; break; + } schedule(); try_to_freeze(); } __set_current_state(TASK_RUNNING); + return ret; +} + +/* + * Given an invalidated, ready to use bucket: issue a discard to it if enabled, + * then add it to the freelist, waiting until there's room if necessary: + */ +static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + while (ca->nr_invalidated) { + size_t bucket = fifo_peek(&ca->free_inc); + + BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO, 0); + + if (push_invalidated_bucket(c, ca, bucket)) + return -1; + } + + return 0; } /** * bch_allocator_thread - move buckets from free_inc to reserves * - * The free_inc FIFO is populated by invalidate_buckets(), and + * The free_inc FIFO is populated by find_reclaimable_buckets(), and * the reserves are depleted by bucket allocation. When we run out * of free_inc, try to invalidate some buckets and write out * prios and gens. 
@@ -889,43 +929,36 @@ static int bch2_allocator_thread(void *arg) struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; u64 journal_seq; - size_t bucket; int ret; set_freezable(); while (1) { while (1) { - while (ca->nr_invalidated) { - BUG_ON(fifo_empty(&ca->free_inc)); - - bucket = fifo_peek(&ca->free_inc); - discard_invalidated_bucket(c, ca, bucket); - if (kthread_should_stop()) - return 0; - --ca->nr_invalidated; - } + ret = discard_invalidated_buckets(c, ca); + if (ret) + return 0; if (fifo_empty(&ca->free_inc)) break; journal_seq = 0; - ret = bch2_invalidate_free_inc(c, ca, &journal_seq); - if (ret < 0) + ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX); + if (ret) return 0; - ca->nr_invalidated = ret; - - if (ca->nr_invalidated == fifo_used(&ca->free_inc)) { - ca->alloc_thread_started = true; - bch2_alloc_write(c, ca, &journal_seq); - } - if (ca->allocator_invalidating_data) - bch2_journal_flush_seq(&c->journal, journal_seq); + ret = bch2_journal_flush_seq(&c->journal, journal_seq); else if (ca->allocator_journal_seq_flush) - bch2_journal_flush_seq(&c->journal, + ret = bch2_journal_flush_seq(&c->journal, ca->allocator_journal_seq_flush); + + /* + * journal error - buckets haven't actually been + * invalidated, can't discard them: + */ + if (ret) + return 0; } /* Reset front/back so we can easily sort fifo entries later: */ @@ -947,7 +980,7 @@ static int bch2_allocator_thread(void *arg) * another cache tier */ - invalidate_buckets(c, ca); + find_reclaimable_buckets(c, ca); trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -970,14 +1003,7 @@ static int bch2_allocator_thread(void *arg) } up_read(&c->gc_lock); - BUG_ON(ca->free_inc.front); - - spin_lock(&c->freelist_lock); - sort(ca->free_inc.data, - ca->free_inc.back, - sizeof(ca->free_inc.data[0]), - size_t_cmp, NULL); - spin_unlock(&c->freelist_lock); + sort_free_inc(c, ca); /* * free_inc is now full of newly-invalidated buckets: next, @@ -1037,51 +1063,27 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } -/* - * XXX: allocation on startup is still sketchy. There is insufficient - * synchronization for bch2_bucket_alloc_startup() to work correctly after - * bch2_alloc_write() has been called, and we aren't currently doing anything - * to guarantee that this won't happen. - * - * Even aside from that, it's really difficult to avoid situations where on - * startup we write out a pointer to a freshly allocated bucket before the - * corresponding gen - when we're still digging ourself out of the "i need to - * allocate to write bucket gens, but i need to write bucket gens to allocate" - * hole. - * - * Fortunately, bch2_btree_mark_key_initial() will detect and repair this - * easily enough... 
- */ -static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) +/* _only_ for allocating the journal and btree roots on a brand new fs: */ +int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; ssize_t b; - if (!down_read_trylock(&c->gc_lock)) - return -1; - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - return -1; - } - - spin_unlock(&c->freelist_lock); - - down_read(&ca->bucket_lock); + rcu_read_lock(); buckets = bucket_array(ca); - spin_lock(&c->freelist_lock); - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_startup_available_bucket(buckets->b[b].mark) && - bch2_mark_alloc_bucket_startup(c, ca, b)) { + if (is_available_bucket(buckets->b[b].mark)) { + bch2_mark_alloc_bucket(c, ca, b, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); set_bit(b, ca->buckets_dirty); goto success; } b = -1; success: - up_read(&ca->bucket_lock); - up_read(&c->gc_lock); + rcu_read_unlock(); return b; } @@ -1150,8 +1152,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, break; } - if (unlikely(!ca->alloc_thread_started) && - (reserve == RESERVE_ALLOC) && + if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) && (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0) goto out; @@ -1858,6 +1859,172 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } +static int __bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + size_t bu, i, devs_have_enough = 0; + unsigned dev_iter; + u64 journal_seq = 0; + bool invalidating_data = false; + int ret = 0; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + return -1; + + /* Scan for buckets that are already invalidated: */ + for_each_rw_member(ca, c, dev_iter) { + struct btree_iter iter; + struct bucket_mark m; + struct bkey_s_c k; + + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { + if (k.k->type != BCH_ALLOC) + continue; + + bu = k.k->p.offset; + m = READ_ONCE(bucket(ca, bu)->mark); + + if (!is_available_bucket(m) || m.cached_sectors) + continue; + + bch2_mark_alloc_bucket(c, ca, bu, true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + + fifo_push(&ca->free_inc, bu); + ca->nr_invalidated++; + + if (fifo_full(&ca->free_inc)) + break; + } + bch2_btree_iter_unlock(&iter); + } + + /* did we find enough buckets? */ + for_each_rw_member(ca, c, dev_iter) + devs_have_enough += (fifo_used(&ca->free_inc) >= + ca->free[RESERVE_BTREE].size); + + if (devs_have_enough >= c->opts.metadata_replicas) + return 0; + + /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ + for_each_rw_member(ca, c, dev_iter) + discard_invalidated_buckets(c, ca); + + for_each_rw_member(ca, c, dev_iter) { + BUG_ON(!fifo_empty(&ca->free_inc)); + ca->free_inc.front = ca->free_inc.back = 0; + + find_reclaimable_buckets(c, ca); + sort_free_inc(c, ca); + + invalidating_data |= ca->allocator_invalidating_data; + + fifo_for_each_entry(bu, &ca->free_inc, i) + if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) + break; + } + + /* + * We're moving buckets to freelists _before_ they've been marked as + * invalidated on disk - we have to so that we can allocate new btree + * nodes to mark them as invalidated on disk. 
+ * + * However, we can't _write_ to any of these buckets yet - they might + * have cached data in them, which is live until they're marked as + * invalidated on disk: + */ + if (invalidating_data) + set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + + /* + * XXX: it's possible for this to deadlock waiting on journal reclaim, + * since we're holding btree writes. What then? + */ + + for_each_rw_member(ca, c, dev_iter) { + ret = bch2_invalidate_free_inc(c, ca, &journal_seq, + ca->free[RESERVE_BTREE].size); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + if (invalidating_data) { + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) + return ret; + } + + for_each_rw_member(ca, c, dev_iter) + while (ca->nr_invalidated) { + BUG_ON(!fifo_pop(&ca->free_inc, bu)); + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bu), + ca->mi.bucket_size, GFP_NOIO, 0); + ca->nr_invalidated--; + } + + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + + /* now flush dirty btree nodes: */ + if (invalidating_data) { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + rcu_read_unlock(); + six_lock_read(&b->lock); + bch2_btree_node_write(c, b, NULL, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } + rcu_read_unlock(); + } + + return 0; +} + +int bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + down_read(&c->gc_lock); + ret = __bch2_fs_allocator_start(c); + up_read(&c->gc_lock); + + if (ret) + return ret; + + for_each_rw_member(ca, c, i) { + ret = bch2_dev_allocator_start(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + for_each_rw_member(ca, c, i) { + ret = bch2_alloc_write(c, ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return 0; +} + void bch2_fs_allocator_init(struct bch_fs *c) { struct open_bucket *ob; diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index ee771ee1..1b9d960b 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -118,6 +118,7 @@ static inline void writepoint_init(struct write_point *wp, wp->type = type; } +int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_init(struct bch_fs *); extern const struct bkey_ops bch2_bkey_alloc_ops; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 02e38410..78c427fa 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -281,11 +281,9 @@ do { \ #include "clock_types.h" #include "journal_types.h" #include "keylist_types.h" +#include "quota_types.h" #include "super_types.h" -/* 256k, in sectors */ -#define BTREE_NODE_SIZE_MAX 512 - /* * Number of nodes we might have to allocate in a worst case btree split * operation - we split all the way up to the root, then allocate a new root. @@ -380,7 +378,6 @@ struct bch_dev { alloc_fifo free_inc; spinlock_t freelist_lock; unsigned nr_invalidated; - bool alloc_thread_started; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -423,18 +420,28 @@ struct bch_dev { * won't automatically reattach). 
*/ enum { + /* startup: */ + BCH_FS_BRAND_NEW_FS, BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOCATOR_STARTED, BCH_FS_INITIAL_GC_DONE, + BCH_FS_FSCK_DONE, + + /* shutdown: */ BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_GC_STOPPING, - BCH_FS_GC_FAILURE, - BCH_FS_BDEV_MOUNTED, + + /* errors: */ BCH_FS_ERROR, + BCH_FS_GC_FAILURE, + + /* misc: */ + BCH_FS_BDEV_MOUNTED, BCH_FS_FSCK_FIXED_ERRORS, - BCH_FS_FSCK_DONE, BCH_FS_FIXED_GENS, BCH_FS_REBUILD_REPLICAS, + BCH_FS_HOLD_BTREE_WRITES, }; struct btree_debug { @@ -517,7 +524,7 @@ struct bch_fs { struct mutex sb_lock; /* BTREE CACHE */ - struct bio_set btree_read_bio; + struct bio_set btree_bio; struct btree_root btree_roots[BTREE_ID_NR]; bool btree_roots_dirty; @@ -665,6 +672,9 @@ struct bch_fs { unsigned writeback_pages_max; atomic_long_t nr_inodes; + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + /* DEBUG JUNK */ struct dentry *debug; struct btree_debug btree_debug[BTREE_ID_NR]; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d65b5e66..cb9e450b 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -606,11 +606,13 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_generation, 32) \ BCH_INODE_FIELD(bi_dev, 32) \ BCH_INODE_FIELD(bi_data_checksum, 8) \ - BCH_INODE_FIELD(bi_compression, 8) + BCH_INODE_FIELD(bi_compression, 8) \ + BCH_INODE_FIELD(bi_project, 32) #define BCH_INODE_FIELDS_INHERIT() \ BCH_INODE_FIELD(bi_data_checksum) \ - BCH_INODE_FIELD(bi_compression) + BCH_INODE_FIELD(bi_compression) \ + BCH_INODE_FIELD(bi_project) enum { /* @@ -737,6 +739,36 @@ struct bch_alloc { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(alloc, BCH_ALLOC); +/* Quotas: */ + +enum { + BCH_QUOTA = 128, +}; + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(quota, BCH_QUOTA); + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -749,7 +781,8 @@ struct bch_sb_field { x(journal, 0) \ x(members, 1) \ x(crypt, 2) \ - x(replicas, 3) + x(replicas, 3) \ + x(quota, 4) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -883,6 +916,23 @@ struct bch_sb_field_replicas { struct bch_replicas_entry entries[0]; }; +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __attribute__((packed, aligned(8))); + /* Superblock: */ /* @@ -986,6 +1036,11 @@ LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); +LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); +LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); +LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + +/* 60-64 unused */ LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); @@ -1181,7 +1236,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, 
flags, 4, 5); DEF_BTREE_ID(INODES, 1, "inodes") \ DEF_BTREE_ID(DIRENTS, 2, "dirents") \ DEF_BTREE_ID(XATTRS, 3, "xattrs") \ - DEF_BTREE_ID(ALLOC, 4, "alloc") + DEF_BTREE_ID(ALLOC, 4, "alloc") \ + DEF_BTREE_ID(QUOTAS, 5, "quotas") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 89697956..f665e2e1 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -7,6 +7,10 @@ #include "util.h" #include "vstructs.h" +#ifdef CONFIG_X86_64 +#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#endif + void bch2_to_binary(char *, const u64 *, unsigned); #define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) @@ -381,8 +385,7 @@ static inline u64 bkey_field_max(const struct bkey_format *f, : U64_MAX; } -#ifdef CONFIG_X86_64 -#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK int bch2_compile_bkey_format(const struct bkey_format *, void *); @@ -583,6 +586,8 @@ BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); +BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); + /* byte order helpers */ #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 1736a483..3b3a09eb 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -7,6 +7,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "quota.h" #include "xattr.h" const struct bkey_ops *bch2_bkey_ops[] = { @@ -15,6 +16,7 @@ const struct bkey_ops *bch2_bkey_ops[] = { [BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops, [BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops, [BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops, + [BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops, [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops, }; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 10f3f3f3..02be5bb4 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -1550,9 +1550,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, __bch2_btree_node_iter_init(iter, is_extents); - //if (bkey_cmp(search, b->curr_max_key) > 0) - // return; - switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { case BKEY_PACK_POS_EXACT: packed_search = &p; diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 46d536eb..e021d6e9 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -45,8 +45,8 @@ static inline bool btree_node_hashed(struct btree *b) } #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ - for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ - &(_c)->btree_cache_table), \ + for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ + &(_c)->btree_cache.table), \ _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 7d1be86f..9f1071e5 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -148,23 +148,24 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, { enum bch_data_type data_type = type == BKEY_TYPE_BTREE ? 
BCH_DATA_BTREE : BCH_DATA_USER; + struct bch_devs_list devs = bch2_bkey_devs(k); int ret = 0; + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c, + "superblock not marked as containing replicas (type %u)", + data_type)) { + ret = bch2_check_mark_super(c, data_type, devs); + if (ret) + return ret; + } + switch (k.k->type) { case BCH_EXTENT: case BCH_EXTENT_CACHED: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c, - "superblock not marked as containing replicas (type %u)", - data_type)) { - ret = bch2_check_mark_super(c, e, data_type); - if (ret) - return ret; - } - extent_for_each_ptr(e, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t b = PTR_BUCKET_NR(ca, ptr); @@ -284,7 +285,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + if (!btree_node_fake(b)) + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); @@ -991,8 +993,10 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) if (!c->btree_roots[id].b) return 0; - ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&c->btree_roots[id].b->key)); + b = c->btree_roots[id].b; + if (!btree_node_fake(b)) + ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&b->key)); if (ret) return ret; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 87a8ddf9..3f87e91e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1352,7 +1352,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, return; } - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); @@ -1438,9 +1438,9 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) } static void bch2_btree_node_write_error(struct bch_fs *c, - struct bch_write_bio *wbio) + struct btree_write_bio *wbio) { - struct btree *b = wbio->bio.bi_private; + struct btree *b = wbio->wbio.bio.bi_private; struct closure *cl = wbio->cl; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct bkey_i_extent *new_key; @@ -1473,7 +1473,7 @@ retry: new_key = bkey_i_to_extent(&tmp.k); e = extent_i_to_s(new_key); extent_for_each_ptr_backwards(e, ptr) - if (bch2_dev_list_has_dev(wbio->failed, ptr->dev)) + if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)) bch2_extent_drop_ptr(e, ptr); if (!bch2_extent_nr_ptrs(e.c)) @@ -1486,7 +1486,7 @@ retry: goto err; out: bch2_btree_iter_unlock(&iter); - bio_put(&wbio->bio); + bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); if (cl) closure_put(cl); @@ -1511,17 +1511,46 @@ void bch2_btree_write_error_work(struct work_struct *work) if (!bio) break; - bch2_btree_node_write_error(c, to_wbio(bio)); + bch2_btree_node_write_error(c, + container_of(bio, struct btree_write_bio, wbio.bio)); } } +static void btree_node_write_work(struct work_struct *work) +{ + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct closure *cl = wbio->cl; + struct bch_fs *c = wbio->wbio.c; + struct btree *b = 
wbio->wbio.bio.bi_private; + + btree_bounce_free(c, + wbio->wbio.order, + wbio->wbio.used_mempool, + wbio->data); + + if (wbio->wbio.failed.nr) { + unsigned long flags; + + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + + queue_work(c->wq, &c->btree_write_error_work); + return; + } + + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + if (cl) + closure_put(cl); +} + static void btree_node_write_endio(struct bio *bio) { - struct btree *b = bio->bi_private; struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_write_bio *orig = parent ?: wbio; - struct closure *cl = !wbio->split ? wbio->cl : NULL; struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; unsigned long flags; @@ -1542,27 +1571,13 @@ static void btree_node_write_endio(struct bio *bio) if (parent) { bio_put(bio); bio_endio(&parent->bio); - return; + } else { + struct btree_write_bio *wb = + container_of(orig, struct btree_write_bio, wbio); + + INIT_WORK(&wb->work, btree_node_write_work); + schedule_work(&wb->work); } - - btree_bounce_free(c, - wbio->order, - wbio->used_mempool, - wbio->data); - - if (wbio->failed.nr) { - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bio_list_add(&c->btree_write_error_list, &wbio->bio); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - - queue_work(c->wq, &c->btree_write_error_work); - return; - } - - bio_put(bio); - btree_node_write_done(c, b); - if (cl) - closure_put(cl); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, @@ -1586,7 +1601,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct closure *parent, enum six_lock_type lock_type_held) { - struct bch_write_bio *wbio; + struct btree_write_bio *wbio; struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; @@ -1602,6 +1617,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned long old, new; void *data; + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + /* * We may only have a read lock on the btree node - the dirty bit is our * "lock" against racing with other threads that may be trying to start @@ -1631,6 +1649,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); + BUG_ON(btree_node_fake(b)); BUG_ON(!list_empty(&b->write_blocked)); BUG_ON((b->will_make_reachable != NULL) != !b->written); @@ -1763,21 +1782,22 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, trace_btree_write(b, bytes_to_write, sectors_to_write); - wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write)); - wbio->cl = parent; - wbio->failed.nr = 0; - wbio->order = order; - wbio->used_mempool = used_mempool; - wbio->data = data; - wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; - wbio->bio.bi_iter.bi_size = sectors_to_write << 9; - wbio->bio.bi_end_io = btree_node_write_endio; - wbio->bio.bi_private = b; + wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio), + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->cl = parent; + wbio->wbio.order = order; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; + wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; 
+ wbio->wbio.bio.bi_private = b; if (parent) closure_get(parent); - bch2_bio_map(&wbio->bio, data); + bch2_bio_map(&wbio->wbio.bio, data); /* * If we're appending to a leaf node, we don't technically need FUA - @@ -1802,7 +1822,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; - bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); return; err: set_btree_node_noevict(b); @@ -1905,11 +1925,7 @@ void bch2_btree_verify_flushed(struct bch_fs *c) unsigned i; rcu_read_lock(); - tbl = rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(b, pos, tbl, i, hash) - BUG_ON(btree_node_dirty(b)); + for_each_cached_btree(b, c, tbl, i, pos) + BUG_ON(btree_node_dirty(b)); rcu_read_unlock(); } diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 61165a63..c8417ac3 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -2,6 +2,7 @@ #define _BCACHEFS_BTREE_IO_H #include "extents.h" +#include "io_types.h" struct bch_fs; struct btree_write; @@ -17,6 +18,13 @@ struct btree_read_bio { struct bio bio; }; +struct btree_write_bio { + struct closure *cl; + void *data; + struct work_struct work; + struct bch_write_bio wbio; +}; + static inline void btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 0b505a73..ee463f36 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -202,21 +202,20 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ - static void btree_iter_drop_extra_locks(struct btree_iter *iter) { unsigned l; while (iter->nodes_locked && (l = __fls(iter->nodes_locked)) > iter->locks_want) { - if (!btree_node_locked(iter, l)) - panic("l %u nodes_locked %u\n", l, iter->nodes_locked); - if (l > iter->level) { btree_node_unlock(iter, l); - } else if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->nodes[l]->lock); - iter->nodes_intent_locked ^= 1 << l; + } else { + if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->nodes[l]->lock); + iter->nodes_intent_locked ^= 1 << l; + } + break; } } } @@ -861,7 +860,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) i < iter->locks_want && iter->nodes[i]; i++) if (!bch2_btree_node_relock(iter, i)) { - while (iter->nodes[iter->level] && + while (iter->level < BTREE_MAX_DEPTH && + iter->nodes[iter->level] && iter->level + 1 < iter->locks_want) btree_iter_up(iter); break; @@ -872,7 +872,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * If the current node isn't locked, go up until we have a locked node * or run out of nodes: */ - while (iter->nodes[iter->level] && + while (iter->level < BTREE_MAX_DEPTH && + iter->nodes[iter->level] && !(is_btree_node(iter, iter->level) && bch2_btree_node_relock(iter, iter->level) && btree_iter_pos_cmp(iter->pos, @@ -884,7 +885,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * If we've got a btree node locked (i.e. 
we aren't about to relock the * root) - advance its node iterator if necessary: */ - if (iter->nodes[iter->level]) { + if (iter->level < BTREE_MAX_DEPTH && + iter->nodes[iter->level]) { struct bkey_s_c k; while ((k = __btree_iter_peek_all(iter)).k && @@ -956,7 +958,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) btree_iter_up(iter); - if (!iter->nodes[iter->level]) + if (iter->level == BTREE_MAX_DEPTH || + !iter->nodes[iter->level]) return NULL; /* parent node usually won't be locked: redo traversal if necessary */ diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index a7fdba82..eb196a3a 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -50,10 +50,8 @@ struct btree_iter { * always fail (but since freeing a btree node takes a write lock on the * node, which increments the node's lock seq, that's not actually * necessary in that example). - * - * One extra slot for a sentinel NULL: */ - struct btree *nodes[BTREE_MAX_DEPTH + 1]; + struct btree *nodes[BTREE_MAX_DEPTH]; struct btree_node_iter node_iters[BTREE_MAX_DEPTH]; /* diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index acfe5b59..ca2992ba 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -92,6 +92,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) int lock_type = btree_node_locked_type(iter, level); EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE); + EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) six_unlock_type(&iter->nodes[level]->lock, lock_type); @@ -106,6 +107,8 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *iter, enum six_lock_type type) { + EBUG_ON(level >= BTREE_MAX_DEPTH); + return likely(six_trylock_type(&b->lock, type)) || __bch2_btree_node_lock(b, pos, level, iter, type); } diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index f0e6896a..fb2f7e21 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -197,6 +197,7 @@ enum btree_flags { BTREE_NODE_write_in_flight, BTREE_NODE_just_written, BTREE_NODE_dying, + BTREE_NODE_fake, }; BTREE_FLAG(read_in_flight); @@ -209,6 +210,7 @@ BTREE_FLAG(accessed); BTREE_FLAG(write_in_flight); BTREE_FLAG(just_written); BTREE_FLAG(dying); +BTREE_FLAG(fake); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 04854532..a0f37c4c 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -546,8 +546,8 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } - ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), - BCH_DATA_BTREE); + ret = bch2_check_mark_super(c, BCH_DATA_BTREE, + bch2_bkey_devs(bkey_i_to_s_c(&b->key))); if (ret) goto err_free; @@ -915,6 +915,10 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct bset_tree *t; set_btree_node_dying(b); + + if (btree_node_fake(b)) + return; + btree_interior_update_add_node_reference(as, b); /* @@ -1052,7 +1056,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) gc_pos_btree_root(b->btree_id), &stats, 0, 0); - if (old) + if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), &stats); @@ -1422,7 +1426,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, 
bch2_btree_node_lock_for_insert(c, b, iter); - if (bch_keylist_u64s(keys) > bch_btree_keys_u64s_remaining(c, b)) { + if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { bch2_btree_node_unlock_write(b, iter); return -1; } @@ -1957,7 +1961,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE); + ret = bch2_check_mark_super(c, BCH_DATA_BTREE, + bch2_extent_devs(extent_i_to_s_c(new_key))); if (ret) goto err_free_update; @@ -1993,45 +1998,43 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_ondisk(c, b, READ); } -int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, - struct closure *writes) +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) { - struct btree_update *as; struct closure cl; struct btree *b; + int ret; - memset(&as, 0, sizeof(as)); closure_init_stack(&cl); - while (1) { - /* XXX haven't calculated capacity yet :/ */ - as = bch2_btree_update_start(c, id, 1, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); closure_sync(&cl); + } while (ret); - if (!IS_ERR(as)) - break; + b = bch2_btree_node_mem_alloc(c); + bch2_btree_cache_cannibalize_unlock(c); - if (PTR_ERR(as) == -ENOSPC) - return PTR_ERR(as); - } + set_btree_node_fake(b); + b->level = 0; + b->btree_id = id; - b = __btree_root_alloc(as, 0); + bkey_extent_init(&b->key); + b->key.k.p = POS_MAX; + bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id; - bch2_btree_node_write(c, b, writes, SIX_LOCK_intent); - btree_update_drop_new_node(c, b); + bch2_bset_init_first(b, &b->data->keys); + bch2_btree_build_aux_trees(b); - BUG_ON(btree_node_root(c, b)); + b->data->min_key = POS_MIN; + b->data->max_key = POS_MAX; + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); - bch2_btree_set_root_inmem(as, b); - bch2_btree_set_root_ondisk(c, b, WRITE); + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id); + BUG_ON(ret); - bch2_btree_open_bucket_put(c, b); + __bch2_btree_set_root_inmem(c, b); + + six_unlock_write(&b->lock); six_unlock_intent(&b->lock); - - bch2_btree_update_free(as); - - return 0; } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index e129b24e..23ee3980 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -150,7 +150,7 @@ int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, enum btree_node_sibling); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *); +void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) @@ -280,6 +280,9 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, struct btree *b, unsigned u64s) { + if (unlikely(btree_node_fake(b))) + return false; + if (btree_node_is_extents(b)) { /* The insert key might split an existing key * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 2dbe7d37..43133cbb 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -258,6 +258,11 @@ static u64 reserve_factor(u64 r) return r + (round_up(r, (1 << RESERVE_FACTOR)) 
>> RESERVE_FACTOR); } +static u64 avail_factor(u64 r) +{ + return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1; +} + u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) { struct fs_usage_sum sum = __fs_usage_sum(stats); @@ -270,6 +275,11 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) return min(c->capacity, __bch2_fs_sectors_used(c, stats)); } +u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) +{ + return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats)); +} + static inline int is_unavailable_bucket(struct bucket_mark m) { return !is_available_bucket(m); @@ -382,7 +392,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, } new.owned_by_allocator = 1; - new.touched_this_mount = 1; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; @@ -396,29 +405,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, return true; } -bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca, - size_t b) -{ - struct bucket *g; - struct bucket_mark new, old; - - lg_local_lock(&c->usage_lock); - g = bucket(ca, b); - - old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (!is_startup_available_bucket(new)) { - lg_local_unlock(&c->usage_lock); - return false; - } - - new.owned_by_allocator = 1; - new.touched_this_mount = 1; - })); - lg_local_unlock(&c->usage_lock); - - return true; -} - void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) @@ -436,7 +422,6 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, } old = bucket_data_cmpxchg(c, ca, g, new, ({ - new.touched_this_mount = 1; new.owned_by_allocator = owned_by_allocator; })); lg_local_unlock(&c->usage_lock); @@ -481,7 +466,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, saturated_add(ca, new.dirty_sectors, sectors, GC_MAX_SECTORS_USED); new.data_type = type; - new.touched_this_mount = 1; })); lg_local_unlock(&c->usage_lock); @@ -539,7 +523,6 @@ static void bch2_mark_pointer(struct bch_fs *c, if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) bucket_cmpxchg(g, new, ({ - new.touched_this_mount = 1; new.journal_seq_valid = 1; new.journal_seq = journal_seq; })); @@ -588,8 +571,6 @@ static void bch2_mark_pointer(struct bch_fs *c, new.data_type = data_type; } - new.touched_this_mount = 1; - if (flags & BCH_BUCKET_MARK_NOATOMIC) { g->_mark = new; break; @@ -694,17 +675,12 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, static u64 __recalc_sectors_available(struct bch_fs *c) { - u64 avail; int cpu; for_each_possible_cpu(cpu) per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - avail = c->capacity - bch2_fs_sectors_used(c, bch2_fs_usage_read(c)); - - avail <<= RESERVE_FACTOR; - avail /= (1 << RESERVE_FACTOR) + 1; - return avail; + return bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); } /* Used by gc when it's starting: */ @@ -839,7 +815,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bucket_array *buckets = NULL, *old_buckets; + struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_dirty = NULL; u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 78243129..86e72829 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -184,6 +184,7 @@ void bch2_fs_usage_apply(struct bch_fs *, 
struct bch_fs_usage *, u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage); static inline bool is_available_bucket(struct bucket_mark mark) { @@ -192,11 +193,6 @@ static inline bool is_available_bucket(struct bucket_mark mark) !mark.nouse); } -static inline bool is_startup_available_bucket(struct bucket_mark mark) -{ - return !mark.touched_this_mount && is_available_bucket(mark); -} - static inline bool bucket_needs_journal_commit(struct bucket_mark m, u16 last_seq_ondisk) { @@ -208,8 +204,6 @@ void bch2_bucket_seq_cleanup(struct bch_fs *); bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); -bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *, - size_t); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 7cd8439a..6f52a109 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -15,8 +15,7 @@ struct bucket_mark { gen_valid:1, owned_by_allocator:1, nouse:1, - journal_seq_valid:1, - touched_this_mount:1; + journal_seq_valid:1; u16 dirty_sectors; u16 cached_sectors; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 1618ffe7..1498832b 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -64,7 +64,7 @@ found: static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) { struct bch_ioctl_assemble arg; - const char *err; + struct bch_fs *c; u64 *user_devs = NULL; char **devs = NULL; unsigned i; @@ -96,14 +96,10 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) } } - err = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty(), NULL); - if (err) { - pr_err("Could not open filesystem: %s", err); - ret = -EINVAL; - goto err; - } - - ret = 0; + c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); + ret = PTR_ERR_OR_ZERO(c); + if (!ret) + closure_put(&c->cl); err: if (devs) for (i = 0; i < arg.nr_devs; i++) diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index ccfb0386..0f090ca5 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -58,7 +58,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) if (IS_ERR_OR_NULL(pick.ca)) return; - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); bio->bi_bdev = pick.ca->disk_sb.bdev; bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 28fe4fce..ac3e96d2 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -143,9 +143,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define __fsck_err_on(cond, c, _flags, ...) \ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) -#define unfixable_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE, ##__VA_ARGS__) - #define need_fsck_err_on(cond, c, ...) 
\ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 2b4a2dc2..bceea486 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -666,7 +666,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, goto err; } - if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) { + if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, @@ -1803,7 +1803,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (!bkey_extent_is_cached(e.k) && - !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) { + !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), e.s_c); bch2_fs_bug(c, diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index aeae361d..eda34381 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -426,6 +426,17 @@ static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent return ret; } +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + return bch2_extent_devs(bkey_s_c_to_extent(k)); + default: + return (struct bch_devs_list) { .nr = 0 }; + } +} + bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, struct bch_extent_crc_unpacked); bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 98f22f6a..08739d26 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -57,6 +57,7 @@ do { \ #define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) #define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) +#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] #define fifo_push_back_ref(f) \ (fifo_full((f)) ? 
NULL : &(f)->data[(f)->back++ & (f)->mask]) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 2c34a85c..66374a9c 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -12,6 +12,7 @@ #include "journal.h" #include "io.h" #include "keylist.h" +#include "quota.h" #include #include @@ -56,14 +57,13 @@ struct bch_writepage_io { struct dio_write { struct closure cl; struct kiocb *req; - struct bch_fs *c; - loff_t offset; - - struct iovec *iovec; - struct iovec inline_vecs[UIO_FASTIOV]; - struct iov_iter iter; - struct task_struct *task; + unsigned loop:1, + sync:1, + free_iov:1; + + struct iov_iter iter; + struct iovec inline_vecs[2]; /* must be last: */ struct bchfs_write_op iop; @@ -130,6 +130,7 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c, static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors) { inode->v.i_blocks += sectors; + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors) @@ -1286,7 +1287,8 @@ static int bch2_read_single_page(struct page *page, int ret; DECLARE_COMPLETION_ONSTACK(done); - rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read)); + rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), + io_opts(c, inode)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1439,13 +1441,15 @@ static void bch2_direct_IO_read_split_endio(struct bio *bio) bio_check_pages_dirty(bio); /* transfers ownership */ } -static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req, - struct file *file, struct bch_inode_info *inode, - struct iov_iter *iter, loff_t offset) +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) { + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, inode); struct dio_read *dio; struct bio *bio; + loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); ssize_t ret; @@ -1525,103 +1529,128 @@ start: } } -static long __bch2_dio_write_complete(struct dio_write *dio) +static void bch2_dio_write_loop_async(struct closure *); + +static long bch2_dio_write_loop(struct dio_write *dio) { - struct file *file = dio->req->ki_filp; + struct kiocb *req = dio->req; + struct file *file = req->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); - - bch2_disk_reservation_put(dio->c, &dio->iop.op.res); - - __pagecache_block_put(&mapping->add_lock); - inode_dio_end(&inode->v); - - if (dio->iovec && dio->iovec != dio->inline_vecs) - kfree(dio->iovec); - - bio_put(&dio->iop.op.wbio.bio); - return ret; -} - -static void bch2_dio_write_complete(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, cl); - struct kiocb *req = dio->req; - - req->ki_complete(req, __bch2_dio_write_complete(dio), 0); -} - -static void bch2_dio_write_done(struct dio_write *dio) -{ + struct bio *bio = &dio->iop.op.wbio.bio; struct bio_vec *bv; + bool sync; + long ret; int i; - bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i) - put_page(bv->bv_page); + if (dio->loop) + goto loop; - if (dio->iter.count) - bio_reset(&dio->iop.op.wbio.bio); -} + inode_dio_begin(&inode->v); + __pagecache_block_get(&mapping->add_lock); -static void bch2_do_direct_IO_write(struct dio_write *dio) -{ - struct file 
*file = dio->req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bio *bio = &dio->iop.op.wbio.bio; - int ret; + /* Write and invalidate pagecache range that we're writing to: */ + ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, + req->ki_pos + iov_iter_count(&dio->iter) - 1); + if (unlikely(ret)) + goto err; - ret = bio_iov_iter_get_pages(bio, &dio->iter); - if (ret < 0) { - dio->iop.op.error = ret; - return; + while (1) { + BUG_ON(current->pagecache_lock); + current->pagecache_lock = &mapping->add_lock; + if (current != dio->task) + use_mm(dio->task->mm); + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + if (current != dio->task) + unuse_mm(dio->task->mm); + current->pagecache_lock = NULL; + + if (unlikely(ret < 0)) + goto err; + + dio->iop.op.pos = POS(inode->v.i_ino, + (req->ki_pos >> 9) + dio->iop.op.written); + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + + if (!dio->sync && !dio->loop && dio->iter.count) { + struct iovec *iov = dio->inline_vecs; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) { + dio->iop.op.error = -ENOMEM; + goto err_wait_io; + } + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.iov = iov; + } +err_wait_io: + dio->loop = true; + + if (!dio->sync) { + continue_at_noreturn(&dio->cl, + bch2_dio_write_loop_async, NULL); + return -EIOCBQUEUED; + } + + closure_sync(&dio->cl); +loop: + bio_for_each_segment_all(bv, bio, i) + put_page(bv->bv_page); + if (!dio->iter.count || dio->iop.op.error) + break; + bio_reset(bio); } - dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written); + ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); +err: + __pagecache_block_put(&mapping->add_lock); + inode_dio_end(&inode->v); + bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res); - task_io_account_write(bio->bi_iter.bi_size); + if (dio->free_iov) + kfree(dio->iter.iov); - closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); + closure_debug_destroy(&dio->cl); + + sync = dio->sync; + bio_put(bio); + + if (!sync) { + req->ki_complete(req, ret, 0); + ret = -EIOCBQUEUED; + } + return ret; } static void bch2_dio_write_loop_async(struct closure *cl) { - struct dio_write *dio = - container_of(cl, struct dio_write, cl); - struct address_space *mapping = dio->req->ki_filp->f_mapping; + struct dio_write *dio = container_of(cl, struct dio_write, cl); - bch2_dio_write_done(dio); - - if (dio->iter.count && !dio->iop.op.error) { - use_mm(dio->task->mm); - pagecache_block_get(&mapping->add_lock); - - bch2_do_direct_IO_write(dio); - - pagecache_block_put(&mapping->add_lock); - unuse_mm(dio->task->mm); - - continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); - } else { -#if 0 - closure_return_with_destructor(cl, bch2_dio_write_complete); -#else - closure_debug_destroy(cl); - bch2_dio_write_complete(cl); -#endif - } + bch2_dio_write_loop(dio); } -static int bch2_direct_IO_write(struct bch_fs *c, - struct kiocb *req, struct file *file, - struct bch_inode_info *inode, - struct iov_iter *iter, loff_t offset) +static int bch2_direct_IO_write(struct kiocb *req, + struct iov_iter *iter, + bool swap) { - struct address_space *mapping = file->f_mapping; + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct 
dio_write *dio; struct bio *bio; + loff_t offset = req->ki_pos; ssize_t ret; - bool sync = is_sync_kiocb(req); lockdep_assert_held(&inode->v.i_rwsem); @@ -1637,95 +1666,49 @@ static int bch2_direct_IO_write(struct bch_fs *c, dio = container_of(bio, struct dio_write, iop.op.wbio.bio); closure_init(&dio->cl, NULL); dio->req = req; - dio->c = c; - dio->offset = offset; - dio->iovec = NULL; - dio->iter = *iter; dio->task = current; + dio->loop = false; + dio->sync = is_sync_kiocb(req) || + offset + iter->count > inode->v.i_size; + dio->free_iov = false; + dio->iter = *iter; bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task); dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; - if ((dio->req->ki_flags & IOCB_DSYNC) && + if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) dio->iop.op.flags |= BCH_WRITE_FLUSH; - if (offset + iter->count > inode->v.i_size) - sync = true; - - /* - * XXX: we shouldn't return -ENOSPC if we're overwriting existing data - - * if getting a reservation fails we should check if we are doing an - * overwrite. - * - * Have to then guard against racing with truncate (deleting data that - * we would have been overwriting) - */ ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0); if (unlikely(ret)) { if (bch2_check_range_allocated(c, POS(inode->v.i_ino, offset >> 9), - iter->count >> 9)) { - closure_debug_destroy(&dio->cl); - bio_put(bio); - return ret; - } + iter->count >> 9)) + goto err; dio->iop.unalloc = true; } dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas; - inode_dio_begin(&inode->v); - __pagecache_block_get(&mapping->add_lock); - - if (sync) { - do { - bch2_do_direct_IO_write(dio); - - closure_sync(&dio->cl); - bch2_dio_write_done(dio); - } while (dio->iter.count && !dio->iop.op.error); - - closure_debug_destroy(&dio->cl); - return __bch2_dio_write_complete(dio); - } else { - bch2_do_direct_IO_write(dio); - - if (dio->iter.count && !dio->iop.op.error) { - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iovec = kmalloc(dio->iter.nr_segs * - sizeof(struct iovec), - GFP_KERNEL); - if (!dio->iovec) - dio->iop.op.error = -ENOMEM; - } else { - dio->iovec = dio->inline_vecs; - } - - memcpy(dio->iovec, - dio->iter.iov, - dio->iter.nr_segs * sizeof(struct iovec)); - dio->iter.iov = dio->iovec; - } - - continue_at_noreturn(&dio->cl, bch2_dio_write_loop_async, NULL); - return -EIOCBQUEUED; - } + return bch2_dio_write_loop(dio); +err: + bch2_disk_reservation_put(c, &dio->iop.op.res); + closure_debug_destroy(&dio->cl); + bio_put(bio); + return ret; } ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) { - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; struct blk_plug plug; ssize_t ret; blk_start_plug(&plug); - ret = ((iov_iter_rw(iter) == WRITE) - ? bch2_direct_IO_write - : bch2_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); + ret = iov_iter_rw(iter) == WRITE + ? 
bch2_direct_IO_write(req, iter, false) + : bch2_direct_IO_read(req, iter); blk_finish_plug(&plug); return ret; @@ -1734,26 +1717,7 @@ ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) static ssize_t bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = file->f_mapping; - loff_t pos = iocb->ki_pos; - ssize_t ret; - - pagecache_block_get(&mapping->add_lock); - - /* Write and invalidate pagecache range that we're writing to: */ - ret = write_invalidate_inode_pages_range(file->f_mapping, pos, - pos + iov_iter_count(iter) - 1); - if (unlikely(ret)) - goto err; - - ret = bch2_direct_IO_write(c, iocb, file, inode, iter, pos); -err: - pagecache_block_put(&mapping->add_lock); - - return ret; + return bch2_direct_IO_write(iocb, iter, true); } static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 24228c8e..6ae67f92 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -4,6 +4,7 @@ #include "chardev.h" #include "fs.h" #include "fs-ioctl.h" +#include "quota.h" #include #include @@ -154,10 +155,32 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, struct fsxattr fa = { 0 }; fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; return copy_to_user(arg, &fa, sizeof(fa)); } +static int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + int ret; + + if (projid == inode->ei_qid.q[QTYP_PRJ]) + return 0; + + qid.q[QTYP_PRJ] = projid; + + ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, + inode->v.i_blocks); + if (ret) + return ret; + + inode->ei_qid.q[QTYP_PRJ] = projid; + return 0; +} + static int bch2_ioc_fssetxattr(struct bch_fs *c, struct file *file, struct bch_inode_info *inode, @@ -185,9 +208,14 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); + ret = bch2_set_projid(c, inode, fa.fsx_projid); + if (ret) + goto err_unlock; + ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &flags); if (!ret) bch2_inode_flags_to_vfs(inode); +err_unlock: mutex_unlock(&inode->ei_update_lock); err: inode_unlock(&inode->v); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 472df23a..8869ba0f 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -15,6 +15,7 @@ #include "io.h" #include "journal.h" #include "keylist.h" +#include "quota.h" #include "super.h" #include "xattr.h" @@ -116,6 +117,7 @@ int __must_check __bch2_write_inode(struct bch_fs *c, inode_u.bi_mode = inode->v.i_mode; inode_u.bi_uid = i_uid_read(&inode->v); inode_u.bi_gid = i_gid_read(&inode->v); + inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ]; inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode); inode_u.bi_dev = inode->v.i_rdev; inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime); @@ -131,8 +133,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c, BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); } while (ret == -EINTR); - if (!ret) + if (!ret) { inode->ei_inode = inode_u; + inode->ei_qid = bch_qid(&inode_u); + } out: bch2_btree_iter_unlock(&iter); @@ -215,7 +219,7 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl); if (ret) { 
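The bch2_set_projid() helper added in the fs-ioctl.c hunk above follows a transfer-then-switch pattern: usage is charged to the new project id first, and the inode's cached qid is only updated once the transfer succeeds, so a failed FS_IOC_FSSETXATTR leaves the accounting untouched. A simplified sketch of that shape (not part of the diff; the struct and prj_transfer() below are stand-ins, not the real bch2_quota_transfer() API):

	/* illustrative sketch only -- not part of the patch */
	struct prj_usage { long long sectors; };

	/*
	 * Stand-in for bch2_quota_transfer(); the real helper would enforce
	 * limits and return an error if the destination is over quota.
	 */
	static int prj_transfer(struct prj_usage *from, struct prj_usage *to,
				long long sectors)
	{
		from->sectors -= sectors;
		to->sectors   += sectors;
		return 0;
	}

	static int set_projid(unsigned *cur_projid, struct prj_usage *cur,
			      struct prj_usage *dst, unsigned projid,
			      long long i_blocks)
	{
		int ret;

		if (projid == *cur_projid)
			return 0;

		ret = prj_transfer(cur, dst, i_blocks);	/* move usage first */
		if (ret)
			return ret;			/* nothing changed on error */

		*cur_projid = projid;			/* switch only on success */
		return 0;
	}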
make_bad_inode(&inode->v); - goto err; + goto err_make_bad; } #endif @@ -225,16 +229,20 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, inode->v.i_mode, rdev, &dir->ei_inode); + inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; + + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC); + if (ret) { + make_bad_inode(&inode->v); + goto err_make_bad; + } + ret = bch2_inode_create(c, &inode_u, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (unlikely(ret)) { - /* - * indicate to bch_evict_inode that the inode was never actually - * created: - */ - make_bad_inode(&inode->v); - goto err; + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); + goto err_make_bad; } bch2_vfs_inode_init(c, inode, &inode_u); @@ -257,6 +265,12 @@ out: posix_acl_release(default_acl); posix_acl_release(acl); return inode; +err_make_bad: + /* + * indicate to bch_evict_inode that the inode was never actually + * created: + */ + make_bad_inode(&inode->v); err: clear_nlink(&inode->v); iput(&inode->v); @@ -604,11 +618,53 @@ static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry, return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry); } +static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid = inode->ei_qid; + unsigned qtypes = 0; + int ret; + + mutex_lock(&inode->ei_update_lock); + + if (c->opts.usrquota && + (iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->v.i_uid)) { + qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid), + qtypes |= 1 << QTYP_USR; + } + + if (c->opts.grpquota && + (iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->v.i_gid)) { + qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid); + qtypes |= 1 << QTYP_GRP; + } + + if (qtypes) { + ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid, + inode->v.i_blocks); + if (ret) + goto out_unlock; + } + + setattr_copy(&inode->v, iattr); + + ret = bch2_write_inode(c, inode); +out_unlock: + mutex_unlock(&inode->ei_update_lock); + + if (!ret && + iattr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(&inode->v, inode->v.i_mode); + + return ret; +} + static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = 0; + int ret; lockdep_assert_held(&inode->v.i_rwsem); @@ -616,22 +672,9 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) return ret; - if (iattr->ia_valid & ATTR_SIZE) { - ret = bch2_truncate(inode, iattr); - } else { - mutex_lock(&inode->ei_update_lock); - setattr_copy(&inode->v, iattr); - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); - } - - if (unlikely(ret)) - return ret; - - if (iattr->ia_valid & ATTR_MODE) - ret = posix_acl_chmod(&inode->v, inode->v.i_mode); - - return ret; + return iattr->ia_valid & ATTR_SIZE + ? 
bch2_truncate(inode, iattr) + : bch2_setattr_nonsize(inode, iattr); } static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) @@ -910,6 +953,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); inode->ei_journal_seq = 0; + inode->ei_qid = bch_qid(bi); inode->ei_str_hash = bch2_hash_info_init(c, bi); inode->ei_inode = *bi; @@ -995,6 +1039,10 @@ static void bch2_evict_inode(struct inode *vinode) clear_inode(&inode->v); if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), + BCH_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + BCH_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); atomic_long_dec(&c->nr_inodes); } @@ -1009,8 +1057,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; - buf->f_bfree = (c->capacity - - bch2_fs_sectors_used(c, bch2_fs_usage_read(c))) >> + buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >> PAGE_SECTOR_SHIFT; buf->f_bavail = buf->f_bfree; buf->f_files = atomic_long_read(&c->nr_inodes); @@ -1037,17 +1084,83 @@ static int bch2_sync_fs(struct super_block *sb, int wait) return bch2_journal_flush(&c->journal); } +static struct bch_fs *bch2_path_to_fs(const char *dev) +{ + struct bch_fs *c; + struct block_device *bdev = lookup_bdev(dev); + + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + c = bch2_bdev_to_fs(bdev); + bdput(bdev); + return c ?: ERR_PTR(-ENOENT); +} + +static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, + unsigned nr_devs, struct bch_opts opts) +{ + struct bch_fs *c, *c1, *c2; + size_t i; + + if (!nr_devs) + return ERR_PTR(-EINVAL); + + c = bch2_fs_open(devs, nr_devs, opts); + + if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { + /* + * Already open? + * Look up each block device, make sure they all belong to a + * filesystem and they all belong to the _same_ filesystem + */ + + c1 = bch2_path_to_fs(devs[0]); + if (!c1) + return c; + + for (i = 1; i < nr_devs; i++) { + c2 = bch2_path_to_fs(devs[i]); + if (!IS_ERR(c2)) + closure_put(&c2->cl); + + if (c1 != c2) { + closure_put(&c1->cl); + return c; + } + } + + c = c1; + } + + if (IS_ERR(c)) + return c; + + mutex_lock(&c->state_lock); + + if (!bch2_fs_running(c)) { + mutex_unlock(&c->state_lock); + closure_put(&c->cl); + pr_err("err mounting %s: incomplete filesystem", dev_name); + return ERR_PTR(-EINVAL); + } + + mutex_unlock(&c->state_lock); + + set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); + return c; +} + static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, struct bch_opts opts) { - size_t nr_devs = 0, i = 0; - char *dev_name, *s, **devs; - struct bch_fs *c = NULL; - const char *err = "cannot allocate memory"; + char *dev_name = NULL, **devs = NULL, *s; + struct bch_fs *c = ERR_PTR(-ENOMEM); + size_t i, nr_devs = 0; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) - return NULL; + goto err; for (s = dev_name; s; s = strchr(s + 1, ':')) nr_devs++; @@ -1061,57 +1174,10 @@ static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, (s = strchr(s, ':')) && (*s++ = '\0')) devs[i++] = s; - err = bch2_fs_open(devs, nr_devs, opts, &c); - if (err) { - /* - * Already open? 
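bch2_open_as_blockdevs() above still receives the mount source as a single "dev1:dev2:dev3" string and splits it in place before handing the pieces to the new __bch2_open_as_blockdevs(). A self-contained sketch of that parse (not part of the diff; userspace-style allocation for brevity, with the caller freeing devs[0] and devs):

	/* illustrative sketch only -- not part of the patch */
	#include <stdlib.h>
	#include <string.h>

	static char **split_devs(const char *src, unsigned *nr)
	{
		char *buf = strdup(src), *s, **devs;
		unsigned i = 0, n = 1;

		if (!buf)
			return NULL;

		for (s = buf; (s = strchr(s, ':')) != NULL; s++)
			n++;			/* count ':' separators */

		devs = calloc(n, sizeof(*devs));
		if (!devs) {
			free(buf);
			return NULL;
		}

		devs[i++] = buf;
		for (s = buf; (s = strchr(s, ':')) != NULL; ) {
			*s++ = '\0';		/* terminate the previous name */
			devs[i++] = s;		/* next name starts right after */
		}

		*nr = n;
		return devs;
	}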
- * Look up each block device, make sure they all belong to a - * filesystem and they all belong to the _same_ filesystem - */ - - for (i = 0; i < nr_devs; i++) { - struct block_device *bdev = lookup_bdev(devs[i]); - struct bch_fs *c2; - - if (IS_ERR(bdev)) - goto err; - - c2 = bch2_bdev_to_fs(bdev); - bdput(bdev); - - if (!c) - c = c2; - else if (c2) - closure_put(&c2->cl); - - if (!c) - goto err; - if (c != c2) { - closure_put(&c->cl); - goto err; - } - } - - mutex_lock(&c->state_lock); - - if (!bch2_fs_running(c)) { - mutex_unlock(&c->state_lock); - closure_put(&c->cl); - err = "incomplete filesystem"; - c = NULL; - goto err; - } - - mutex_unlock(&c->state_lock); - } - - set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); + c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); err: kfree(devs); kfree(dev_name); - - if (!c) - pr_err("bch_fs_open err %s", err); return c; } @@ -1234,8 +1300,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, return ERR_PTR(ret); c = bch2_open_as_blockdevs(dev_name, opts); - if (!c) - return ERR_PTR(-ENOENT); + if (IS_ERR(c)) + return ERR_CAST(c); sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c); if (IS_ERR(sb)) { @@ -1261,6 +1327,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; sb->s_export_op = &bch_export_ops; +#ifdef CONFIG_BCACHEFS_QUOTA + sb->s_qcop = &bch2_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; +#endif sb->s_xattr = bch2_xattr_handlers; sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.time_precision; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 652105fb..dd0bd4ef 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -3,6 +3,7 @@ #include "opts.h" #include "str_hash.h" +#include "quota_types.h" #include #include @@ -13,6 +14,7 @@ struct bch_inode_info { struct mutex ei_update_lock; u64 ei_journal_seq; unsigned long ei_last_dirtied; + struct bch_qid ei_qid; struct bch_hash_info ei_str_hash; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 696926fe..ef09c131 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -266,26 +266,60 @@ static int check_extents(struct bch_fs *c) !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, "extent type %u for non regular file, inode %llu mode %o", k.k->type, k.k->p.inode, w.inode.bi_mode)) { - ret = bch2_btree_delete_at(&iter, 0); + bch2_btree_iter_unlock(&iter); + + ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL); if (ret) goto err; continue; } - unfixable_fsck_err_on(w.first_this_inode && + if (fsck_err_on(w.first_this_inode && w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && w.inode.bi_sectors != (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)), c, "i_sectors wrong: got %llu, should be %llu", - w.inode.bi_sectors, i_sectors); + w.inode.bi_sectors, i_sectors)) { + struct bkey_inode_buf p; - unfixable_fsck_err_on(w.have_inode && + w.inode.bi_sectors = i_sectors; + + bch2_btree_iter_unlock(&iter); + + bch2_inode_pack(&p, &w.inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &p.inode.k_i, + NULL, + NULL, + NULL, + BTREE_INSERT_NOFAIL); + if (ret) { + bch_err(c, "error in fs gc: error %i " + "updating inode", ret); + goto err; + } + + /* revalidate iterator: */ + k = bch2_btree_iter_peek(&iter); + } + + if (fsck_err_on(w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && k.k->type != BCH_RESERVATION && k.k->p.offset > round_up(w.inode.bi_size, 
PAGE_SIZE) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size); + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { + bch2_btree_iter_unlock(&iter); + + ret = bch2_inode_truncate(c, k.k->p.inode, + round_up(w.inode.bi_size, PAGE_SIZE) >> 9, + NULL, NULL); + if (ret) + goto err; + continue; + } } err: fsck_err: @@ -999,7 +1033,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, u64 nlinks_pos; bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0); - genradix_iter_init(&nlinks_iter); + nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(&iter)).k && !btree_iter_err(k)) { diff --git a/libbcachefs/io.c b/libbcachefs/io.c index e045eb20..6f6d42fc 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -268,7 +268,8 @@ static void bch2_write_index(struct closure *cl) } if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER); + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_extent_devs(e.c)); if (ret) goto err; } diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index ff18fdc9..32ecac24 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -67,10 +67,7 @@ struct bch_read_bio { struct bch_write_bio { struct bch_fs *c; struct bch_dev *ca; - union { struct bch_write_bio *parent; - struct closure *cl; - }; struct bch_devs_list failed; u8 order; @@ -82,7 +79,6 @@ struct bch_write_bio { used_mempool:1; unsigned submit_time_us; - void *data; struct bio bio; }; @@ -94,7 +90,7 @@ struct bch_write_op { unsigned written; /* sectors */ u16 flags; - s8 error; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ unsigned csum_type:4; unsigned compression_type:4; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 0133a31e..811f7a5c 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -88,6 +88,9 @@ struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, if (!entry) return NULL; + if (!entry->u64s) + return ERR_PTR(-EINVAL); + k = entry->start; *level = entry->level; *level = entry->level; @@ -415,6 +418,7 @@ static struct nonce journal_nonce(const struct jset *jset) }}; } +/* this fills in a range with empty jset_entries: */ static void journal_entry_null_range(void *start, void *end) { struct jset_entry *entry; @@ -423,7 +427,7 @@ static void journal_entry_null_range(void *start, void *end) memset(entry, 0, sizeof(*entry)); } -static int journal_validate_key(struct bch_fs *c, struct jset *j, +static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, struct bkey_i *k, enum bkey_type key_type, const char *type) @@ -458,7 +462,7 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j, return 0; } - if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN) + if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); @@ -497,26 +501,27 @@ fsck_err: #define journal_entry_err_on(cond, c, msg, ...) \ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) -static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j, +static int journal_entry_validate_entries(struct bch_fs *c, struct jset *jset, int write) { struct jset_entry *entry; int ret = 0; - vstruct_for_each(j, entry) { + vstruct_for_each(jset, entry) { + void *next = vstruct_next(entry); struct bkey_i *k; if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(j), c, + vstruct_last(jset), c, "journal entry extends past end of jset")) { - j->u64s = cpu_to_le32((u64 *) entry - j->_data); + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } switch (entry->type) { case JOURNAL_ENTRY_BTREE_KEYS: vstruct_for_each(entry, k) { - ret = journal_validate_key(c, j, entry, k, + ret = journal_validate_key(c, jset, entry, k, bkey_type(entry->level, entry->btree_id), "key"); @@ -531,12 +536,17 @@ static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j, if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, "invalid btree root journal entry: wrong number of keys")) { - journal_entry_null_range(entry, - vstruct_next(entry)); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); continue; } - ret = journal_validate_key(c, j, entry, k, + ret = journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, "btree root"); if (ret) goto fsck_err; @@ -566,21 +576,21 @@ fsck_err: } static int journal_entry_validate(struct bch_fs *c, - struct jset *j, u64 sector, + struct jset *jset, u64 sector, unsigned bucket_sectors_left, unsigned sectors_read, int write) { - size_t bytes = vstruct_bytes(j); + size_t bytes = vstruct_bytes(jset); struct bch_csum csum; int ret = 0; - if (le64_to_cpu(j->magic) != jset_magic(c)) + if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) { + if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { bch_err(c, "unknown journal entry version %u", - le32_to_cpu(j->version)); + le32_to_cpu(jset->version)); return BCH_FSCK_UNKNOWN_VERSION; } @@ -594,26 +604,26 @@ static int journal_entry_validate(struct bch_fs *c, if (bytes > sectors_read << 9) return JOURNAL_ENTRY_REREAD; - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c, + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(j), sector)) + JSET_CSUM_TYPE(jset), sector)) return JOURNAL_ENTRY_BAD; - csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); - if (journal_entry_err_on(bch2_crc_cmp(csum, j->csum), c, + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "journal checksum bad, sector %llu", sector)) { /* XXX: retry IO, when we start retrying checksum errors */ /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; } - bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, - vstruct_end(j) - (void *) j->encrypted_start); + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); - if (journal_entry_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c, + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, 
"invalid journal entry: last_seq > seq")) - j->last_seq = j->seq; + jset->last_seq = jset->seq; return 0; fsck_err: @@ -960,6 +970,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct bch_dev *ca; u64 cur_seq, end_seq; unsigned iter, keys = 0, entries = 0; + size_t nr; int ret = 0; closure_init_stack(&jlist.cl); @@ -994,12 +1005,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) goto fsck_err; if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_sb_has_replicas_devlist(c, &i->devs, - BCH_DATA_JOURNAL), c, + fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL, + i->devs), c, "superblock not marked as containing replicas (type %u)", BCH_DATA_JOURNAL)) { - ret = bch2_check_mark_super_devlist(c, &i->devs, - BCH_DATA_JOURNAL); + ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, + i->devs); if (ret) return ret; } @@ -1007,9 +1018,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) i = list_last_entry(list, struct journal_replay, list); - unfixable_fsck_err_on(le64_to_cpu(i->j.seq) - - le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c, - "too many journal entries open for refcount fifo"); + nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); + return -ENOMEM; + } + } atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); @@ -1131,18 +1149,19 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) #endif } -static void __journal_entry_new(struct journal *j, int count) +static void journal_pin_new_entry(struct journal *j, int count) { - struct journal_entry_pin_list *p = fifo_push_ref(&j->pin); + struct journal_entry_pin_list *p; /* * The fifo_push() needs to happen at the same time as j->seq is * incremented for last_seq() to be calculated correctly */ + p = fifo_push_ref(&j->pin); atomic64_inc(&j->seq); - BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != - &fifo_peek_back(&j->pin)); + EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != + &fifo_peek_back(&j->pin)); INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->flushed); @@ -1150,13 +1169,10 @@ static void __journal_entry_new(struct journal *j, int count) p->devs.nr = 0; } -static void __bch2_journal_next_entry(struct journal *j) +static void bch2_journal_buf_init(struct journal *j) { - struct journal_buf *buf; + struct journal_buf *buf = journal_cur_buf(j); - __journal_entry_new(j, 1); - - buf = journal_cur_buf(j); memset(buf->has_inode, 0, sizeof(buf->has_inode)); memset(buf->data, 0, sizeof(*buf->data)); @@ -1208,22 +1224,24 @@ static enum { } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - journal_reclaim_fast(j); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); buf = &j->buf[old.idx]; buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - buf->data->last_seq = cpu_to_le64(last_seq(j)); j->prev_buf_sectors = vstruct_blocks_plus(buf->data, c->block_bits, journal_entry_u64s_reserve(buf)) * c->opts.block_size; - BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); - __bch2_journal_next_entry(j); + journal_reclaim_fast(j); + /* XXX: why set this here, and not in journal_write()? 
*/ + buf->data->last_seq = cpu_to_le64(last_seq(j)); + + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); spin_unlock(&j->lock); @@ -1352,12 +1370,20 @@ static int journal_entry_sectors(struct journal *j) /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: + * + * returns: + * 1: success + * 0: journal currently full (must wait) + * -EROFS: insufficient rw devices + * -EIO: journal error */ static int journal_entry_open(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; ssize_t u64s; - int ret = 0, sectors; + int sectors; + u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); @@ -1387,41 +1413,36 @@ static int journal_entry_open(struct journal *j) BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); - if (u64s > le32_to_cpu(buf->data->u64s)) { - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); + if (u64s <= le32_to_cpu(buf->data->u64s)) + return 0; - /* - * Must be set before marking the journal entry as open: - */ - j->cur_entry_u64s = u64s; + /* + * Must be set before marking the journal entry as open: + */ + j->cur_entry_u64s = u64s; - do { - old.v = new.v = v; + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return false; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return -EIO; - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - ret = 1; + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); - wake_up(&j->wait); + if (j->res_get_blocked_start) + __bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); + j->res_get_blocked_start = 0; - if (j->res_get_blocked_start) { - __bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - } - - mod_delayed_work(system_freezable_wq, - &j->write_work, - msecs_to_jiffies(j->write_delay_ms)); - } - - return ret; + mod_delayed_work(system_freezable_wq, + &j->write_work, + msecs_to_jiffies(j->write_delay_ms)); + wake_up(&j->wait); + return 1; } void bch2_journal_start(struct bch_fs *c) @@ -1438,14 +1459,15 @@ void bch2_journal_start(struct bch_fs *c) set_bit(JOURNAL_STARTED, &j->flags); while (atomic64_read(&j->seq) < new_seq) - __journal_entry_new(j, 0); + journal_pin_new_entry(j, 0); /* * journal_buf_switch() only inits the next journal entry when it * closes an open journal entry - the very first journal entry gets * initialized here: */ - __bch2_journal_next_entry(j); + journal_pin_new_entry(j, 1); + bch2_journal_buf_init(j); /* * Adding entries to the next journal entry before allocating space on @@ -1476,7 +1498,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) struct bkey_i *k, *_n; struct jset_entry *entry; struct journal_replay *i, *n; - int ret = 0, did_replay = 0; + int ret = 0; list_for_each_entry_safe(i, n, list, list) { j->replay_pin_list = @@ -1514,7 +1536,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) } cond_resched(); - did_replay = true; } if (atomic_dec_and_test(&j->replay_pin_list->count)) @@ -1524,22 +1545,7 @@ int bch2_journal_replay(struct bch_fs 
*c, struct list_head *list) j->replay_pin_list = NULL; bch2_journal_set_replay_done(j); - - if (did_replay) { - bch2_journal_flush_pins(&c->journal, U64_MAX); - - /* - * Write a new journal entry _before_ we start journalling new data - - * otherwise, we could end up with btree node bsets with journal seqs - * arbitrarily far in the future vs. the most recently written journal - * entry on disk, if we crash before writing the next journal entry: - */ - ret = bch2_journal_meta(j); - if (ret) { - bch_err(c, "journal replay: error %d flushing journal", ret); - goto err; - } - } + ret = bch2_journal_flush_all_pins(j); err: bch2_journal_entries_free(list); return ret; @@ -1654,7 +1660,7 @@ err: return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca) { unsigned nr; @@ -1670,7 +1676,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 10, (1 << 20) / ca->mi.bucket_size)); - return bch2_set_nr_journal_buckets(ca->fs, ca, nr); + return bch2_set_nr_journal_buckets(c, ca, nr); } /* Journalling */ @@ -1723,6 +1729,7 @@ static inline void __journal_pin_add(struct journal *j, list_add(&pin->list, &pin_list->list); else INIT_LIST_HEAD(&pin->list); + wake_up(&j->wait); } static void journal_pin_add_entry(struct journal *j, @@ -1730,9 +1737,9 @@ static void journal_pin_add_entry(struct journal *j, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); } void bch2_journal_pin_add(struct journal *j, @@ -1744,9 +1751,9 @@ void bch2_journal_pin_add(struct journal *j, ? journal_seq_pin(j, res->seq) : j->replay_pin_list; - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); } static inline bool __journal_pin_drop(struct journal *j, @@ -1766,13 +1773,12 @@ static inline bool __journal_pin_drop(struct journal *j, void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - unsigned long flags; bool wakeup = false; - spin_lock_irqsave(&j->pin_lock, flags); + spin_lock(&j->lock); if (journal_pin_active(pin)) wakeup = __journal_pin_drop(j, pin); - spin_unlock_irqrestore(&j->pin_lock, flags); + spin_unlock(&j->lock); /* * Unpinning a journal entry make make journal_next_bucket() succeed, if @@ -1789,7 +1795,7 @@ void bch2_journal_pin_add_if_older(struct journal *j, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); if (journal_pin_active(src_pin) && (!journal_pin_active(pin) || @@ -1800,24 +1806,19 @@ void bch2_journal_pin_add_if_older(struct journal *j, __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); } static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; + struct journal_entry_pin *ret; unsigned iter; - /* so we don't iterate over empty fifo entries below: */ - if (!atomic_read(&fifo_peek_front(&j->pin).count)) { - spin_lock(&j->lock); - journal_reclaim_fast(j); - spin_unlock(&j->lock); - } + /* no need to iterate over empty fifo entries: */ + journal_reclaim_fast(j); - spin_lock_irq(&j->pin_lock); 
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { if (journal_pin_seq(j, pin_list) > seq_to_flush) break; @@ -1828,71 +1829,82 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) /* must be list_del_init(), see bch2_journal_pin_drop() */ list_move(&ret->list, &pin_list->flushed); *seq = journal_pin_seq(j, pin_list); - break; + return ret; } } - spin_unlock_irq(&j->pin_lock); - return ret; + return NULL; } -static bool journal_flush_done(struct journal *j, u64 seq_to_flush) +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { - bool ret; + struct journal_entry_pin *ret; spin_lock(&j->lock); - journal_reclaim_fast(j); - - ret = (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1) || - last_seq(j) > seq_to_flush; + ret = __journal_get_next_pin(j, seq_to_flush, seq); spin_unlock(&j->lock); return ret; } -void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + struct journal_entry_pin **pin, + u64 *pin_seq) { - struct journal_entry_pin *pin; - u64 pin_seq; + int ret; - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return; + *pin = NULL; - while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) - pin->flush(j, pin, pin_seq); + ret = bch2_journal_error(j); + if (ret) + return ret; + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers and thus this would - * deadlock: + * hold refs on their corresponding sequence numbers */ - if (!test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - return; + ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) || + !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + last_seq(j) > seq_to_flush || + (fifo_used(&j->pin) == 1 && + atomic_read(&fifo_peek_front(&j->pin).count) == 1); + spin_unlock(&j->lock); - wait_event(j->wait, - journal_flush_done(j, seq_to_flush) || - bch2_journal_error(j)); + return ret; } -int bch2_journal_flush_all_pins(struct journal *j) +int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin *pin; + u64 pin_seq; bool flush; if (!test_bit(JOURNAL_STARTED, &j->flags)) return 0; - - bch2_journal_flush_pins(j, U64_MAX); +again: + wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); + if (pin) { + /* flushing a journal pin might cause a new one to be added: */ + pin->flush(j, pin, pin_seq); + goto again; + } spin_lock(&j->lock); flush = last_seq(j) != j->last_seq_ondisk || - c->btree_roots_dirty; + (seq_to_flush == U64_MAX && c->btree_roots_dirty); spin_unlock(&j->lock); return flush ? 
bch2_journal_meta(j) : 0; } +int bch2_journal_flush_all_pins(struct journal *j) +{ + return bch2_journal_flush_pins(j, U64_MAX); +} + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { bool ret; @@ -2179,14 +2191,15 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&w->key); + struct bch_devs_list devs = + bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); - if (!bch2_extent_nr_ptrs(e)) { + if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); goto err; } - if (bch2_check_mark_super(c, e, BCH_DATA_JOURNAL)) + if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs)) goto err; out: __bch2_time_stats_update(j->write_time, j->write_start_time); @@ -2194,8 +2207,7 @@ out: spin_lock(&j->lock); j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); - journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs; /* * Updating last_seq_ondisk may let journal_reclaim_work() discard more @@ -2358,7 +2370,7 @@ static void journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) ptr->offset += sectors; continue_at(cl, journal_write_done, system_highpri_wq); @@ -2737,7 +2749,9 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) seq = journal_pin_seq(j, p); spin_unlock(&j->lock); - bch2_journal_flush_pins(j, seq); + ret = bch2_journal_flush_pins(j, seq); + if (ret) + return ret; mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); @@ -2751,7 +2765,7 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) seq++; spin_unlock(&j->lock); - ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL); + ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs); spin_lock(&j->lock); } spin_unlock(&j->lock); @@ -2857,7 +2871,6 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); @@ -2956,7 +2969,7 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) ssize_t ret = 0; unsigned i; - spin_lock_irq(&j->pin_lock); + spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%llu: count %u\n", @@ -2977,7 +2990,7 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) "\t%p %pf\n", pin, pin->flush); } - spin_unlock_irq(&j->pin_lock); + spin_unlock(&j->lock); return ret; } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 61197e57..5abf356e 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -165,7 +165,7 @@ void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); -void bch2_journal_flush_pins(struct journal *, u64); +int bch2_journal_flush_pins(struct journal *, u64); int bch2_journal_flush_all_pins(struct journal *); struct closure; @@ -390,7 +390,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct 
journal *, char *); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 66923cf4..5eea6579 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -169,12 +169,6 @@ struct journal { DECLARE_FIFO(struct journal_entry_pin_list, pin); struct journal_entry_pin_list *replay_pin_list; - /* - * Protects the pin lists - the fifo itself is still protected by - * j->lock though: - */ - spinlock_t pin_lock; - struct mutex blacklist_lock; struct list_head seq_blacklist; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 328316a1..2033db81 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -16,13 +16,8 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e) { struct bch_dev *ca = arg; - const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) - if (ptr->dev == ca->dev_idx) - return true; - - return false; + return bch2_extent_has_device(e, ca->dev_idx); } #define MAX_DATA_OFF_ITER 10 @@ -32,30 +27,17 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, { struct btree_iter iter; struct bkey_s_c k; - u64 keys_moved, sectors_moved; + struct bch_move_stats stats; unsigned pass = 0; int ret = 0; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER))) return 0; /* - * In theory, only one pass should be necessary as we've - * quiesced all writes before calling this. - * - * However, in practice, more than one pass may be necessary: - * - Some move fails due to an error. We can can find this out - * from the moving_context. - * - Some key swap failed because some of the pointers in the - * key in the tree changed due to caching behavior, btree gc - * pruning stale pointers, or tiering (if the device being - * removed is in tier 0). A smarter bkey_cmpxchg would - * handle these cases. - * - * Thus this scans the tree one more time than strictly necessary, - * but that can be viewed as a verification pass. 
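The migrate.c hunk that follows replaces the long rationale above with a terser XXX comment, but the mechanism is unchanged: user data is drained with a bounded retry loop, re-scanning while the previous pass still moved keys and giving up after MAX_DATA_OFF_ITER attempts. A compact sketch of that shape (not part of the diff; move_pass() is a hypothetical stand-in for a bch2_move_data() call plus its stats):

	/* illustrative sketch only -- not part of the patch */
	#define MAX_DATA_OFF_ITER	10

	/* returns keys moved by one pass, or a negative error */
	typedef long long (*move_pass_fn)(void *dev);

	static int drain_device(move_pass_fn move_pass, void *dev)
	{
		unsigned pass = 0;
		long long moved;

		do {
			moved = move_pass(dev);
			if (moved < 0)
				return (int) moved;	/* hard error from the pass */
		} while (moved && pass++ < MAX_DATA_OFF_ITER);

		return moved ? -1 : 0;	/* -1: data still left after all passes */
	}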
+ * XXX: we should be able to do this in one pass, but bch2_move_data() + * can spuriously fail to move an extent due to racing with other move + * operations */ do { ret = bch2_move_data(c, NULL, @@ -65,15 +47,14 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, 0, ca->dev_idx, migrate_pred, ca, - &keys_moved, - §ors_moved); + &stats); if (ret) { bch_err(c, "error migrating data: %i", ret); return ret; } - } while (keys_moved && pass++ < MAX_DATA_OFF_ITER); + } while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER); - if (keys_moved) { + if (atomic64_read(&stats.keys_moved)) { bch_err(c, "unable to migrate all data in %d iterations", MAX_DATA_OFF_ITER); return -1; @@ -83,11 +64,7 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { - if (!bkey_extent_is_data(k.k)) - continue; - - ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); + ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k)); if (ret) { bch_err(c, "error migrating data %i from check_mark_super()", ret); break; @@ -99,107 +76,34 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca, return ret; } -static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca, - enum btree_id id) -{ - struct btree_iter iter; - struct btree *b; - int ret; - - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - - if (!bch2_extent_has_device(e, ca->dev_idx)) - continue; - - ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0); - if (ret) { - bch2_btree_iter_unlock(&iter); - return ret; - } - - bch2_btree_iter_set_locks_want(&iter, 0); - } - ret = bch2_btree_iter_unlock(&iter); - if (ret) - return ret; /* btree IO error */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - - BUG_ON(bch2_extent_has_device(e, ca->dev_idx)); - } - bch2_btree_iter_unlock(&iter); - } - - return 0; -} - -/* - * This moves only the meta-data off, leaving the data (if any) in place. - * The data is moved off by bch_move_data_off_device, if desired, and - * called first. - * - * Before calling this, allocation of buckets to the device must have - * been disabled, as else we'll continue to write meta-data to the device - * when new buckets are picked for meta-data writes. - * In addition, the copying gc and allocator threads for the device - * must have been stopped. The allocator thread is the only thread - * that writes prio/gen information. - * - * Meta-data consists of: - * - Btree nodes - * - Prio/gen information - * - Journal entries - * - Superblock - * - * This has to move the btree nodes and the journal only: - * - prio/gen information is not written once the allocator thread is stopped. - * also, as the prio/gen information is per-device it is not moved. - * - the superblock will be written by the caller once after everything - * is stopped. - * - * Note that currently there is no way to stop btree node and journal - * meta-data writes to a device without moving the meta-data because - * once a bucket is open for a btree node, unless a replacement btree - * node is allocated (and the tree updated), the bucket will continue - * to be written with updates. 
Similarly for the journal (it gets - * written until filled). - * - * This routine leaves the data (if any) in place. Whether the data - * should be moved off is a decision independent of whether the meta - * data should be moved off and stopped: - * - * - For device removal, both data and meta-data are moved off, in - * that order. - * - * - However, for turning a device read-only without removing it, only - * meta-data is moved off since that's the only way to prevent it - * from being written. Data is left in the device, but no new data - * is written. - */ - static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca, int flags) { - unsigned i; + struct btree_iter iter; + struct btree *b; int ret = 0; + unsigned id; - BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - - if (!(bch2_dev_has_data(c, ca) & - ((1 << BCH_DATA_JOURNAL)| - (1 << BCH_DATA_BTREE)))) + if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE))) return 0; mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); - for (i = 0; i < BTREE_ID_NR; i++) { - ret = bch2_move_btree_off(c, ca, i); + for (id = 0; id < BTREE_ID_NR; id++) { + for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); + + if (!bch2_extent_has_device(e, ca->dev_idx)) + continue; + + ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0); + if (ret) { + bch2_btree_iter_unlock(&iter); + goto err; + } + } + ret = bch2_btree_iter_unlock(&iter); if (ret) goto err; } @@ -211,6 +115,9 @@ err: int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags) { + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW && + bch2_dev_is_online(ca)); + return bch2_dev_usrdata_migrate(c, ca, flags) ?: bch2_dev_metadata_migrate(c, ca, flags); } @@ -233,17 +140,6 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, return 0; } -/* - * This doesn't actually move any data -- it marks the keys as bad - * if they contain a pointer to a device that is forcibly removed - * and don't have other valid pointers. If there are valid pointers, - * the necessary pointers to the removed device are replaced with - * bad pointers instead. - * - * This is only called if bch_move_data_off_device above failed, meaning - * that we've already tried to move the data MAX_DATA_OFF_ITER times and - * are not likely to succeed if we try again. 
- */ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct bkey_s_c k; @@ -260,11 +156,15 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = btree_iter_err(k))) { - if (!bkey_extent_is_data(k.k)) - goto advance; - - if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) - goto advance; + if (!bkey_extent_is_data(k.k) || + !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_bkey_devs(k)); + if (ret) + break; + bch2_btree_iter_advance_pos(&iter); + continue; + } bkey_reassemble(&tmp.key, k); e = bkey_i_to_s_extent(&tmp.key); @@ -280,8 +180,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) */ bch2_extent_normalize(c, e.s); - if (bkey_extent_is_data(e.k) && - (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER))) + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_bkey_devs(bkey_i_to_s_c(&tmp.key))); + if (ret) break; iter.pos = bkey_start_pos(&tmp.key.k); @@ -300,16 +201,6 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = 0; if (ret) break; - - continue; -advance: - if (bkey_extent_is_data(k.k)) { - ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); - if (ret) - break; - } - bch2_btree_iter_advance_pos(&iter); } bch2_btree_iter_unlock(&iter); @@ -346,8 +237,8 @@ retry: dev_idx)) { bch2_btree_iter_set_locks_want(&iter, 0); - ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), - BCH_DATA_BTREE); + ret = bch2_check_mark_super(c, BCH_DATA_BTREE, + bch2_bkey_devs(bkey_i_to_s_c(&b->key))); if (ret) goto err; } else { diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a3de3b05..7c7f436c 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -31,15 +31,10 @@ struct moving_context { /* Closure for waiting on all reads and writes to complete */ struct closure cl; - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - atomic64_t sectors_raced; + struct bch_move_stats *stats; struct list_head reads; - atomic_t sectors_in_flight; - wait_queue_head_t wait; }; @@ -116,8 +111,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_extent_normalize(c, extent_i_to_s(insert).s); bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert)); - ret = bch2_check_mark_super(c, extent_i_to_s_c(insert), - BCH_DATA_USER); + ret = bch2_check_mark_super(c, BCH_DATA_USER, + bch2_extent_devs(extent_i_to_s_c(insert))); if (ret) break; @@ -145,7 +140,7 @@ next: nomatch: if (m->ctxt) atomic64_add(k.k->p.offset - iter.pos.offset, - &m->ctxt->sectors_raced); + &m->ctxt->stats->sectors_raced); atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); bch2_btree_iter_advance_pos(&iter); @@ -303,8 +298,8 @@ static int bch2_move_extent(struct bch_fs *c, io->write.op.devs = devs; io->write.op.write_point = wp; - ctxt->keys_moved++; - ctxt->sectors_moved += k.k->size; + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); trace_move_extent(k.k); @@ -353,24 +348,6 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->sectors_in_flight) != sectors_pending); } -static void bch2_move_ctxt_exit(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); - closure_sync(&ctxt->cl); - - EBUG_ON(!list_empty(&ctxt->reads)); - 
EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); -} - -static void bch2_move_ctxt_init(struct moving_context *ctxt) -{ - memset(ctxt, 0, sizeof(*ctxt)); - closure_init_stack(&ctxt->cl); - - INIT_LIST_HEAD(&ctxt->reads); - init_waitqueue_head(&ctxt->wait); -} - int bch2_move_data(struct bch_fs *c, struct bch_ratelimit *rate, unsigned sectors_in_flight, @@ -379,20 +356,21 @@ int bch2_move_data(struct bch_fs *c, int btree_insert_flags, int move_device, move_pred_fn pred, void *arg, - u64 *keys_moved, - u64 *sectors_moved) + struct bch_move_stats *stats) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct moving_context ctxt; + struct moving_context ctxt = { .stats = stats }; struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter; BKEY_PADDED(k) tmp; struct bkey_s_c k; u64 cur_inum = U64_MAX; int ret = 0; - bch2_move_ctxt_init(&ctxt); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + memset(stats, 0, sizeof(*stats)); + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); if (rate) @@ -400,7 +378,7 @@ int bch2_move_data(struct bch_fs *c, while (!kthread || !(ret = kthread_should_stop())) { if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) { - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&stats->iter); move_ctxt_wait_event(&ctxt, atomic_read(&ctxt.sectors_in_flight) < sectors_in_flight); @@ -408,11 +386,11 @@ int bch2_move_data(struct bch_fs *c, if (rate && bch2_ratelimit_delay(rate) && - (bch2_btree_iter_unlock(&iter), + (bch2_btree_iter_unlock(&stats->iter), (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) break; peek: - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(&stats->iter); if (!k.k) break; ret = btree_iter_err(k); @@ -420,13 +398,13 @@ peek: break; if (!bkey_extent_is_data(k.k)) - goto next; + goto next_nondata; if (cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; /* don't hold btree locks while looking up inode: */ - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&stats->iter); opts = bch2_opts_to_inode_opts(c->opts); if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) @@ -441,7 +419,7 @@ peek: /* unlock before doing IO: */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_btree_iter_unlock(&stats->iter); if (bch2_move_extent(c, &ctxt, devs, wp, btree_insert_flags, @@ -454,17 +432,24 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - bch2_btree_iter_advance_pos(&iter); - bch2_btree_iter_cond_resched(&iter); + atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k), + &stats->sectors_seen); +next_nondata: + bch2_btree_iter_advance_pos(&stats->iter); + bch2_btree_iter_cond_resched(&stats->iter); } - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); + bch2_btree_iter_unlock(&stats->iter); - trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved); + move_ctxt_wait_event(&ctxt, !atomic_read(&ctxt.sectors_in_flight)); + closure_sync(&ctxt.cl); - *keys_moved = ctxt.keys_moved; - *sectors_moved = ctxt.sectors_moved; + EBUG_ON(!list_empty(&ctxt.reads)); + EBUG_ON(atomic_read(&ctxt.sectors_in_flight)); + + trace_move_data(c, + atomic64_read(&stats->sectors_moved), + atomic64_read(&stats->keys_moved)); return ret; } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 2e884ce0..24d6ddfa 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -1,6 +1,7 @@ #ifndef 
_BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "btree_iter.h" #include "buckets.h" #include "io_types.h" @@ -25,10 +26,19 @@ void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *); typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent); +struct bch_move_stats { + struct btree_iter iter; + + atomic64_t keys_moved; + atomic64_t sectors_moved; + atomic64_t sectors_seen; + atomic64_t sectors_raced; +}; + int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, unsigned, struct bch_devs_mask *, struct write_point_specifier, int, int, move_pred_fn, void *, - u64 *, u64 *); + struct bch_move_stats *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 90eb4ca2..d6f2968e 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -100,7 +100,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) copygc_heap *h = &ca->copygc_heap; struct copygc_heap_entry e, *i; struct bucket_array *buckets; - u64 keys_moved, sectors_moved; + struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_not_moved = 0; u64 buckets_to_move, buckets_not_moved = 0; size_t b; @@ -167,8 +167,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) BTREE_INSERT_USE_RESERVE, ca->dev_idx, copygc_pred, ca, - &keys_moved, - §ors_moved); + &move_stats); down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -189,7 +188,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) buckets_not_moved, buckets_to_move); trace_copygc(ca, - sectors_moved, sectors_not_moved, + atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); } diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index e6833d95..eae63cf8 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -167,6 +167,27 @@ int bch2_opt_lookup(const char *name) return -1; } +struct synonym { + const char *s1, *s2; +}; + +static const struct synonym bch_opt_synonyms[] = { + { "quota", "usrquota" }, +}; + +static int bch2_mount_opt_lookup(const char *name) +{ + const struct synonym *i; + + for (i = bch_opt_synonyms; + i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + i++) + if (!strcmp(name, i->s1)) + name = i->s2; + + return bch2_opt_lookup(name); +} + int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res) { ssize_t ret; @@ -211,7 +232,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) val = opt; if (val) { - id = bch2_opt_lookup(name); + id = bch2_mount_opt_lookup(name); if (id < 0) goto bad_opt; @@ -219,12 +240,12 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) if (ret < 0) goto bad_val; } else { - id = bch2_opt_lookup(name); + id = bch2_mount_opt_lookup(name); v = 1; if (id < 0 && !strncmp("no", name, 2)) { - id = bch2_opt_lookup(name + 2); + id = bch2_mount_opt_lookup(name + 2); v = 0; } @@ -242,6 +263,11 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) goto bad_opt; + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + bch2_opt_set_by_id(opts, id, v); } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 126056e6..5d42dd5f 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -112,6 +112,15 @@ enum opt_type { BCH_OPT(acl, u8, OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_POSIX_ACL, true) \ + BCH_OPT(usrquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false) \ + BCH_OPT(grpquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + 
BCH_SB_GRPQUOTA, false) \ + BCH_OPT(prjquota, u8, OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false) \ BCH_OPT(degraded, u8, OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false) \ @@ -171,7 +180,7 @@ static const struct bch_opts bch2_opts_default = { #define opt_defined(_opts, _name) ((_opts)._name##_defined) #define opt_get(_opts, _name) \ - (opt_defined(_opts, _name) ? _opts._name : bch2_opts_default._name) + (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) #define opt_set(_opts, _name, _v) \ do { \ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c new file mode 100644 index 00000000..c550fd9e --- /dev/null +++ b/libbcachefs/quota.c @@ -0,0 +1,786 @@ +#include "bcachefs.h" +#include "btree_update.h" +#include "inode.h" +#include "quota.h" +#include "super-io.h" + +static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_quota dq; + + if (k.k->p.inode >= QTYP_NR) + return "invalid quota type"; + + switch (k.k->type) { + case BCH_QUOTA: { + dq = bkey_s_c_to_quota(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) + return "incorrect value size"; + + return NULL; + } + default: + return "invalid type"; + } +} + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +static void bch2_quota_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) +{ + char *out = buf, *end= buf + size; + struct bkey_s_c_quota dq; + unsigned i; + + switch (k.k->type) { + case BCH_QUOTA: + dq = bkey_s_c_to_quota(k); + + for (i = 0; i < Q_COUNTERS; i++) + out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); + break; + } +} + +const struct bkey_ops bch2_bkey_quota_ops = { + .key_invalid = bch2_quota_invalid, + .val_to_text = bch2_quota_to_text, +}; + +#ifdef CONFIG_BCACHEFS_QUOTA + +#include +#include +#include + +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; + return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; +} + +#define for_each_set_qtype(_c, _i, _q, _qtypes) \ + for (_i = 0; \ + (_i = __next_qtype(_i, _qtypes), \ + _q = &(_c)->quotas[_i], \ + _i < QTYP_NR); \ + _i++) + +static inline unsigned enabled_qtypes(struct bch_fs *c) +{ + return ((c->opts.usrquota << QTYP_USR)| + (c->opts.grpquota << QTYP_GRP)| + (c->opts.prjquota << QTYP_PRJ)); +} + +static bool ignore_hardlimit(struct bch_memquota_type *q) +{ + if (capable(CAP_SYS_RESOURCE)) + return true; +#if 0 + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & DQF_ROOT_SQUASH)); +#endif + return false; +} + +enum quota_msg { + SOFTWARN, /* Softlimit reached */ + SOFTLONGWARN, /* Grace time expired */ + HARDWARN, /* Hardlimit reached */ + + HARDBELOW, /* Usage got below inode hardlimit */ + SOFTBELOW, /* Usage got below inode softlimit */ +}; + +static int quota_nl[][Q_COUNTERS] = { + [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, + [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, + [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, + [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, + [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, + + [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, + [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, + [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, + [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, + [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, +}; + +struct quota_msgs { + u8 nr; + struct { + u8 qtype; + u8 msg; + } m[QTYP_NR * Q_COUNTERS]; +}; + +static void prepare_msg(unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); + + msgs->m[msgs->nr].qtype = qtype; + msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; + msgs->nr++; +} + +static void prepare_warning(struct memquota_counter *qc, + unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + if (qc->warning_issued & (1 << msg_type)) + return; + + prepare_msg(qtype, counter, msgs, msg_type); +} + +static void flush_warnings(struct bch_qid qid, + struct super_block *sb, + struct quota_msgs *msgs) +{ + unsigned i; + + for (i = 0; i < msgs->nr; i++) + quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), + sb->s_dev, msgs->m[i].msg); +} + +static int bch2_quota_check_limit(struct bch_fs *c, + unsigned qtype, + struct bch_memquota *mq, + struct quota_msgs *msgs, + enum quota_counters counter, + s64 v, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q = &c->quotas[qtype]; + struct memquota_counter *qc = &mq->c[counter]; + u64 n = qc->v + v; + + BUG_ON((s64) n < 0); + + if (mode == BCH_QUOTA_NOCHECK) + return 0; + + if (v <= 0) { + if (n < qc->hardlimit && + (qc->warning_issued & (1 << HARDWARN))) { + qc->warning_issued &= ~(1 << HARDWARN); + prepare_msg(qtype, counter, msgs, HARDBELOW); + } + + if (n < qc->softlimit && + (qc->warning_issued & (1 << SOFTWARN))) { + qc->warning_issued &= ~(1 << SOFTWARN); + prepare_msg(qtype, counter, msgs, SOFTBELOW); + } + + qc->warning_issued = 0; + return 0; + } + + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, qtype, counter, msgs, HARDWARN); + } + + if (qc->softlimit && + qc->softlimit < n && + qc->timer && + ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, 
qtype, counter, msgs, SOFTLONGWARN); + } + + if (qc->softlimit && + qc->softlimit < n && + qc->timer == 0) { + if (mode == BCH_QUOTA_PREALLOC) + return -EDQUOT; + + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + + /* XXX is this the right one? */ + qc->timer = ktime_get_real_seconds() + + q->limits[counter].warnlimit; + } + + return 0; +} + +int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + unsigned qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct bch_memquota *mq[QTYP_NR]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock(&q->lock); + + for_each_set_qtype(c, i, q, qtypes) { + mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); + if (!mq[i]) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) + mq[i]->c[counter].v += v; +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(qid, c->vfs_sb, &msgs); + + return ret; +} + +static void __bch2_quota_transfer(struct bch_memquota *src_q, + struct bch_memquota *dst_q, + enum quota_counters counter, s64 v) +{ + BUG_ON(v > src_q->c[counter].v); + BUG_ON(v + dst_q->c[counter].v < v); + + src_q->c[counter].v -= v; + dst_q->c[counter].v += v; +} + +int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space) +{ + struct bch_memquota_type *q; + struct bch_memquota *src_q[3], *dst_q[3]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + qtypes &= enabled_qtypes(c); + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock(&q->lock); + + for_each_set_qtype(c, i, q, qtypes) { + src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); + dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); + + if (!src_q[i] || !dst_q[i]) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, + dst_q[i]->c[Q_SPC].v + space, + BCH_QUOTA_PREALLOC); + if (ret) + goto err; + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, + dst_q[i]->c[Q_INO].v + 1, + BCH_QUOTA_PREALLOC); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) { + __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); + __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); + } + +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(dst, c->vfs_sb, &msgs); + + return ret; +} + +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; + struct bch_memquota *mq; + unsigned i; + + BUG_ON(k.k->p.inode >= QTYP_NR); + + switch (k.k->type) { + case BCH_QUOTA: + dq = bkey_s_c_to_quota(k); + q = &c->quotas[k.k->p.inode]; + + mutex_lock(&q->lock); + mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); + if (!mq) { + mutex_unlock(&q->lock); + return -ENOMEM; + } + + for (i = 0; i < Q_COUNTERS; i++) { + mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + + mutex_unlock(&q->lock); + } + + return 0; +} + +static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0), + 
BTREE_ITER_PREFETCH, k) { + if (k.k->p.inode != type) + break; + + ret = __bch2_quota_set(c, k); + if (ret) + break; + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + genradix_free(&c->quotas[i].table); +} + +void bch2_fs_quota_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + mutex_init(&c->quotas[i].lock); +} + +static void bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + unsigned i, j; + + sb_quota = bch2_sb_get_quota(c->disk_sb); + if (!sb_quota) + return; + + for (i = 0; i < QTYP_NR; i++) { + struct bch_memquota_type *q = &c->quotas[i]; + + for (j = 0; j < Q_COUNTERS; j++) { + q->limits[j].timelimit = + le32_to_cpu(sb_quota->q[i].c[j].timelimit); + q->limits[j].warnlimit = + le32_to_cpu(sb_quota->q[i].c[j].warnlimit); + } + } +} + +int bch2_fs_quota_read(struct bch_fs *c) +{ + unsigned i, qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct btree_iter iter; + struct bch_inode_unpacked u; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_init_type(c, i); + if (ret) + return ret; + } + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, + BTREE_ITER_PREFETCH, k) { + switch (k.k->type) { + case BCH_INODE_FS: + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + BCH_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + BCH_QUOTA_NOCHECK); + } + } + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ + +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + /* Accounting must be enabled at mount time: */ + if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) + return -EINVAL; + + /* Can't enable enforcement without accounting: */ + if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) + return -EINVAL; + + if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) + return -EINVAL; + + if (uflags & FS_QUOTA_PDQ_ENFD) + return -EINVAL; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb, true); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb, true); +#if 0 + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb, true); +#endif + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb, false); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb, false); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_remove(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (uflags & FS_USER_QUOTA) { + if (c->opts.usrquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 
0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + if (uflags & FS_GROUP_QUOTA) { + if (c->opts.grpquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + if (uflags & FS_PROJ_QUOTA) { + if (c->opts.prjquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), + ZERO_VERSION, NULL, NULL, NULL); + if (ret) + return ret; + } + + return 0; +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. + */ +static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct bch_fs *c = sb->s_fs_info; + unsigned qtypes = enabled_qtypes(c); + unsigned i; + + memset(state, 0, sizeof(*state)); + + for (i = 0; i < QTYP_NR; i++) { + state->s_state[i].flags |= QCI_SYSFILE; + + if (!(qtypes & (1 << i))) + continue; + + state->s_state[i].flags |= QCI_ACCT_ENABLED; + + state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; + state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; + + state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; + state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; + } + + return 0; +} + +/* + * Adjust quota timers & warnings + */ +static int bch2_quota_set_info(struct super_block *sb, int type, + struct qc_info *info) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + struct bch_memquota_type *q; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (type >= QTYP_NR) + return -EINVAL; + + if (!((1 << type) & enabled_qtypes(c))) + return -ESRCH; + + if (info->i_fieldmask & + ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) + return -EINVAL; + + q = &c->quotas[type]; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_quota(c->disk_sb); + if (!sb_quota) { + sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64)); + if (!sb_quota) + return -ENOSPC; + } + + if (info->i_fieldmask & QC_SPC_TIMER) + sb_quota->q[type].c[Q_SPC].timelimit = + cpu_to_le32(info->i_spc_timelimit); + + if (info->i_fieldmask & QC_SPC_WARNS) + sb_quota->q[type].c[Q_SPC].warnlimit = + cpu_to_le32(info->i_spc_warnlimit); + + if (info->i_fieldmask & QC_INO_TIMER) + sb_quota->q[type].c[Q_INO].timelimit = + cpu_to_le32(info->i_ino_timelimit); + + if (info->i_fieldmask & QC_INO_WARNS) + sb_quota->q[type].c[Q_INO].warnlimit = + cpu_to_le32(info->i_ino_warnlimit); + + bch2_sb_quota_read(c); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* Get/set individual quotas: */ + +static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) +{ + dst->d_space = src->c[Q_SPC].v << 9; + dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; + dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; + dst->d_spc_timer = src->c[Q_SPC].timer; + dst->d_spc_warns = src->c[Q_SPC].warns; + + dst->d_ino_count = src->c[Q_INO].v; + dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; + dst->d_ino_softlimit = src->c[Q_INO].softlimit; + dst->d_ino_timer = src->c[Q_INO].timer; + dst->d_ino_warns = src->c[Q_INO].warns; +} + +static int bch2_get_quota(struct super_block *sb, struct kqid kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid.type]; + qid_t qid = from_kqid(&init_user_ns, kqid); + struct bch_memquota *mq; + + memset(qdq, 0, 
sizeof(*qdq)); + + mutex_lock(&q->lock); + mq = genradix_ptr(&q->table, qid); + if (mq) + __bch2_quota_get(qdq, mq); + mutex_unlock(&q->lock); + + return 0; +} + +static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid->type]; + qid_t qid = from_kqid(&init_user_ns, *kqid); + struct genradix_iter iter = genradix_iter_init(&q->table, qid); + struct bch_memquota *mq; + int ret = 0; + + mutex_lock(&q->lock); + + while ((mq = genradix_iter_peek(&iter, &q->table))) { + if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { + __bch2_quota_get(qdq, mq); + *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); + goto found; + } + + genradix_iter_advance(&iter, &q->table); + } + + ret = -ENOENT; +found: + mutex_unlock(&q->lock); + return ret; +} + +static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_quota new_quota; + int ret; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + + bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p, + BTREE_ITER_WITH_HOLES|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_with_holes(&iter); + + ret = btree_iter_err(k); + if (unlikely(ret)) + return ret; + + switch (k.k->type) { + case BCH_QUOTA: + new_quota.v = *bkey_s_c_to_quota(k).v; + break; + } + + if (qdq->d_fieldmask & QC_SPC_SOFT) + new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit); + if (qdq->d_fieldmask & QC_SPC_HARD) + new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit); + + if (qdq->d_fieldmask & QC_INO_SOFT) + new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_spc_softlimit); + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit); + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); + bch2_btree_iter_unlock(&iter); + + if (ret) + return ret; + + ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); + + return ret; +} + +const struct quotactl_ops bch2_quotactl_operations = { + .quota_enable = bch2_quota_enable, + .quota_disable = bch2_quota_disable, + .rm_xquota = bch2_quota_remove, + + .get_state = bch2_quota_get_state, + .set_info = bch2_quota_set_info, + + .get_dqblk = bch2_get_quota, + .get_nextdqblk = bch2_get_next_quota, + .set_dqblk = bch2_set_quota, +}; + +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h new file mode 100644 index 00000000..09d51a83 --- /dev/null +++ b/libbcachefs/quota.h @@ -0,0 +1,48 @@ +#ifndef _BCACHEFS_QUOTA_H +#define _BCACHEFS_QUOTA_H + +#include "quota_types.h" + +extern const struct bkey_ops bch2_bkey_quota_ops; + +enum quota_acct_mode { + BCH_QUOTA_PREALLOC, + BCH_QUOTA_WARN, + BCH_QUOTA_NOCHECK, +}; + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ + return (struct bch_qid) { + .q[QTYP_USR] = u->bi_uid, + .q[QTYP_GRP] = u->bi_gid, + .q[QTYP_PRJ] = u->bi_project, + }; +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, + s64, enum quota_acct_mode); + +int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, + struct bch_qid, u64); + +void bch2_fs_quota_exit(struct bch_fs *); +void bch2_fs_quota_init(struct bch_fs *); +int 
bch2_fs_quota_read(struct bch_fs *); + +extern const struct quotactl_ops bch2_quotactl_operations; + +#else + +#define bch2_quota_acct(_c, _uid, _gid, _counter, _v) (0) +#define bch2_quota_transfer(_c, _type, _src, _dst, _v) (0) + +static inline void bch2_fs_quota_exit(struct bch_fs *c) {} +static inline void bch2_fs_quota_init(struct bch_fs *c) {} +static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } + +#endif + +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/libbcachefs/quota_types.h b/libbcachefs/quota_types.h new file mode 100644 index 00000000..bcaed4ea --- /dev/null +++ b/libbcachefs/quota_types.h @@ -0,0 +1,36 @@ +#ifndef _BCACHEFS_QUOTA_TYPES_H +#define _BCACHEFS_QUOTA_TYPES_H + +#include + +struct bch_qid { + u32 q[QTYP_NR]; +}; + +struct memquota_counter { + u64 v; + u64 hardlimit; + u64 softlimit; + s64 timer; + int warns; + int warning_issued; +}; + +struct bch_memquota { + struct memquota_counter c[Q_COUNTERS]; +}; + +typedef GENRADIX(struct bch_memquota) bch_memquota_table; + +struct quota_limit { + u32 timelimit; + u32 warnlimit; +}; + +struct bch_memquota_type { + struct quota_limit limits[Q_COUNTERS]; + bch_memquota_table table; + struct mutex lock; +}; + +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 21720186..8dce7dc1 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -330,9 +330,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) return "Btree node size not a power of two"; - if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX) - return "Btree node size too large"; - if (BCH_SB_GC_RESERVE(sb) < 5) return "gc reserve percentage too small"; @@ -383,27 +380,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) /* device open: */ -static const char *bch2_blkdev_open(const char *path, fmode_t mode, - void *holder, struct block_device **ret) -{ - struct block_device *bdev; - - *ret = NULL; - bdev = blkdev_get_by_path(path, mode, holder); - if (bdev == ERR_PTR(-EBUSY)) - return "device busy"; - - if (IS_ERR(bdev)) - return "failed to open device"; - - if (mode & FMODE_WRITE) - bdev_get_queue(bdev)->backing_dev_info->capabilities - |= BDI_CAP_STABLE_WRITES; - - *ret = bdev; - return NULL; -} - static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb; @@ -555,44 +531,55 @@ reread: return NULL; } -const char *bch2_read_super(const char *path, - struct bch_opts opts, - struct bch_sb_handle *ret) +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) { - u64 offset = opt_get(opts, sb); + u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; const char *err; - unsigned i; + __le64 *i; + int ret; - memset(ret, 0, sizeof(*ret)); - ret->mode = FMODE_READ; + memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; - if (!opt_get(opts, noexcl)) - ret->mode |= FMODE_EXCL; + if (!opt_get(*opts, noexcl)) + sb->mode |= FMODE_EXCL; - if (!opt_get(opts, nochanges)) - ret->mode |= FMODE_WRITE; + if (!opt_get(*opts, nochanges)) + sb->mode |= FMODE_WRITE; - err = bch2_blkdev_open(path, ret->mode, ret, &ret->bdev); - if (err) - return err; + sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { + sb->mode &= ~FMODE_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } + + if (IS_ERR(sb->bdev)) + return PTR_ERR(sb->bdev); 
err = "cannot allocate memory"; - if (__bch2_super_realloc(ret, 0)) + ret = __bch2_super_realloc(sb, 0); + if (ret) goto err; + ret = -EFAULT; err = "dynamic fault"; if (bch2_fs_init_fault("read_super")) goto err; - err = read_one_super(ret, offset); + ret = -EINVAL; + err = read_one_super(sb, offset); if (!err) goto got_super; - if (offset != BCH_SB_SECTOR) { - pr_err("error reading superblock: %s", err); + if (opt_defined(*opts, sb)) goto err; - } pr_err("error reading default superblock: %s", err); @@ -600,53 +587,57 @@ const char *bch2_read_super(const char *path, * Error reading primary superblock - read location of backup * superblocks: */ - bio_reset(ret->bio); - ret->bio->bi_bdev = ret->bdev; - ret->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - ret->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); - bio_set_op_attrs(ret->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bio_reset(sb->bio); + sb->bio->bi_bdev = sb->bdev; + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); /* * use sb buffer to read layout, since sb buffer is page aligned but * layout won't be: */ - bch2_bio_map(ret->bio, ret->sb); + bch2_bio_map(sb->bio, sb->sb); err = "IO error"; - if (submit_bio_wait(ret->bio)) + if (submit_bio_wait(sb->bio)) goto err; - memcpy(&layout, ret->sb, sizeof(layout)); + memcpy(&layout, sb->sb, sizeof(layout)); err = validate_sb_layout(&layout); if (err) goto err; - for (i = 0; i < layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout.sb_offset[i]); + for (i = layout.sb_offset; + i < layout.sb_offset + layout.nr_superblocks; i++) { + offset = le64_to_cpu(*i); - if (offset == BCH_SB_SECTOR) + if (offset == opt_get(*opts, sb)) continue; - err = read_one_super(ret, offset); + err = read_one_super(sb, offset); if (!err) goto got_super; } - goto err; -got_super: - pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", - le64_to_cpu(ret->sb->version), - le64_to_cpu(ret->sb->flags[0]), - le64_to_cpu(ret->sb->seq), - le32_to_cpu(ret->sb->u64s)); + ret = -EINVAL; + goto err; + +got_super: err = "Superblock block size smaller than device block size"; - if (le16_to_cpu(ret->sb->block_size) << 9 < - bdev_logical_block_size(ret->bdev)) + ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev)) goto err; - return NULL; + if (sb->mode & FMODE_WRITE) + bdev_get_queue(sb->bdev)->backing_dev_info->capabilities + |= BDI_CAP_STABLE_WRITES; + + return 0; err: - bch2_free_super(ret); - return err; + bch2_free_super(sb); + pr_err("error reading superblock: %s", err); + return ret; } /* write superblock: */ @@ -1108,13 +1099,20 @@ err: return ret; } -static inline int __bch2_check_mark_super(struct bch_fs *c, - struct bch_replicas_cpu_entry search, - unsigned max_dev) +int bch2_check_mark_super(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) { + struct bch_replicas_cpu_entry search; struct bch_replicas_cpu *r, *gc_r; + unsigned max_dev; bool marked; + if (!devs.nr) + return 0; + + devlist_to_replicas(devs, data_type, &search, &max_dev); + rcu_read_lock(); r = rcu_dereference(c->replicas); gc_r = rcu_dereference(c->replicas_gc); @@ -1126,32 +1124,6 @@ static inline int __bch2_check_mark_super(struct bch_fs *c, : bch2_check_mark_super_slowpath(c, search, max_dev); } -int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry 
search; - unsigned max_dev; - - if (!bkey_to_replicas(e, data_type, &search, &max_dev)) - return 0; - - return __bch2_check_mark_super(c, search, max_dev); -} - -int bch2_check_mark_super_devlist(struct bch_fs *c, - struct bch_devs_list *devs, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - - if (!devs->nr) - return 0; - - devlist_to_replicas(*devs, data_type, &search, &max_dev); - return __bch2_check_mark_super(c, search, max_dev); -} - int bch2_replicas_gc_end(struct bch_fs *c, int err) { struct bch_replicas_cpu *new_r, *old_r; @@ -1435,12 +1407,19 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t /* Query replicas: */ -static bool __bch2_sb_has_replicas(struct bch_fs *c, - struct bch_replicas_cpu_entry search, - unsigned max_dev) +bool bch2_sb_has_replicas(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) { + struct bch_replicas_cpu_entry search; + unsigned max_dev; bool ret; + if (!devs.nr) + return true; + + devlist_to_replicas(devs, data_type, &search, &max_dev); + rcu_read_lock(); ret = replicas_has_entry(rcu_dereference(c->replicas), search, max_dev); @@ -1449,31 +1428,6 @@ static bool __bch2_sb_has_replicas(struct bch_fs *c, return ret; } -bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - - if (!bkey_to_replicas(e, data_type, &search, &max_dev)) - return true; - - return __bch2_sb_has_replicas(c, search, max_dev); -} - -bool bch2_sb_has_replicas_devlist(struct bch_fs *c, struct bch_devs_list *devs, - enum bch_data_type data_type) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - - if (!devs->nr) - return true; - - devlist_to_replicas(*devs, data_type, &search, &max_dev); - return __bch2_sb_has_replicas(c, search, max_dev); -} - struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_devs_mask online_devs) { @@ -1579,12 +1533,23 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) goto out; for_each_cpu_replicas_entry(r, e) - if (replicas_test_dev(e, ca->dev_idx)) { + if (replicas_test_dev(e, ca->dev_idx)) ret |= 1 << e->data_type; - break; - } out: rcu_read_unlock(); return ret; } + +/* Quotas: */ + +static const char *bch2_sb_validate_quota(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) != sizeof(*q)) + return "invalid field quota: wrong size"; + + return NULL; +} diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index e0dd26e3..59a8b816 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -94,8 +94,7 @@ int bch2_super_realloc(struct bch_sb_handle *, unsigned); const char *bch2_sb_validate(struct bch_sb_handle *); -const char *bch2_read_super(const char *, struct bch_opts, - struct bch_sb_handle *); +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); void bch2_write_super(struct bch_fs *); /* BCH_SB_FIELD_journal: */ @@ -139,14 +138,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_replicas: */ -bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_type); -bool bch2_sb_has_replicas_devlist(struct bch_fs *, struct bch_devs_list *, - enum bch_data_type); -int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_type); -int bch2_check_mark_super_devlist(struct bch_fs *, 
struct bch_devs_list *, - enum bch_data_type); +bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); +int bch2_check_mark_super(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 69290d27..29ffba65 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -29,6 +29,7 @@ #include "move.h" #include "migrate.h" #include "movinggc.h" +#include "quota.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -214,14 +215,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); - if (!bch2_journal_error(&c->journal)) - bch2_btree_verify_flushed(c); - for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); bch2_fs_journal_stop(&c->journal); + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) + bch2_btree_verify_flushed(c); + for_each_member_device(ca, c, i) bch2_dev_allocator_remove(c, ca); } @@ -366,6 +368,7 @@ err: static void bch2_fs_free(struct bch_fs *c) { + bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_encryption_exit(c); bch2_fs_btree_cache_exit(c); @@ -380,7 +383,7 @@ static void bch2_fs_free(struct bch_fs *c) bioset_exit(&c->bio_write); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); - bioset_exit(&c->btree_read_bio); + bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); @@ -492,6 +495,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_allocator_init(c); bch2_fs_tiering_init(c); + bch2_fs_quota_init(c); INIT_LIST_HEAD(&c->list); @@ -561,8 +565,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_read_bio, 1, - offsetof(struct btree_read_bio, bio), + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), + offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS) || @@ -671,13 +676,10 @@ static const char *__bch2_fs_start(struct bch_fs *c) struct bch_dev *ca; LIST_HEAD(journal); struct jset *j; - struct closure cl; time64_t now; unsigned i; int ret = -EINVAL; - closure_init_stack(&cl); - mutex_lock(&c->state_lock); BUG_ON(c->state != BCH_FS_STARTING); @@ -705,14 +707,14 @@ static const char *__bch2_fs_start(struct bch_fs *c) unsigned level; struct bkey_i *k; - err = "missing btree root"; k = bch2_journal_find_btree_root(c, j, i, &level); - if (!k && i < BTREE_ID_ALLOC) - goto err; - if (!k) continue; + err = "invalid btree root pointer"; + if (IS_ERR(k)) + goto err; + err = "error reading btree root"; if (bch2_btree_root_read(c, i, k, level)) { if (i != BTREE_ID_ALLOC) @@ -722,6 +724,10 @@ static const char *__bch2_fs_start(struct bch_fs *c) } } + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b) + bch2_btree_root_alloc(c, i); + err = "error reading allocation information"; ret = bch2_alloc_read(c, &journal); if (ret) @@ -739,14 +745,6 @@ static const char *__bch2_fs_start(struct bch_fs *c) if (c->opts.noreplay) goto recovery_done; - err = "cannot allocate new btree root"; - for 
(i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b && - bch2_btree_root_alloc(c, i, &cl)) - goto err; - - closure_sync(&cl); - /* * bch2_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - @@ -754,12 +752,9 @@ static const char *__bch2_fs_start(struct bch_fs *c) */ bch2_journal_start(c); - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch2_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } + err = "error starting allocator"; + if (bch2_fs_allocator_start(c)) + goto err; bch_verbose(c, "starting journal replay:"); err = "journal replay failed"; @@ -777,6 +772,14 @@ static const char *__bch2_fs_start(struct bch_fs *c) if (ret) goto err; bch_verbose(c, "fsck done"); + + if (c->opts.usrquota || c->opts.grpquota) { + bch_verbose(c, "reading quotas:"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } } else { struct bch_inode_unpacked inode; struct bkey_inode_buf packed_inode; @@ -784,6 +787,7 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_BRAND_NEW_FS, &c->flags); ret = bch2_initial_gc(c, &journal); if (ret) @@ -791,15 +795,15 @@ static const char *__bch2_fs_start(struct bch_fs *c) err = "unable to allocate journal buckets"; for_each_rw_member(ca, c, i) - if (bch2_dev_journal_alloc(ca)) { + if (bch2_dev_journal_alloc(c, ca)) { percpu_ref_put(&ca->io_ref); goto err; } - err = "cannot allocate new btree root"; + clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags); + for (i = 0; i < BTREE_ID_NR; i++) - if (bch2_btree_root_alloc(c, i, &cl)) - goto err; + bch2_btree_root_alloc(c, i); /* * journal_res_get() will crash if called before this has @@ -808,15 +812,9 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch2_journal_start(c); bch2_journal_set_replay_done(&c->journal); - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch2_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - /* Wait for new btree roots to be written: */ - closure_sync(&cl); + err = "error starting allocator"; + if (bch2_fs_allocator_start(c)) + goto err; bch2_inode_init(c, &inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); @@ -830,6 +828,12 @@ static const char *__bch2_fs_start(struct bch_fs *c) NULL, NULL, NULL, 0)) goto err; + if (c->opts.usrquota || c->opts.grpquota) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + err = "error writing first journal entry"; if (bch2_journal_meta(&c->journal)) goto err; @@ -867,8 +871,6 @@ out: return err; err: fsck_err: - closure_sync(&cl); - switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); @@ -1107,6 +1109,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) struct bch_dev *ca; int ret; + lockdep_assert_held(&c->state_lock); + if (le64_to_cpu(sb->sb->seq) > le64_to_cpu(c->disk_sb->seq)) bch2_sb_to_fs(c, sb->sb); @@ -1153,7 +1157,9 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) bdevname(ca->disk_sb.bdev, c->name); bdevname(ca->disk_sb.bdev, ca->name); + mutex_lock(&c->sb_lock); bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + mutex_unlock(&c->sb_lock); if (ca->mi.state == BCH_MEMBER_STATE_RW) bch2_dev_allocator_add(c, ca); @@ -1430,17 +1436,18 @@ err: /* Add new device to running 
filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; const char *err; struct bch_dev *ca = NULL; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; - int ret = -EINVAL; + int ret; - err = bch2_read_super(path, bch2_opts_empty(), &sb); - if (err) - return -EINVAL; + ret = bch2_read_super(path, &opts, &sb); + if (ret) + return ret; err = bch2_sb_validate(&sb); if (err) @@ -1479,14 +1486,14 @@ have_slot: sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - mi = bch2_fs_sb_resize_members(c, u64s); - if (!mi) - goto err_unlock; - dev_mi = bch2_sb_resize_members(&sb, u64s); if (!dev_mi) goto err_unlock; + mi = bch2_fs_sb_resize_members(c, u64s); + if (!mi) + goto err_unlock; + memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; @@ -1499,30 +1506,30 @@ have_slot: c->disk_sb->nr_devices = nr_devices; c->sb.nr_devices = nr_devices; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + if (bch2_dev_alloc(c, dev_idx)) { err = "cannot allocate memory"; ret = -ENOMEM; - goto err_unlock; + goto err; } if (__bch2_dev_online(c, &sb)) { err = "bch2_dev_online() error"; ret = -ENOMEM; - goto err_unlock; + goto err; } - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { - err = "journal alloc failed"; - if (bch2_dev_journal_alloc(ca)) - goto err; - err = __bch2_dev_read_write(c, ca); if (err) goto err; + + err = "journal alloc failed"; + if (bch2_dev_journal_alloc(c, ca)) + goto err; } mutex_unlock(&c->state_lock); @@ -1540,16 +1547,20 @@ err: /* Hot add existing device to running filesystem: */ int bch2_dev_online(struct bch_fs *c, const char *path) { + struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; struct bch_dev *ca; unsigned dev_idx; const char *err; + int ret; mutex_lock(&c->state_lock); - err = bch2_read_super(path, bch2_opts_empty(), &sb); - if (err) - goto err; + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + mutex_unlock(&c->state_lock); + return ret; + } dev_idx = sb.sb->dev_idx; @@ -1557,13 +1568,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (err) goto err; - mutex_lock(&c->sb_lock); if (__bch2_dev_online(c, &sb)) { err = "__bch2_dev_online() error"; - mutex_unlock(&c->sb_lock); goto err; } - mutex_unlock(&c->sb_lock); ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { @@ -1585,6 +1593,12 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) { mutex_lock(&c->state_lock); + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); + mutex_unlock(&c->state_lock); + return 0; + } + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { bch_err(ca, "Cannot offline required disk"); mutex_unlock(&c->state_lock); @@ -1617,9 +1631,19 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) goto err; } + ret = bch2_journal_flush_device(&c->journal, ca->dev_idx); + if (ret) { + bch_err(ca, "Migrate failed: error %i flushing journal", ret); + goto err; + } + data = bch2_dev_has_data(c, ca); if (data) { - bch_err(ca, "Migrate error: data still present (%x)", data); + char buf[100]; + + bch2_scnprint_flag_list(buf, sizeof(buf), + bch2_data_types, data); + bch_err(ca, "Migrate failed, still has data (%s)", buf); ret = -EINVAL; goto err; } @@ -1670,33 +1694,33 @@ err: /* Filesystem open: */ 
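A caller-side illustration of the interface change in the hunk that follows: bch2_fs_open() now returns a struct bch_fs pointer, or an ERR_PTR-encoded errno, instead of filling an out-parameter and returning an error string. This is a minimal sketch under that assumption, not part of the patch; the helper name and the error message are illustrative only.

	/*
	 * Hypothetical caller of the post-patch bch2_fs_open(): errors now come
	 * back as a negative errno wrapped in ERR_PTR(), not as a string.
	 */
	static struct bch_fs *open_fs_sketch(char * const *devices, unsigned nr_devices)
	{
		struct bch_opts opts = bch2_opts_empty();
		struct bch_fs *c = bch2_fs_open(devices, nr_devices, opts);

		if (IS_ERR(c)) {
			pr_err("error opening filesystem: %ld", PTR_ERR(c));
			return NULL;
		}

		/* on success the caller owns the filesystem, presumably released later via bch2_fs_stop() */
		return c;
	}
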
-const char *bch2_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts, struct bch_fs **ret) +struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts) { - const char *err; + struct bch_sb_handle *sb = NULL; struct bch_fs *c = NULL; - struct bch_sb_handle *sb; unsigned i, best_sb = 0; + const char *err; + int ret = -ENOMEM; if (!nr_devices) - return "need at least one device"; + return ERR_PTR(-EINVAL); if (!try_module_get(THIS_MODULE)) - return "module unloading"; + return ERR_PTR(-ENODEV); - err = "cannot allocate memory"; sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); if (!sb) goto err; for (i = 0; i < nr_devices; i++) { - err = bch2_read_super(devices[i], opts, &sb[i]); - if (err) + ret = bch2_read_super(devices[i], &opts, &sb[i]); + if (ret) goto err; err = bch2_sb_validate(&sb[i]); if (err) - goto err; + goto err_print; } for (i = 1; i < nr_devices; i++) @@ -1707,56 +1731,53 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices, for (i = 0; i < nr_devices; i++) { err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); if (err) - goto err; + goto err_print; } - err = "cannot allocate memory"; + ret = -ENOMEM; c = bch2_fs_alloc(sb[best_sb].sb, opts); if (!c) goto err; err = "bch2_dev_online() error"; - mutex_lock(&c->sb_lock); + mutex_lock(&c->state_lock); for (i = 0; i < nr_devices; i++) if (__bch2_dev_online(c, &sb[i])) { - mutex_unlock(&c->sb_lock); - goto err; + mutex_unlock(&c->state_lock); + goto err_print; } - mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); err = "insufficient devices"; if (!bch2_fs_may_start(c)) - goto err; + goto err_print; if (!c->opts.nostart) { err = __bch2_fs_start(c); if (err) - goto err; + goto err_print; } err = bch2_fs_online(c); if (err) - goto err; + goto err_print; - if (ret) - *ret = c; - else - closure_put(&c->cl); - - err = NULL; -out: kfree(sb); module_put(THIS_MODULE); - if (err) - c = NULL; - return err; + return c; +err_print: + pr_err("bch_fs_open err opening %s: %s", + devices[0], err); + ret = -EINVAL; err: if (c) bch2_fs_stop(c); for (i = 0; i < nr_devices; i++) bch2_free_super(&sb[i]); - goto out; + kfree(sb); + module_put(THIS_MODULE); + return ERR_PTR(ret); } static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, @@ -1827,9 +1848,8 @@ const char *bch2_fs_open_incremental(const char *path) struct bch_opts opts = bch2_opts_empty(); const char *err; - err = bch2_read_super(path, opts, &sb); - if (err) - return err; + if (bch2_read_super(path, &opts, &sb)) + return "error reading superblock"; err = __bch2_fs_open_incremental(&sb, opts); bch2_free_super(&sb); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 6f628830..a35ee3db 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -198,8 +198,7 @@ const char *bch2_fs_read_write(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); const char *bch2_fs_start(struct bch_fs *); -const char *bch2_fs_open(char * const *, unsigned, struct bch_opts, - struct bch_fs **); +struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); const char *bch2_fs_open_incremental(const char *path); #endif /* _BCACHEFS_SUPER_H */ diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index f5007864..6a581097 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -39,7 +39,8 @@ static int bch2_tiering_thread(void *arg) struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev *ca; - u64 tier_capacity, 
available_sectors, keys_moved, sectors_moved; + struct bch_move_stats move_stats; + u64 tier_capacity, available_sectors; unsigned long last; unsigned i, nr_devices; @@ -91,8 +92,7 @@ static int bch2_tiering_thread(void *arg) 0, -1, tiering_pred, tier, - &keys_moved, - §ors_moved); + &move_stats); } return 0; diff --git a/linux/kthread.c b/linux/kthread.c index 0f4b5715..80a9ac9a 100644 --- a/linux/kthread.c +++ b/linux/kthread.c @@ -64,6 +64,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), vsnprintf(p->comm, sizeof(p->comm), namefmt, args); va_end(args); + p->flags |= PF_KTHREAD; p->thread_fn = thread_fn; p->thread_data = thread_data; p->state = TASK_UNINTERRUPTIBLE; @@ -73,6 +74,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), init_completion(&p->exited); pthread_create(&p->thread, NULL, kthread_start_fn, p); + pthread_setname_np(p->thread, p->comm); return p; }
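
One thread running through the migrate.c, move.c, movinggc.c and tier.c hunks above is that bch2_move_data() no longer reports progress through u64 out-parameters: callers pass a struct bch_move_stats and read its atomic64_t counters after the call returns (bch2_move_data() zeroes the stats itself on entry). A minimal reporting sketch under that assumption; the helper name and message format are illustrative and not taken from the patch.

	/*
	 * Illustrative read-out of the new struct bch_move_stats counters after a
	 * bch2_move_data() / copygc / tiering pass has completed.
	 */
	static void report_move_stats(struct bch_move_stats *stats)
	{
		pr_info("moved %llu keys / %llu sectors (%llu sectors seen, %llu raced)",
			(u64) atomic64_read(&stats->keys_moved),
			(u64) atomic64_read(&stats->sectors_moved),
			(u64) atomic64_read(&stats->sectors_seen),
			(u64) atomic64_read(&stats->sectors_raced));
	}

bch2_dev_usrdata_migrate() above uses the same counters to decide whether another pass is needed: it keeps calling bch2_move_data() while keys_moved is nonzero, up to MAX_DATA_OFF_ITER iterations.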