diff --git a/.bcachefs_revision b/.bcachefs_revision index 93724d8b..44599a02 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -8e1519ccb62b76736d5b9ca97e58b41ed9a11274 +ca97ee357774427208e4c251bfaa5957ae7f8c2c diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 01b3d4ad..f78621d8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -70,6 +70,7 @@ static inline void submit_bio(struct bio *bio) } int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t); +int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsigned); #define bdev_get_queue(bdev) (&((bdev)->queue)) diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index d1e2f979..ae184220 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -516,7 +516,6 @@ DEFINE_EVENT(bch_fs, gc_gens_end, DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket, u64 free, u64 avail, @@ -525,14 +524,13 @@ DECLARE_EVENT_CLASS(bucket_alloc, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, s, nonblocking, err), TP_STRUCT__entry( - __field(dev_t, dev ) + __field(u8, dev ) __array(char, reserve, 16 ) - __field(bool, user ) __field(u64, bucket ) __field(u64, free ) __field(u64, avail ) @@ -548,9 +546,8 @@ DECLARE_EVENT_CLASS(bucket_alloc, ), TP_fast_assign( - __entry->dev = ca->dev; + __entry->dev = ca->dev_idx; strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->user = user; __entry->bucket = bucket; __entry->free = free; __entry->avail = avail; @@ -565,10 +562,9 @@ DECLARE_EVENT_CLASS(bucket_alloc, strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", - MAJOR(__entry->dev), MINOR(__entry->dev), + TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", __entry->reserve, - __entry->user, + __entry->dev, __entry->bucket, __entry->free, __entry->avail, @@ -585,7 +581,6 @@ DECLARE_EVENT_CLASS(bucket_alloc, DEFINE_EVENT(bucket_alloc, bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket, u64 free, u64 avail, @@ -594,14 +589,13 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, s, nonblocking, err) ); DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket, u64 free, u64 avail, @@ -610,7 +604,7 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, copygc_wait_amount, copygc_waiting_for, s, nonblocking, err) ); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index b39a4533..5f4bb82c 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -2175,21 +2175,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); - while (1) { - struct open_bucket *ob; + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; - spin_lock(&c->freelist_lock); - if (!ca->open_buckets_partial_nr) { + if (ob->dev == ca->dev_idx) { + swap(c->open_buckets_partial[i], + c->open_buckets_partial[--c->open_buckets_partial_nr]); + ob->on_partial_list = false; spin_unlock(&c->freelist_lock); - break; + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; } - ob = c->open_buckets + - ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - - bch2_open_bucket_put(c, ob); } + spin_unlock(&c->freelist_lock); bch2_ec_stop_dev(c, ca); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index affddf1f..023b62c5 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -154,26 +154,17 @@ static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - bool may_realloc = wp->data_type == BCH_DATA_user; + BUG_ON(c->open_buckets_partial_nr >= + ARRAY_SIZE(c->open_buckets_partial)); - BUG_ON(ca->open_buckets_partial_nr > - ARRAY_SIZE(ca->open_buckets_partial)); + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + c->open_buckets_partial[c->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); - if (ca->open_buckets_partial_nr < - ARRAY_SIZE(ca->open_buckets_partial) && - may_realloc) { - spin_lock(&c->freelist_lock); - ob->on_partial_list = true; - ca->open_buckets_partial[ca->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); - } else { - bch2_open_bucket_put(c, ob); - } + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); } /* _only_ for allocating the journal on a new device: */ @@ -259,7 +250,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ob->valid = true; ob->sectors_free = ca->mi.bucket_size; - ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; ob->gen = a->gen; ob->bucket = bucket; @@ -386,32 +376,6 @@ err: return ob; } -static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve) -{ - struct open_bucket *ob; - int i; - - spin_lock(&c->freelist_lock); - - for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; - - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } - - spin_unlock(&c->freelist_lock); - return NULL; -} - /* * This path is for before the freespace btree is initialized: * @@ -535,7 +499,6 @@ again: static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, - bool may_alloc_partial, struct closure *cl, struct bch_dev_usage *usage) { @@ -574,12 +537,6 @@ again: if (waiting) closure_wake_up(&c->freelist_wait); - - if (may_alloc_partial) { - ob = try_alloc_partial_bucket(c, ca, reserve); - if (ob) - return ob; - } alloc: ob = likely(freespace) ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) @@ -599,7 +556,6 @@ err: if (!IS_ERR(ob)) trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], - may_alloc_partial, ob->bucket, usage->d[BCH_DATA_free].buckets, avail, @@ -611,7 +567,6 @@ err: else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) trace_and_count(c, bucket_alloc_fail, ca, bch2_alloc_reserves[reserve], - may_alloc_partial, 0, usage->d[BCH_DATA_free].buckets, avail, @@ -626,7 +581,6 @@ err: struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum alloc_reserve reserve, - bool may_alloc_partial, struct closure *cl) { struct bch_dev_usage usage; @@ -634,7 +588,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, - may_alloc_partial, cl, &usage))); + cl, &usage))); return ob; } @@ -691,12 +645,10 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, bch2_dev_stripe_increment_inlined(ca, stripe, &usage); } -#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) -#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) - -static void add_new_bucket(struct bch_fs *c, +static int add_new_bucket(struct bch_fs *c, struct open_buckets *ptrs, struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, unsigned flags, @@ -705,12 +657,19 @@ static void add_new_bucket(struct bch_fs *c, unsigned durability = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + BUG_ON(*nr_effective >= nr_replicas); + __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) - ? durability : 1; + *nr_effective += durability; *have_cache |= !durability; ob_push(c, ptrs, ob); + + if (*nr_effective >= nr_replicas) + return 1; + if (ob->ec) + return 1; + return 0; } int bch2_bucket_alloc_set_trans(struct btree_trans *trans, @@ -720,8 +679,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, + enum bch_data_type data_type, enum alloc_reserve reserve, - unsigned flags, struct closure *cl) { struct bch_fs *c = trans->c; @@ -754,8 +713,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, reserve, - flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, reserve, cl, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); percpu_ref_put(&ca->ref); @@ -767,10 +725,11 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, flags, ob); + ob->data_type = data_type; - if (*nr_effective >= nr_replicas) { + if (add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, 0, ob)) { ret = 0; break; } @@ -792,7 +751,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, u16 target, - unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, @@ -805,9 +763,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct open_bucket *ob; struct bch_dev *ca; unsigned i, ec_idx; - - if (!erasure_code) - return 0; + int ret = 0; if (nr_replicas < 2) return 0; @@ -842,46 +798,187 @@ got_bucket: ob->ec_idx = ec_idx; ob->ec = h->s; - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, flags, ob); + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); atomic_inc(&h->s->pin); out_put_head: bch2_ec_stripe_head_put(c, h); - return 0; + return ret; } /* Sector allocator */ -static void get_buckets_from_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - unsigned flags, - bool need_ec) +static bool want_bucket(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + bool *have_cache, bool ec, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (!test_bit(ob->dev, devs_may_alloc->d)) + return false; + + if (ob->data_type != wp->data_type) + return false; + + if (!ca->mi.durability && + (wp->data_type != BCH_DATA_user || !*have_cache)) + return false; + + if (ec != (ob->ec != NULL)) + return false; + + return true; +} + +static int bucket_alloc_set_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool ec, unsigned flags) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; + int ret = 0; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - - if (*nr_effective < nr_replicas && - test_bit(ob->dev, devs_may_alloc->d) && - (ca->mi.durability || - (wp->data_type == BCH_DATA_user && !*have_cache)) && - (ob->ec || !need_ec)) { - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, - flags, ob); - } else { + if (!ret && want_bucket(c, wp, devs_may_alloc, + have_cache, ec, ob)) + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + else ob_push(c, &ptrs_skip, ob); - } } wp->ptrs = ptrs_skip; + + return ret; +} + +static int bucket_alloc_set_partial(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, bool ec, + enum alloc_reserve reserve, + unsigned flags) +{ + int i, ret = 0; + + if (!c->open_buckets_partial_nr) + return 0; + + spin_lock(&c->freelist_lock); + + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + + if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev_usage usage; + u64 avail; + + bch2_dev_usage_read_fast(ca, &usage); + avail = dev_buckets_free(ca, usage, reserve); + if (!avail) + continue; + + array_remove_item(c->open_buckets_partial, + c->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + if (ret) + break; + } + } + + spin_unlock(&c->freelist_lock); + return ret; +} + +static int __open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + bool erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *_cl) +{ + struct bch_fs *c = trans->c; + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; + unsigned i; + int ret; + + rcu_read_lock(); + devs = target_rw_devs(c, wp->data_type, target); + rcu_read_unlock(); + + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) + __clear_bit(ob->dev, devs.d); + + if (erasure_code && ec_open_bucket(c, ptrs)) + return 0; + + ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, flags); + if (ret) + return ret; + + ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, reserve, flags); + if (ret) + return ret; + + if (erasure_code) { + ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, + target, + nr_replicas, nr_effective, + have_cache, flags, _cl); + } else { +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + wp->data_type, reserve, cl); + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && + !cl && _cl) { + cl = _cl; + goto retry_blocking; + } + + } + + return ret; } static int open_bucket_add_buckets(struct btree_trans *trans, @@ -895,72 +992,29 @@ static int open_bucket_add_buckets(struct btree_trans *trans, bool *have_cache, enum alloc_reserve reserve, unsigned flags, - struct closure *_cl) + struct closure *cl) { - struct bch_fs *c = trans->c; - struct bch_devs_mask devs; - struct open_bucket *ob; - struct closure *cl = NULL; int ret; - unsigned i; - - rcu_read_lock(); - devs = target_rw_devs(c, wp->data_type, target); - rcu_read_unlock(); - - /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); - - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); if (erasure_code) { - if (!ec_open_bucket(c, ptrs)) { - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, true); - if (*nr_effective >= nr_replicas) - return 0; - } - - if (!ec_open_bucket(c, ptrs)) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, erasure_code, - nr_replicas, nr_effective, - have_cache, flags, _cl); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; - if (*nr_effective >= nr_replicas) - return 0; - } - } - - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, false); - if (*nr_effective >= nr_replicas) - return 0; - -retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && - !cl && _cl) { - cl = _cl; - goto retry_blocking; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_operation_blocked) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; } - return ret; + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, false, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); + return ret < 0 ? ret : 0; } void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, @@ -1159,14 +1213,10 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct open_bucket *ob; struct open_buckets ptrs; unsigned nr_effective, write_points_nr; - unsigned ob_flags = 0; bool have_cache; int ret; int i; - if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ob_flags |= BUCKET_ALLOC_USE_DURABILITY; - BUG_ON(!nr_replicas || !nr_replicas_required); retry: ptrs.nr = 0; @@ -1176,9 +1226,6 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); - if (wp->data_type == BCH_DATA_user) - ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; - /* metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; @@ -1188,13 +1235,13 @@ retry: target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, - ob_flags, cl); + flags, cl); } else { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, - ob_flags, NULL); + flags, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; @@ -1203,7 +1250,7 @@ retry: 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, - ob_flags, cl); + flags, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); @@ -1350,6 +1397,24 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) } } +void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned i; + + spin_lock(&c->freelist_lock); + for (i = 0; i < c->open_buckets_partial_nr; i++) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + + prt_printf(out, "%zu ref %u type %s ec %u %u:%llu:%u\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->data_type], + ob->ec != NULL, + ob->dev, ob->bucket, ob->gen); + } + spin_unlock(&c->freelist_lock); +} + static const char * const bch2_write_point_states[] = { #define x(n) #n, WRITE_POINT_STATES() diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index ba7a87af..e9b3b142 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -31,8 +31,7 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum alloc_reserve, bool, - struct closure *); + enum alloc_reserve, struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -152,8 +151,9 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum alloc_reserve, - unsigned, struct closure *); + unsigned, unsigned *, bool *, + enum bch_data_type, enum alloc_reserve, + struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, @@ -221,6 +221,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 2e6f4806..0739bf92 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -51,10 +51,9 @@ struct open_bucket { * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - enum bch_data_type data_type:8; + enum bch_data_type data_type:6; unsigned valid:1; unsigned on_partial_list:1; - unsigned alloc_reserve:3; u8 dev; u8 gen; diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index e001f419..a40c2612 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -932,11 +932,14 @@ static int check_one_backpointer(struct btree_trans *trans, struct bpos bucket, u64 *bp_offset, struct bbpos start, - struct bbpos end) + struct bbpos end, + struct bpos *last_flushed_pos) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bch_backpointer bp; struct bbpos pos; + struct bpos bp_pos; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; @@ -957,17 +960,31 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(!k.k, trans->c, + bp_pos = bucket_pos_to_bp(c, bucket, + max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); + + if (!k.k && !bpos_eq(*last_flushed_pos, bp_pos)) { + *last_flushed_pos = bp_pos; + pr_info("flushing at %llu:%llu", + last_flushed_pos->inode, + last_flushed_pos->offset); + + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (fsck_err_on(!k.k, c, "%s backpointer points to missing extent\n%s", *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree", (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); if (ret == -ENOENT) - bch_err(trans->c, "backpointer at %llu not found", *bp_offset); + bch_err(c, "backpointer at %llu not found", *bp_offset); } - - bch2_trans_iter_exit(trans, &iter); +out: fsck_err: + bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } @@ -978,6 +995,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, { struct btree_iter iter; struct bkey_s_c k; + struct bpos last_flushed_pos = SPOS_MAX; int ret = 0; for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, @@ -987,7 +1005,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, while (!(ret = commit_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, - check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && + check_one_backpointer(trans, iter.pos, &bp_offset, + start, end, &last_flushed_pos))) && bp_offset < U64_MAX) bp_offset++; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 7f9c1087..3f88e7ea 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -516,9 +516,6 @@ struct bch_dev { unsigned nr_open_buckets; unsigned nr_btree_reserve; - open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_partial_nr; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; @@ -859,6 +856,9 @@ struct bch_fs { struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + struct write_point btree_write_point; struct write_point rebalance_write_point; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index d5a9cfb0..c8b0cf5e 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2568,6 +2568,18 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(iter->trans) || + (k = bch2_btree_iter_peek_type(iter, iter->flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(iter->trans); + + return k; +} + /* new transactional stuff: */ #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 1225c4dd..448be089 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -596,6 +596,8 @@ static inline int btree_trans_too_many_iters(struct btree_trans *trans) return 0; } +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + static inline struct bkey_s_c __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 6285532e..026c249a 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -64,6 +64,15 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; + + if (path->ref > 1) { + /* + * We can't clone a path that has write locks: if the path is + * shared, unlock before set_pos(), traverse(): + */ + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + } return 0; trans_commit: return bch2_trans_update(trans, iter, &wb->k, 0) ?: diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index de0575f6..e1467e11 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -98,8 +98,10 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - ptr->cached = true; + if (ptr->dev == dev) { + bch2_extent_ptr_set_cached(k, ptr); + return; + } } static int __bch2_data_update_index_update(struct btree_trans *trans, @@ -295,15 +297,7 @@ out: int bch2_data_update_index_update(struct bch_write_op *op) { - struct bch_fs *c = op->c; - struct btree_trans trans; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - ret = __bch2_data_update_index_update(&trans, op); - bch2_trans_exit(&trans); - - return ret; + return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); } void bch2_data_update_read_done(struct data_update *m, @@ -326,8 +320,9 @@ void bch2_data_update_exit(struct data_update *update) const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) { - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); + if (c->opts.nocow_enabled) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); } @@ -487,23 +482,26 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (ctxt) { - move_ctxt_wait_event(ctxt, trans, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) || - !atomic_read(&ctxt->read_sectors)); + if (c->opts.nocow_enabled) { + if (ctxt) { + move_ctxt_wait_event(ctxt, trans, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } } + ptrs_locked |= (1U << i); } - ptrs_locked |= (1U << i); + i++; } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 84d2a0c4..7bd68880 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -138,20 +138,28 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", s->algorithm, le16_to_cpu(s->sectors), - s->nr_blocks - s->nr_redundant, + nr_data, s->nr_redundant, s->csum_type, 1U << s->csum_granularity_bits); - for (i = 0; i < s->nr_blocks; i++) - prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, - (u64) s->ptrs[i].offset, - stripe_blockcount_get(s, i)); + for (i = 0; i < s->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = s->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); + if (i < nr_data) + prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + if (ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } } /* returns blocknr in stripe that we matched: */ @@ -442,15 +450,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, percpu_ref_put(&ca->io_ref); } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, + struct ec_stripe_buf *stripe) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -462,11 +469,15 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip } bkey_reassemble(&stripe->key.k_i, k); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); +} + /* recovery read path: */ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) { @@ -865,25 +876,6 @@ err: return ret; } -static void extent_stripe_ptr_add(struct bkey_s_extent e, - struct ec_stripe_buf *s, - struct bch_extent_ptr *ptr, - unsigned block) -{ - struct bch_extent_stripe_ptr *dst = (void *) ptr; - union bch_extent_entry *end = extent_entry_last(e); - - memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); - e.k->u64s += sizeof(*dst) / sizeof(u64); - - *dst = (struct bch_extent_stripe_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, - .block = block, - .redundancy = s->key.v.nr_redundant, - .idx = s->key.k.p.offset, - }; -} - static int ec_stripe_update_extent(struct btree_trans *trans, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, @@ -895,6 +887,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; @@ -933,16 +926,27 @@ static int ec_stripe_update_extent(struct btree_trans *trans, dev = s->key.v.ptrs[block].dev; - n = bch2_bkey_make_mut(trans, k); + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); ret = PTR_ERR_OR_ZERO(n); if (ret) goto out; + bkey_reassemble(n, k); + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); BUG_ON(!ec_ptr); - extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .redundancy = s->key.v.nr_redundant, + .idx = s->key.k.p.offset, + }; + + __extent_entry_insert(n, + (union bch_extent_entry *) ec_ptr, + (union bch_extent_entry *) &stripe_ptr); ret = bch2_trans_update(trans, &iter, n, 0); out: @@ -999,6 +1003,35 @@ err: return ret; } +static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + struct ec_stripe_new *s, + unsigned block, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; + int ret; + + if (!bch2_dev_get_ioref(ca, WRITE)) { + s->err = -EROFS; + return; + } + + memset(s->new_stripe.data[block] + (offset << 9), + 0, + ob->sectors_free << 9); + + ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + ob->bucket * ca->mi.bucket_size + offset, + ob->sectors_free, + GFP_KERNEL, 0); + + percpu_ref_put(&ca->io_ref); + + if (ret) + s->err = ret; +} + /* * data buckets of new stripe all written: create the stripe */ @@ -1014,6 +1047,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_sync(&s->iodone); + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); @@ -1155,9 +1196,6 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; - if (ob->sectors_free) - s->err = -1; - ec_stripe_new_put(c, s); } @@ -1398,10 +1436,10 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ h->s->nr_parity, &nr_have_parity, &have_cache, + BCH_DATA_parity, h->copygc ? RESERVE_movinggc : RESERVE_none, - 0, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1427,10 +1465,10 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ h->s->nr_data, &nr_have_data, &have_cache, + BCH_DATA_user, h->copygc ? RESERVE_movinggc : RESERVE_none, - 0, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1486,8 +1524,9 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) { + struct bch_fs *c = trans->c; unsigned i; s64 idx; int ret; @@ -1497,7 +1536,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, struct ec_stripe_head * return -BCH_ERR_ENOSPC_stripe_reuse; h->s->have_existing_stripe = true; - ret = get_stripe_key(c, idx, &h->s->existing_stripe); + ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); if (ret) { bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); return ret; @@ -1626,7 +1665,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto err; if (ret && needs_stripe_new) - ret = __bch2_ec_stripe_head_reuse(c, h); + ret = __bch2_ec_stripe_head_reuse(trans, h); if (ret) { bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); goto err; @@ -1771,6 +1810,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; + unsigned i; while (1) { mutex_lock(&c->ec_stripe_head_lock); @@ -1782,7 +1822,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) if (!h) break; - BUG_ON(h->s); + if (h->s) { + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); + } kfree(h); } @@ -1801,6 +1846,8 @@ void bch2_fs_ec_init_early(struct bch_fs *c) int bch2_fs_ec_init(struct bch_fs *c) { + spin_lock_init(&c->ec_stripes_new_lock); + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index d01cec89..4fc581be 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -706,18 +706,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry k->k.u64s -= extent_entry_u64s(entry); } -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy_u64s_small(dst, new, extent_entry_u64s(new)); -} - void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { @@ -951,6 +939,29 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, return false; } +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; + + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } + + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } + + BUG(); +} + /* * bch_extent_normalize - clean up an extent, dropping stale pointers etc. * @@ -1094,7 +1105,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; - bool unwritten = false; + bool unwritten = false, have_ec = false, crc_since_last_ptr = false; int ret; if (bkey_is_btree_ptr(k.k)) @@ -1130,7 +1141,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } + if (entry->ptr.cached && have_ec) { + prt_printf(err, "cached, erasure coded ptr"); + return -BCH_ERR_invalid_bkey; + } + unwritten = entry->ptr.unwritten; + have_ec = false; + crc_since_last_ptr = false; nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: @@ -1164,17 +1182,43 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + crc_since_last_ptr = true; break; case BCH_EXTENT_ENTRY_stripe_ptr: + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + have_ec = true; break; } } + if (!nr_ptrs) { + prt_str(err, "no ptrs"); + return -BCH_ERR_invalid_bkey; + } + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { prt_str(err, "too many ptrs"); return -BCH_ERR_invalid_bkey; } + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + return 0; } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 1d8f3b30..2e37543a 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -76,6 +76,18 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) return extent_entry_bytes(entry) / sizeof(u64); } +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} + static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; @@ -655,6 +667,8 @@ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); +void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); + bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 64925db2..15ce0657 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1650,7 +1650,7 @@ static void __bch2_write(struct bch_write_op *op) nofs_flags = memalloc_nofs_save(); - if (unlikely(op->opts.nocow)) { + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); if (op->flags & BCH_WRITE_DONE) goto out_nofs_restore; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 957eeece..e0c4f51a 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -789,8 +789,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, - false, cl); + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c index 29e51bde..cf5998e5 100644 --- a/libbcachefs/keylist.c +++ b/libbcachefs/keylist.c @@ -31,22 +31,6 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, return 0; } -void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -{ - struct bkey_i *where; - - for_each_keylist_key(l, where) - if (bpos_lt(insert->k.p, where->k.p)) - break; - - memmove_u64s_up((u64 *) where + insert->k.u64s, - where, - ((u64 *) l->top) - ((u64 *) where)); - - l->top_p += insert->k.u64s; - bkey_copy(where, insert); -} - void bch2_keylist_pop_front(struct keylist *l) { l->top_p -= bch2_keylist_front(l)->k.u64s; diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index 635efb7e..fe759c70 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -5,7 +5,6 @@ #include "keylist_types.h" int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); void bch2_keylist_pop_front(struct keylist *); static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index 9eec12a9..e913b90f 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "btree_iter.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -101,7 +102,8 @@ static const char * const bch2_lru_types[] = { static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, - struct bkey_s_c lru_k) + struct bkey_s_c lru_k, + struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -137,19 +139,25 @@ static int bch2_check_lru_key(struct btree_trans *trans, break; } - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set || - lru_pos_time(lru_k.k->p) != idx, c, - "incorrect lru entry: lru %s time %llu\n" - " %s\n" - " for %s", - bch2_lru_types[type], - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { - ret = bch2_btree_delete_at(trans, lru_iter, 0); - if (ret) - goto err; + if (lru_k.k->type != KEY_TYPE_set || + lru_pos_time(lru_k.k->p) != idx) { + if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { + *last_flushed_pos = lru_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + " %s\n" + " for %s", + bch2_lru_types[type], + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) + ret = bch2_btree_delete_at(trans, lru_iter, 0); } +out: err: fsck_err: bch2_trans_iter_exit(trans, &iter); @@ -163,6 +171,7 @@ int bch2_check_lrus(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bpos last_flushed_pos = POS_MIN; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -170,7 +179,7 @@ int bch2_check_lrus(struct bch_fs *c) ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k)); + bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)); bch2_trans_exit(&trans); return ret; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 7dac9264..4ef7595f 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -227,7 +227,8 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, if (bkey_deleted(&n->k)) n->k.size = 0; - return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 304718a0..76c2691a 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -404,6 +404,12 @@ enum opt_type { NULL, "Nocow mode: Writes will be done in place when possible.\n"\ "Snapshots and reflink will still caused writes to be COW\n"\ "Implicitly disables data checksumming, compression and encryption")\ + x(nocow_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable nocow mode: enables runtime locking in\n"\ + "data move path needed if nocow will ever be in use\n")\ x(no_data_io, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index b981c87e..a7582dd4 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -194,6 +194,7 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); read_attribute(open_buckets); +read_attribute(open_buckets_partial); read_attribute(write_points); read_attribute(nocow_lock_table); @@ -455,6 +456,9 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c); + if (attr == &sysfs_open_buckets_partial) + bch2_open_buckets_partial_to_text(out, c); + if (attr == &sysfs_write_points) bch2_write_points_to_text(out, c); @@ -663,6 +667,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_open_buckets_partial, &sysfs_write_points, #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, diff --git a/linux/blkdev.c b/linux/blkdev.c index 0a5cedfe..805d55db 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -118,6 +118,14 @@ int blkdev_issue_discard(struct block_device *bdev, return 0; } +int blkdev_issue_zeroout(struct block_device *bdev, + sector_t sector, sector_t nr_sects, + gfp_t gfp_mask, unsigned flags) +{ + /* Not yet implemented: */ + BUG(); +} + unsigned bdev_logical_block_size(struct block_device *bdev) { struct stat statbuf;