#include "bcachefs.h" #include "alloc.h" #include "btree_iter.h" #include "buckets.h" #include "clock.h" #include "extents.h" #include "io.h" #include "keylist.h" #include "move.h" #include "super-io.h" #include "tier.h" #include #include #include struct tiering_state { struct bch_tier *tier; unsigned sectors; unsigned stripe_size; unsigned dev_idx; struct bch_dev *ca; }; static bool tiering_pred(struct bch_fs *c, struct tiering_state *s, struct bkey_s_c k) { if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; unsigned replicas = 0; /* Make sure we have room to add a new pointer: */ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > BKEY_EXTENT_VAL_U64s_MAX) return false; extent_for_each_ptr(e, ptr) if (c->devs[ptr->dev]->mi.tier >= s->tier->idx) replicas++; return replicas < c->opts.data_replicas; } return false; } static void tier_put_device(struct tiering_state *s) { if (s->ca) percpu_ref_put(&s->ca->io_ref); s->ca = NULL; } /** * refill_next - move on to refilling the next cache's tiering keylist */ static void tier_next_device(struct bch_fs *c, struct tiering_state *s) { if (!s->ca || s->sectors > s->stripe_size) { tier_put_device(s); s->sectors = 0; s->dev_idx++; spin_lock(&s->tier->devs.lock); if (s->dev_idx >= s->tier->devs.nr) s->dev_idx = 0; if (s->tier->devs.nr) { s->ca = s->tier->devs.d[s->dev_idx].dev; percpu_ref_get(&s->ca->io_ref); } spin_unlock(&s->tier->devs.lock); } } static int issue_tiering_move(struct bch_fs *c, struct tiering_state *s, struct moving_context *ctxt, struct bkey_s_c k) { int ret; ret = bch2_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL); if (!ret) { trace_tiering_copy(k.k); s->sectors += k.k->size; } else { trace_tiering_alloc_fail(c, k.k->size); } return ret; } /** * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) { struct moving_context ctxt; struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; unsigned nr_devices = READ_ONCE(tier->devs.nr); int ret; if (!nr_devices) return 0; trace_tiering_start(c); memset(&s, 0, sizeof(s)); s.tier = tier; s.stripe_size = 2048; /* 1 mb for now */ bch2_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); while (!kthread_should_stop() && !bch2_move_ctxt_wait(&ctxt) && (k = bch2_btree_iter_peek(&iter)).k && !btree_iter_err(k)) { if (!tiering_pred(c, &s, k)) goto next; tier_next_device(c, &s); if (!s.ca) break; ret = issue_tiering_move(c, &s, &ctxt, k); if (ret) { bch2_btree_iter_unlock(&iter); /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(&ctxt); continue; } next: bch2_btree_iter_advance_pos(&iter); //bch2_btree_iter_cond_resched(&iter); /* unlock before calling moving_context_wait() */ bch2_btree_iter_unlock(&iter); cond_resched(); } bch2_btree_iter_unlock(&iter); tier_put_device(&s); bch2_move_ctxt_exit(&ctxt); trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); return ctxt.sectors_moved; } static int bch2_tiering_thread(void *arg) { struct bch_tier *tier = arg; struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev *ca; u64 tier_capacity, available_sectors; unsigned long last; unsigned i; set_freezable(); while (!kthread_should_stop()) { if 
		if (kthread_wait_freezable(c->tiering_enabled &&
					   tier->devs.nr))
			break;

		while (1) {
			struct bch_tier *faster_tier;

			last = atomic_long_read(&clock->now);

			tier_capacity = available_sectors = 0;
			for (faster_tier = c->tiers;
			     faster_tier != tier;
			     faster_tier++) {
				spin_lock(&faster_tier->devs.lock);

				group_for_each_dev(ca, &faster_tier->devs, i) {
					tier_capacity +=
						(ca->mi.nbuckets -
						 ca->mi.first_bucket) << ca->bucket_bits;
					available_sectors +=
						dev_buckets_available(ca) << ca->bucket_bits;
				}

				spin_unlock(&faster_tier->devs.lock);
			}

			if (available_sectors < (tier_capacity >> 1))
				break;

			bch2_kthread_io_clock_wait(clock,
						   last +
						   available_sectors -
						   (tier_capacity >> 1));
			if (kthread_should_stop())
				return 0;
		}

		read_tiering(c, tier);
	}

	return 0;
}

static void __bch2_tiering_stop(struct bch_tier *tier)
{
	tier->pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&tier->pd.rate);

	if (tier->migrate)
		kthread_stop(tier->migrate);

	tier->migrate = NULL;
}

void bch2_tiering_stop(struct bch_fs *c)
{
	struct bch_tier *tier;

	for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
		__bch2_tiering_stop(tier);
}

static int __bch2_tiering_start(struct bch_tier *tier)
{
	if (!tier->migrate) {
		struct task_struct *p =
			kthread_create(bch2_tiering_thread, tier,
				       "bch_tier[%u]", tier->idx);

		if (IS_ERR(p))
			return PTR_ERR(p);

		tier->migrate = p;
	}

	wake_up_process(tier->migrate);
	return 0;
}

int bch2_tiering_start(struct bch_fs *c)
{
	struct bch_tier *tier;
	bool have_faster_tier = false;

	if (c->opts.nochanges)
		return 0;

	for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
		if (!tier->devs.nr)
			continue;

		if (have_faster_tier) {
			int ret = __bch2_tiering_start(tier);
			if (ret)
				return ret;
		} else {
			__bch2_tiering_stop(tier);
		}

		have_faster_tier = true;
	}

	return 0;
}

void bch2_fs_tiering_init(struct bch_fs *c)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
		c->tiers[i].idx = i;
		bch2_pd_controller_init(&c->tiers[i].pd);
	}
}