Update bcachefs sources to ec2ddb95112b bcachefs: bch2_opts_to_text()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-09-07 21:01:09 -04:00
parent f9ec00d5ca
commit cd35891eb9
59 changed files with 1295 additions and 822 deletions

View File

@ -1 +1 @@
22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d
ec2ddb95112b8967753591b16e2e439eee76c5b1

View File

@ -65,6 +65,8 @@
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */

7
include/linux/swap.h Normal file
View File

@ -0,0 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H
static inline void mm_account_reclaimed_pages(unsigned long pages) {}
#endif /* _LINUX_SWAP_H */

View File

@ -44,6 +44,20 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
return t;
}
static inline void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
while (nsec >= NSEC_PER_SEC) {
nsec -= NSEC_PER_SEC;
++sec;
}
while (nsec < 0) {
nsec += NSEC_PER_SEC;
--sec;
}
ts->tv_sec = sec;
ts->tv_nsec = nsec;
}
#define ns_to_timespec64 ns_to_timespec
#define timespec64_to_ns timespec_to_ns
#define timespec64_trunc timespec_trunc

View File

@ -37,6 +37,8 @@ typedef unsigned gfp_t;
#define __GFP_NOWARN 0
#define __GFP_NORETRY 0
#define __GFP_NOFAIL 0
#define __GFP_ACCOUNT 0
#define __GFP_RECLAIMABLE 0
#define __GFP_ZERO 1
#define GFP_KERNEL 2

View File

@ -137,7 +137,7 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
return NULL;
acl = allocate_dropping_locks(trans, ret,
posix_acl_alloc(count, _gfp));
posix_acl_alloc(count, GFP_KERNEL));
if (!acl)
return ERR_PTR(-ENOMEM);
if (ret) {
@ -427,7 +427,8 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
if (ret)
goto err;
ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
ret = allocate_dropping_locks_errcode(trans,
__posix_acl_chmod(&acl, GFP_KERNEL, mode));
if (ret)
goto err;

View File

@ -1969,8 +1969,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
percpu_ref_put(&ca->io_ref);
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
@ -1980,18 +1980,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (discard_in_flight_add(ca, bucket, false))
return;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
return;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
goto put_ioref;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
goto put_ref;
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
put_ioref:
percpu_ref_put(&ca->io_ref);
put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@ -2133,26 +2133,26 @@ static void bch2_do_invalidates_work(struct work_struct *work)
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
percpu_ref_put(&ca->io_ref);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_dev_do_invalidates(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
return;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
goto put_ioref;
if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
goto put_ref;
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
put_ioref:
percpu_ref_put(&ca->io_ref);
put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
@ -2298,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c)
return 0;
}
/* device removal */
int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
struct bpos start = POS(ca->dev_idx, 0);
struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
/*
* We clear the LRU and need_discard btrees first so that we don't race
* with bch2_do_invalidates() and bch2_do_discards()
*/
ret = bch2_dev_remove_stripes(c, ca) ?:
bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_dev_usage_remove(c, ca->dev_idx);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
/* Bucket IO clocks: */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@ -2433,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
c->rw_devs_change_count++;
/*
* Capacity is calculated based off of devices in allocation groups:
*/
@ -2468,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
lockdep_assert_held(&c->state_lock);
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
c->rw_devs_change_count++;
}
void bch2_dev_allocator_background_exit(struct bch_dev *ca)

View File

@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);

View File

@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
enum bch_watermark watermark,
enum bch_data_type data_type,
struct closure *cl,
bool nowait,
struct bch_dev_usage *usage)
{
struct bch_fs *c = trans->c;
@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bucket_alloc_state s = {
.btree_bitmap = data_type == BCH_DATA_btree,
};
bool waiting = false;
bool waiting = nowait;
again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
data_type, cl, &usage)));
data_type, cl, false, &usage)));
return ob;
}
@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
unsigned flags,
struct open_bucket *ob)
{
unsigned durability = ob_dev(c, ob)->mi.durability;
@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
unsigned flags,
enum bch_write_flags flags,
enum bch_data_type data_type,
enum bch_watermark watermark,
struct closure *cl)
@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, flags, ob)) {
have_cache, ob)) {
ret = 0;
break;
}
@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
unsigned flags,
enum bch_write_flags flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
@ -883,7 +884,7 @@ got_bucket:
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, flags, ob);
have_cache, ob);
out_put_head:
bch2_ec_stripe_head_put(c, h);
return ret;
@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
bool ec, unsigned flags)
bool ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
have_cache, ec, ob))
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, flags, ob);
have_cache, ob);
else
ob_push(c, &ptrs_skip, ob);
}
@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache, bool ec,
enum bch_watermark watermark,
unsigned flags)
enum bch_watermark watermark)
{
int i, ret = 0;
@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, flags, ob);
have_cache, ob);
if (ret)
break;
}
@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
unsigned flags,
enum bch_write_flags flags,
struct closure *_cl)
{
struct bch_fs *c = trans->c;
@ -1024,13 +1024,13 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, erasure_code, flags);
have_cache, erasure_code);
if (ret)
return ret;
ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, erasure_code, watermark, flags);
have_cache, erasure_code, watermark);
if (ret)
return ret;
@ -1071,7 +1071,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
unsigned flags,
enum bch_write_flags flags,
struct closure *cl)
{
int ret;
@ -1373,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum bch_watermark watermark,
unsigned flags,
enum bch_write_flags flags,
struct closure *cl,
struct write_point **wp_ret)
{
@ -1389,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
@ -1495,11 +1493,12 @@ err:
try_decrease_writepoints(trans, write_points_nr))
goto retry;
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
ret = -BCH_ERR_bucket_alloc_blocked;
if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
return cl
? -BCH_ERR_bucket_alloc_blocked
: -BCH_ERR_ENOSPC_bucket_alloc;
ret = -BCH_ERR_bucket_alloc_blocked;
return ret;
}
@ -1730,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 12);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
bch2_dev_usage_to_text(out, ca, &stats);
prt_newline(out);

View File

@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
return ret;
}
enum bch_write_flags;
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, unsigned,
unsigned, unsigned *, bool *, enum bch_write_flags,
enum bch_data_type, enum bch_watermark,
struct closure *);
@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct bch_devs_list *,
unsigned, unsigned,
enum bch_watermark,
unsigned,
enum bch_write_flags,
struct closure *,
struct write_point **);

View File

@ -3,12 +3,14 @@
#include "bbpos.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "checksum.h"
#include "disk_accounting.h"
#include "error.h"
#include <linux/mm.h>
@ -750,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
bch2_btree_cache_unpin(c);
btree_interior_mask |= btree_leaf_mask;
c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@ -775,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
bch2_node_pin(c, b);
0;
}));
}
@ -782,12 +787,80 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
struct progress_indicator_state {
unsigned long next_print;
u64 nodes_seen;
u64 nodes_total;
struct btree *last_node;
};
static inline void progress_init(struct progress_indicator_state *s,
struct bch_fs *c,
u64 btree_id_mask)
{
memset(s, 0, sizeof(*s));
s->next_print = jiffies + HZ * 10;
for (unsigned i = 0; i < BTREE_ID_NR; i++) {
if (!(btree_id_mask & BIT_ULL(i)))
continue;
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_btree,
.btree.id = i,
};
u64 v;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
s->nodes_total += div64_ul(v, btree_sectors(c));
}
}
static inline bool progress_update_p(struct progress_indicator_state *s)
{
bool ret = time_after_eq(jiffies, s->next_print);
if (ret)
s->next_print = jiffies + HZ * 10;
return ret;
}
static void progress_update_iter(struct btree_trans *trans,
struct progress_indicator_state *s,
struct btree_iter *iter,
const char *msg)
{
struct bch_fs *c = trans->c;
struct btree *b = path_l(btree_iter_path(trans, iter))->b;
s->nodes_seen += b != s->last_node;
s->last_node = b;
if (progress_update_p(s)) {
struct printbuf buf = PRINTBUF;
unsigned percent = s->nodes_total
? div64_u64(s->nodes_seen * 100, s->nodes_total)
: 0;
prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
msg, percent, s->nodes_seen, s->nodes_total);
bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
}
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
struct progress_indicator_state progress;
int ret = 0;
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
@ -805,6 +878,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@ -865,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@ -920,19 +993,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
struct bch_fs *c = trans->c;
struct bkey_buf last_flushed;
struct progress_indicator_state progress;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed));
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, trans->c);
bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
@ -977,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;

View File

@ -542,7 +542,7 @@ struct bch_dev {
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
@ -871,6 +871,7 @@ struct bch_fs {
/* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
unsigned long rw_devs_change_count;
u64 capacity; /* sectors */
u64 reserved; /* sectors */
@ -1045,8 +1046,6 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
/* bitmap of explicitly enabled recovery passes: */
u64 recovery_passes_explicit;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
@ -1195,12 +1194,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
s64 sec;
s32 rem;
time += c->sb.time_base_lo;
t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
t.tv_nsec = rem * c->sb.nsec_per_time_unit;
sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
return t;
}

View File

@ -15,11 +15,12 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do { \
if (shrinker_counter) \
bc->not_freed_##counter++; \
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)
const char * const bch2_btree_node_flags[] = {
@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
unsigned i, reserve = 16;
unsigned reserve = 16;
if (!c->btree_roots_known[0].b)
reserve += 8;
for (i = 0; i < btree_id_nr_alive(c); i++) {
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
}
c->btree_cache.reserve = reserve;
c->btree_cache.nr_reserve = reserve;
}
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
return max_t(int, 0, bc->used - bc->reserve);
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
size_t can_free = list->nr;
if (!list->idx)
can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
BUG_ON(btree_node_hashed(b));
/*
* This should really be done in slub/vmalloc, but we're using the
* kmalloc_large() path, so we're working around a slub bug by doing
* this here:
*/
if (b->data)
mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
if (b->aux_data)
mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
EBUG_ON(btree_node_write_in_flight(b));
clear_btree_node_just_written(b);
@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
#endif
b->aux_data = NULL;
bc->used--;
bc->nr_freeable--;
btree_node_to_freedlist(bc, b);
}
@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
bch2_btree_lock_init(&b->c, 0);
bc->used++;
bc->nr_freeable++;
list_add(&b->list, &bc->freeable);
return b;
}
@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
u64 mask = bc->pinned_nodes_mask[!!b->c.level];
return ((mask & BIT_ULL(b->c.btree_id)) &&
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
mutex_lock(&bc->lock);
BUG_ON(!__btree_node_pinned(bc, b));
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
set_btree_node_pinned(b);
list_move(&b->list, &bc->live[1].list);
bc->live[0].nr--;
bc->live[1].nr++;
}
mutex_unlock(&bc->lock);
}
void bch2_btree_cache_unpin(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *n;
mutex_lock(&bc->lock);
c->btree_cache.pinned_nodes_mask[0] = 0;
c->btree_cache.pinned_nodes_mask[1] = 0;
list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
clear_btree_node_pinned(b);
list_move(&b->list, &bc->live[0].list);
bc->live[0].nr++;
bc->live[1].nr--;
}
mutex_unlock(&bc->lock);
}
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
lockdep_assert_held(&bc->lock);
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
b->hash_val = 0;
if (b->c.btree_id < BTREE_ID_NR)
--bc->used_by_btree[b->c.btree_id];
--bc->nr_by_btree[b->c.btree_id];
bc->live[btree_node_pinned(b)].nr--;
bc->nr_freeable++;
list_move(&b->list, &bc->freeable);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
if (!ret && b->c.btree_id < BTREE_ID_NR)
bc->used_by_btree[b->c.btree_id]++;
return ret;
if (ret)
return ret;
if (b->c.btree_id < BTREE_ID_NR)
bc->nr_by_btree[b->c.btree_id]++;
bool p = __btree_node_pinned(bc, b);
mod_bit(BTREE_NODE_pinned, &b->flags, p);
list_move_tail(&b->list, &bc->live[p].list);
bc->live[p].nr++;
bc->nr_freeable--;
return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
int ret;
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
list_add_tail(&b->list, &bc->live);
int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
int ret = 0;
lockdep_assert_held(&bc->lock);
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
u64 mask = b->c.level
? bc->pinned_nodes_interior_mask
: bc->pinned_nodes_leaf_mask;
if ((mask & BIT_ULL(b->c.btree_id)) &&
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
return -BCH_ERR_ENOMEM_btree_node_reclaim;
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree_cache_list *list = shrink->private_data;
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
bool trigger_writes = atomic_read(&bc->dirty) + nr >=
bc->used * 3 / 4;
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
can_free = btree_cache_can_free(bc);
can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
bc->freed++;
bc->nr_freed++;
}
}
restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
bc->not_freed_access_bit++;
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
--touched;;
} else if (!btree_node_reclaim(c, b, true)) {
bch2_btree_node_hash_remove(bc, b);
freed++;
btree_node_data_free(c, b);
bc->freed++;
bc->nr_freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@ -450,7 +517,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
list_move(&bc->live, &b->list);
list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@ -464,8 +531,8 @@ restart:
break;
}
out_rotate:
if (&t->list != &bc->live)
list_move_tail(&bc->live, &t->list);
if (&t->list != &list->list)
list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@ -478,44 +545,45 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
return btree_cache_can_free(bc);
return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
unsigned i, flags;
struct btree *b, *t;
unsigned long flags;
shrinker_free(bc->shrink);
shrinker_free(bc->live[1].shrink);
shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
list_move(&c->verify_data->list, &bc->live[0].list);
kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
list_add(&r->b->list, &bc->live);
list_add(&r->b->list, &bc->live[0].list);
}
list_splice(&bc->freeable, &bc->live);
while (!list_empty(&bc->live)) {
b = list_first_entry(&bc->live, struct btree, list);
list_for_each_entry_safe(b, t, &bc->live[1].list, list)
bch2_btree_node_hash_remove(bc, b);
list_for_each_entry_safe(b, t, &bc->live[0].list, list)
bch2_btree_node_hash_remove(bc, b);
list_for_each_entry_safe(b, t, &bc->freeable, list) {
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
}
BUG_ON(!bch2_journal_error(&c->journal) &&
atomic_read(&c->btree_cache.dirty));
atomic_long_read(&c->btree_cache.nr_dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
while (!list_empty(&bc->freed_nonpcpu)) {
b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
list_del(&b->list);
six_lock_exit(&b->c.lock);
kfree(b);
@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
BUG_ON(bc->nr_by_btree[i]);
BUG_ON(bc->live[0].nr);
BUG_ON(bc->live[1].nr);
BUG_ON(bc->nr_freeable);
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->reserve; i++)
for (i = 0; i < bc->nr_reserve; i++)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
list_splice_init(&bc->live, &bc->freeable);
list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
bc->shrink = shrink;
bc->live[0].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
shrink->seeks = 4;
shrink->private_data = c;
shrink->seeks = 2;
shrink->private_data = &bc->live[0];
shrinker_register(shrink);
shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
if (!shrink)
goto err;
bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
shrink->seeks = 8;
shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@ -582,7 +665,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
bc->live[i].idx = i;
INIT_LIST_HEAD(&bc->live[i].list);
}
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_reclaim(c, b, false))
return b;
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list)
if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_write_and_reclaim(c, b))
return b;
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list)
if (!btree_node_write_and_reclaim(c, b))
return b;
/*
* Rare case: all nodes were intent-locked.
@ -716,14 +804,15 @@ got_node:
mutex_unlock(&bc->lock);
if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
if (memalloc_flags_do(PF_MEMALLOC_NORECLAIM,
btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))) {
bch2_trans_unlock(trans);
if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
goto err;
}
mutex_lock(&bc->lock);
bc->used++;
bc->nr_freeable++;
got_mem:
mutex_unlock(&bc->lock);
@ -1264,8 +1353,8 @@ wait_on_io:
BUG_ON(btree_node_dirty(b));
mutex_lock(&bc->lock);
btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
btree_node_data_free(c, b);
mutex_unlock(&bc->lock);
out:
six_unlock_write(&b->c.lock);
@ -1337,13 +1426,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
}
static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
const char *label, unsigned nr)
const char *label, size_t nr)
{
prt_printf(out, "%s\t", label);
prt_human_readable_u64(out, nr * c->opts.btree_node_size);
prt_printf(out, " (%u)\n", nr);
prt_printf(out, " (%zu)\n", nr);
}
static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
#define x(n) #n,
BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
NULL
};
void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
@ -1351,24 +1447,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
prt_btree_cache_line(out, c, "total:", bc->used);
prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
prt_newline(out);
prt_printf(out, "freed:\t%u\n", bc->freed);
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
prt_printf(out, "not freed:\n");
prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
prt_printf(out, " %s\t%llu\n",
bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}

View File

@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
void bch2_node_pin(struct bch_fs *, struct btree *);
void bch2_btree_cache_unpin(struct bch_fs *);
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);

View File

@ -549,9 +549,8 @@ reconstruct_root:
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
r->b = NULL;
@ -753,10 +752,8 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
}
for_each_member_device(c, ca)
genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
@ -910,20 +907,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
int ret = 0;
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
if (ret) {
bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_alloc_start;
break;
}
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
buckets->nbuckets_minus_first =
buckets->nbuckets - buckets->first_bucket;
rcu_assign_pointer(ca->buckets_gc, buckets);
}
bch_err_fn(c, ret);

View File

@ -1666,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_btree_pos_to_text(&buf, c, b);
bch_err_ratelimited(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
@ -1749,10 +1749,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
ret = -BCH_ERR_btree_node_read_error;
@ -2031,7 +2029,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
do_write:
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
atomic_dec(&c->btree_cache.dirty);
atomic_long_dec(&c->btree_cache.nr_dirty);
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);

View File

@ -18,13 +18,13 @@ struct btree_node_read_all;
static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
atomic_inc(&c->btree_cache.dirty);
atomic_long_inc(&c->btree_cache.nr_dirty);
}
static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
atomic_dec(&c->btree_cache.dirty);
atomic_long_dec(&c->btree_cache.nr_dirty);
}
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)

View File

@ -6,6 +6,8 @@
#include "btree_types.h"
#include "trace.h"
#include <linux/sched/mm.h>
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
@ -529,6 +531,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
/**
* bch2_trans_kmalloc - allocate memory for use by the current transaction
*
* Must be called after bch2_trans_begin, which on second and further calls
* frees all memory allocated in this transaction
*/
static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
size = roundup(size, 8);
@ -865,29 +873,33 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
(_do) ?: bch2_trans_relock(_trans); \
})
#define allocate_dropping_locks_errcode(_trans, _do) \
({ \
gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
int _ret = _do; \
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_gfp = GFP_KERNEL; \
_ret = drop_locks_do(_trans, _do); \
} \
_ret; \
#define memalloc_flags_do(_flags, _do) \
({ \
unsigned _saved_flags = memalloc_flags_save(_flags); \
typeof(_do) _ret = _do; \
memalloc_noreclaim_restore(_saved_flags); \
_ret; \
})
#define allocate_dropping_locks(_trans, _ret, _do) \
({ \
gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
typeof(_do) _p = _do; \
\
_ret = 0; \
if (unlikely(!_p)) { \
_gfp = GFP_KERNEL; \
_ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
#define allocate_dropping_locks_errcode(_trans, _do) \
({ \
int _ret = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_ret = drop_locks_do(_trans, _do); \
} \
_ret; \
})
#define allocate_dropping_locks(_trans, _ret, _do) \
({ \
typeof(_do) _p = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
\
_ret = 0; \
if (unlikely(!_p)) { \
_ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
})
#define bch2_trans_run(_c, _do) \

View File

@ -530,6 +530,8 @@ static void __journal_keys_sort(struct journal_keys *keys)
{
sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
cond_resched();
struct journal_key *dst = keys->data;
darray_for_each(*keys, src) {

View File

@ -116,8 +116,10 @@ static void bkey_cached_free(struct btree_key_cache *bc,
this_cpu_inc(*bc->nr_pending);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s)
{
gfp_t gfp = GFP_KERNEL|__GFP_ACCOUNT|__GFP_RECLAIMABLE;
struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
if (unlikely(!ck))
return NULL;
@ -145,7 +147,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
goto lock;
ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
__bkey_cached_alloc(key_u64s));
if (ret) {
if (ck)
kfree(ck->k);
@ -239,7 +241,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
kmalloc(key_u64s * sizeof(u64), _gfp));
kmalloc(key_u64s * sizeof(u64), GFP_KERNEL));
if (unlikely(!new_k)) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(ck->key.btree_id), key_u64s);

View File

@ -138,6 +138,31 @@ struct btree {
struct list_head list;
};
#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
x(lock_intent) \
x(lock_write) \
x(dirty) \
x(read_in_flight) \
x(write_in_flight) \
x(noevict) \
x(write_blocked) \
x(will_make_reachable) \
x(access_bit)
enum bch_btree_cache_not_freed_reasons {
#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
BCH_BTREE_CACHE_NOT_FREED_REASONS()
#undef x
BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
};
struct btree_cache_list {
unsigned idx;
struct shrinker *shrink;
struct list_head list;
size_t nr;
};
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@ -155,28 +180,19 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
struct btree_cache_list live[2];
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
unsigned freed;
unsigned not_freed_lock_intent;
unsigned not_freed_lock_write;
unsigned not_freed_dirty;
unsigned not_freed_read_in_flight;
unsigned not_freed_write_in_flight;
unsigned not_freed_noevict;
unsigned not_freed_write_blocked;
unsigned not_freed_will_make_reachable;
unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker *shrink;
size_t nr_freeable;
size_t nr_reserve;
size_t nr_by_btree[BTREE_ID_NR];
atomic_long_t nr_dirty;
unsigned used_by_btree[BTREE_ID_NR];
/* shrinker stats */
size_t nr_freed;
u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
/*
* If we need to allocate memory for a new btree node and that
@ -189,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
u64 pinned_nodes_leaf_mask;
u64 pinned_nodes_interior_mask;
/* btree id mask: 0 for leaves, 1 for interior */
u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@ -582,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
x(never_write)
x(never_write) \
x(pinned)
enum btree_flags {
/* First bits for btree node write type */

View File

@ -16,6 +16,7 @@
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
@ -145,7 +146,7 @@ fsck_err:
printbuf_exit(&buf);
return ret;
topology_repair:
if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
bch2_inconsistent_error(c);
ret = -BCH_ERR_btree_need_topology_repair;
@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
__btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
list_add_tail(&b->list, &c->btree_cache.live);
list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);

View File

@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
struct bch_dev_usage *usage)
{
if (out->nr_tabstops < 5) {
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 12);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
}
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
@ -100,12 +109,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (!ca) {
if (fsck_err(trans, ptr_to_invalid_device,
"pointer to missing device %u\n"
"while marking %s",
p.ptr.dev,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
trans, ptr_to_invalid_device,
"pointer to missing device %u\n"
"while marking %s",
p.ptr.dev,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
*do_update = true;
return 0;
}
@ -476,7 +486,7 @@ out:
return ret;
err:
bch2_dump_trans_updates(trans);
ret = -EIO;
ret = -BCH_ERR_bucket_ref_update;
goto out;
}
@ -562,8 +572,8 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
if (insert)
ret = -EIO;
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
ret = -BCH_ERR_trigger_pointer;
goto err;
}
@ -592,7 +602,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_trigger_pointer;
goto err_unlock;
}
@ -637,7 +647,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
ret = -EIO;
ret = -BCH_ERR_trigger_stripe_pointer;
goto err;
}
@ -676,7 +686,7 @@ err:
(u64) p.ec.idx, buf.buf);
printbuf_exit(&buf);
bch2_inconsistent_error(c);
return -EIO;
return -BCH_ERR_trigger_stripe_pointer;
}
m->block_sectors[p.ec.block] += sectors;
@ -740,7 +750,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
} else if (!p.has_ec) {
*replicas_sectors += disk_sectors;
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
@ -876,7 +886,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
need_rebalance_delta -= s != 0;
need_rebalance_sectors_delta -= s;
s = bch2_bkey_sectors_need_rebalance(c, old);
s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
need_rebalance_delta += s != 0;
need_rebalance_sectors_delta += s;
@ -956,7 +966,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
ret = -EIO;
ret = -BCH_ERR_metadata_bucket_inconsistency;
goto err;
}
@ -1012,7 +1022,7 @@ err:
bucket_unlock(g);
err_unlock:
percpu_up_read(&c->mark_lock);
return -EIO;
return -BCH_ERR_metadata_bucket_inconsistency;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,

View File

@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
TASK_UNINTERRUPTIBLE);
}
static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets_gc,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
lockdep_is_held(&ca->fs->state_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
return NULL;
return buckets->b + b;
return genradix_ptr(&ca->buckets_gc, b);
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)

View File

@ -19,14 +19,6 @@ struct bucket {
u32 stripe_sectors;
} __aligned(sizeof(long));
struct bucket_array {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
size_t nbuckets_minus_first;
struct bucket b[] __counted_by(nbuckets);
};
struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;

View File

@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
int ret = crypto_skcipher_encrypt(req);
if (ret)
pr_err("got error %i from crypto_skcipher_encrypt()", ret);
@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
void *buf, size_t len)
{
if (!is_vmalloc_addr(buf)) {
struct scatterlist sg;
struct scatterlist sg = {};
sg_init_table(&sg, 1);
sg_set_page(&sg,
is_vmalloc_addr(buf)
? vmalloc_to_page(buf)
: virt_to_page(buf),
len, offset_in_page(buf));
sg_mark_end(&sg);
sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
return do_encrypt_sg(tfm, nonce, &sg, len);
} else {
unsigned pages = buf_pages(buf, len);
struct scatterlist *sg;
size_t orig_len = len;
int ret, i;
DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
size_t sgl_len = 0;
int ret;
sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
if (!sg)
return -BCH_ERR_ENOMEM_do_encrypt;
darray_init(&sgl);
sg_init_table(sg, pages);
for (i = 0; i < pages; i++) {
while (len) {
unsigned offset = offset_in_page(buf);
unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
struct scatterlist sg = {
.page_link = (unsigned long) vmalloc_to_page(buf),
.offset = offset,
.length = min(len, PAGE_SIZE - offset),
};
sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
buf += pg_len;
len -= pg_len;
if (darray_push(&sgl, sg)) {
sg_mark_end(&darray_last(sgl));
ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
if (ret)
goto err;
nonce = nonce_add(nonce, sgl_len);
sgl_len = 0;
sgl.nr = 0;
BUG_ON(darray_push(&sgl, sg));
}
buf += sg.length;
len -= sg.length;
sgl_len += sg.length;
}
ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
kfree(sg);
sg_mark_end(&darray_last(sgl));
ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
err:
darray_exit(&sgl);
return ret;
}
}
@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
struct scatterlist sgl[16], *sg = sgl;
size_t bytes = 0;
DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
size_t sgl_len = 0;
int ret = 0;
if (!bch2_csum_type_is_encryption(type))
return 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
darray_init(&sgl);
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
struct scatterlist sg = {
.page_link = (unsigned long) bv.bv_page,
.offset = bv.bv_offset,
.length = bv.bv_len,
};
ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
if (darray_push(&sgl, sg)) {
sg_mark_end(&darray_last(sgl));
ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
if (ret)
return ret;
goto err;
nonce = nonce_add(nonce, bytes);
bytes = 0;
nonce = nonce_add(nonce, sgl_len);
sgl_len = 0;
sgl.nr = 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
sg = sgl;
BUG_ON(darray_push(&sgl, sg));
}
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
bytes += bv.bv_len;
}
if (sg != sgl) {
sg_mark_end(sg - 1);
return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
sgl_len += sg.length;
}
sg_mark_end(&darray_last(sgl));
ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
err:
darray_exit(&sgl);
return ret;
}

View File

@ -337,6 +337,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
ret = -EIO;
goto out;
}
@ -570,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}

View File

@ -18,6 +18,7 @@
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_write.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
bch2_prt_csum_type(out, s.csum_type);
prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
if (s.disk_label) {
prt_str(out, " label");
bch2_disk_path_to_text(out, c, s.disk_label - 1);
}
for (unsigned i = 0; i < s.nr_blocks; i++) {
const struct bch_extent_ptr *ptr = sp->ptrs + i;
if ((void *) ptr >= bkey_val_end(k))
break;
prt_char(out, ' ');
bch2_extent_ptr_to_text(out, c, ptr);
if (s.csum_type < BCH_CSUM_NR &&
@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_mark_stripe;
goto err;
}
@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_mark_stripe;
goto err;
}
} else {
@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_mark_stripe;
goto err;
}
@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_mark_stripe;
goto err;
}
@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_mark_stripe;
goto err;
}
}
@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
if (!(flags & BTREE_TRIGGER_overwrite))
ret = -EIO;
if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
ret = -BCH_ERR_mark_stripe;
goto err;
}
@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -EIO;
ret = -BCH_ERR_mark_stripe;
goto err_unlock;
}
@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
{
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
m->nr_redundant = s->nr_redundant;
m->disk_label = s->disk_label;
m->blocks_nonempty = 0;
for (unsigned i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
}
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
memset(m, 0, sizeof(*m));
} else {
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
m->nr_blocks = new_s->nr_blocks;
m->nr_redundant = new_s->nr_redundant;
m->blocks_nonempty = 0;
for (unsigned i = 0; i < new_s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
stripe_to_mem(m, new_s);
if (!old_s)
bch2_stripes_heap_insert(c, m, idx);
@ -816,13 +829,15 @@ err:
}
/* recovery read path: */
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf;
struct ec_stripe_buf *buf = NULL;
struct closure cl;
struct bch_stripe *v;
unsigned i, offset;
const char *msg = NULL;
int ret = 0;
closure_init_stack(&cl);
@ -835,32 +850,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret);
kfree(buf);
return -EIO;
msg = "stripe not found";
goto err;
}
v = &bkey_i_to_stripe(&buf->key)->v;
if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
bch_err_ratelimited(c,
"error doing reconstruct read: pointer doesn't match stripe");
ret = -EIO;
msg = "pointer doesn't match stripe";
goto err;
}
offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
bch_err_ratelimited(c,
"error doing reconstruct read: read is bigger than stripe");
ret = -EIO;
msg = "read is bigger than stripe";
goto err;
}
ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
if (ret)
if (ret) {
msg = "-ENOMEM";
goto err;
}
for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
@ -868,9 +879,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
ret = -EIO;
msg = "unable to read enough blocks";
goto err;
}
@ -882,20 +891,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
out:
ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
err:
struct printbuf msgbuf = PRINTBUF;
bch2_bkey_val_to_text(&msgbuf, c, orig_k);
bch_err_ratelimited(c,
"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
printbuf_exit(&msgbuf);;
ret = -BCH_ERR_stripe_reconstruct;
goto out;
}
/* stripe bucket accounting: */
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx)
{
ec_stripes_heap n, *h = &c->ec_stripes_heap;
if (idx >= h->size) {
if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), GFP_KERNEL))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
mutex_lock(&c->ec_stripes_heap_lock);
@ -909,11 +926,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
free_heap(&n);
}
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
if (!genradix_ptr_alloc(&c->stripes, idx, GFP_KERNEL))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
!genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
return 0;
@ -923,7 +940,7 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
struct btree_iter *iter)
{
return allocate_dropping_locks_errcode(trans,
__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
__ec_stripe_mem_alloc(trans->c, iter->pos.offset));
}
/*
@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bkey_reassemble(n, k);
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
BUG_ON(!ec_ptr);
@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s = h->s;
lockdep_assert_held(&h->lock);
BUG_ON(!s->allocated && !s->err);
h->s = NULL;
@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
ec_stripe_new_put(c, s, STRIPE_REF_io);
}
static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
{
h->s->err = err;
ec_stripe_new_set_pending(c, h);
}
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
struct ec_stripe_new *s = ob->ec;
@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
struct bkey_i *k,
unsigned nr_data,
unsigned nr_parity,
unsigned stripe_size)
unsigned stripe_size,
unsigned disk_label)
{
struct bkey_i_stripe *s = bkey_stripe_init(k);
unsigned u64s;
@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
s->v.nr_redundant = nr_parity;
s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
s->v.csum_type = BCH_CSUM_crc32c;
s->v.pad = 0;
s->v.disk_label = disk_label;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
BUG_ON(1 << s->v.csum_granularity_bits >=
@ -1685,14 +1711,65 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s->nr_parity = h->redundancy;
ec_stripe_key_init(c, &s->new_stripe.key,
s->nr_data, s->nr_parity, h->blocksize);
s->nr_data, s->nr_parity,
h->blocksize, h->disk_label);
h->s = s;
h->nr_created++;
return 0;
}
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
? group_to_target(h->disk_label - 1)
: 0);
unsigned nr_devs = dev_mask_nr(&h->devs);
for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
__clear_bit(ca->dev_idx, h->devs.d);
unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
h->blocksize = pick_blocksize(c, &h->devs);
h->nr_active_devs = 0;
for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
rcu_read_unlock();
/*
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
if (h->insufficient_devs) {
const char *err;
if (nr_devs < h->redundancy + 2)
err = NULL;
else if (nr_devs_with_durability < h->redundancy + 2)
err = "cannot use durability=0 devices";
else
err = "mismatched bucket sizes";
if (err)
bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
h->nr_active_devs, h->redundancy + 2, err);
}
if (h->s && !h->s->allocated)
ec_stripe_new_cancel(c, h, -EINTR);
h->rw_devs_change_count = c->rw_devs_change_count;
}
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
unsigned algo, unsigned redundancy,
enum bch_watermark watermark)
{
@ -1705,34 +1782,11 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
mutex_init(&h->lock);
BUG_ON(!mutex_trylock(&h->lock));
h->target = target;
h->disk_label = disk_label;
h->algo = algo;
h->redundancy = redundancy;
h->watermark = watermark;
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
__clear_bit(ca->dev_idx, h->devs.d);
h->blocksize = pick_blocksize(c, &h->devs);
for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
rcu_read_unlock();
/*
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
if (h->nr_active_devs < h->redundancy + 2)
bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
h->nr_active_devs, h->redundancy + 2);
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
@ -1743,14 +1797,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data) == h->s->nr_data)
ec_stripe_set_pending(c, h);
ec_stripe_new_set_pending(c, h);
mutex_unlock(&h->lock);
}
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
unsigned target,
unsigned disk_label,
unsigned algo,
unsigned redundancy,
enum bch_watermark watermark)
@ -1768,27 +1822,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
goto found;
goto err;
}
list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
if (h->disk_label == disk_label &&
h->algo == algo &&
h->redundancy == redundancy &&
h->watermark == watermark) {
ret = bch2_trans_mutex_lock(trans, &h->lock);
if (ret)
if (ret) {
h = ERR_PTR(ret);
goto err;
}
goto found;
}
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
found:
if (!IS_ERR_OR_NULL(h) &&
h->nr_active_devs < h->redundancy + 2) {
if (h->rw_devs_change_count != c->rw_devs_change_count)
ec_stripe_head_devs_update(c, h);
if (h->insufficient_devs) {
mutex_unlock(&h->lock);
h = NULL;
}
err:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@ -1796,38 +1855,39 @@ found:
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
enum bch_watermark watermark, struct closure *cl)
{
struct ec_stripe_new *s = h->s;
struct bch_fs *c = trans->c;
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
int ret = 0;
BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
BUG_ON(v->nr_redundant != h->s->nr_parity);
BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
BUG_ON(v->nr_redundant != s->nr_parity);
/* * We bypass the sector allocator which normally does this: */
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
__clear_bit(v->ptrs[i].dev, devs.d);
if (i < h->s->nr_data)
if (i < s->nr_data)
nr_have_data++;
else
nr_have_parity++;
}
BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity);
BUG_ON(nr_have_data > s->nr_data);
BUG_ON(nr_have_parity > s->nr_parity);
buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) {
if (nr_have_parity < s->nr_parity) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->parity_stripe,
&devs,
h->s->nr_parity,
s->nr_parity,
&nr_have_parity,
&have_cache, 0,
BCH_DATA_parity,
@ -1835,14 +1895,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
j = find_next_zero_bit(h->s->blocks_gotten,
h->s->nr_data + h->s->nr_parity,
h->s->nr_data);
BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data + s->nr_parity,
s->nr_data);
BUG_ON(j >= s->nr_data + s->nr_parity);
h->s->blocks[j] = buckets.v[i];
s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, h->s->blocks_gotten);
__set_bit(j, s->blocks_gotten);
}
if (ret)
@ -1850,11 +1910,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
}
buckets.nr = 0;
if (nr_have_data < h->s->nr_data) {
if (nr_have_data < s->nr_data) {
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->block_stripe,
&devs,
h->s->nr_data,
s->nr_data,
&nr_have_data,
&have_cache, 0,
BCH_DATA_user,
@ -1862,13 +1922,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
cl);
open_bucket_for_each(c, &buckets, ob, i) {
j = find_next_zero_bit(h->s->blocks_gotten,
h->s->nr_data, 0);
BUG_ON(j >= h->s->nr_data);
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data, 0);
BUG_ON(j >= s->nr_data);
h->s->blocks[j] = buckets.v[i];
s->blocks[j] = buckets.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, h->s->blocks_gotten);
__set_bit(j, s->blocks_gotten);
}
if (ret)
@ -1878,7 +1938,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
return 0;
}
/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
struct ec_stripe_head *head)
{
@ -1901,7 +1960,8 @@ static s64 get_existing_stripe(struct bch_fs *c,
m = genradix_ptr(&c->stripes, stripe_idx);
if (m->algorithm == head->algo &&
if (m->disk_label == head->disk_label &&
m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
@ -1914,12 +1974,53 @@ static s64 get_existing_stripe(struct bch_fs *c,
return ret;
}
static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
{
struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
unsigned i;
BUG_ON(existing_v->nr_redundant != s->nr_parity);
s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
bch2_stripe_close(c, s);
return ret;
}
BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
/*
* Free buckets we initially allocated - they might conflict with
* blocks from the stripe we're reusing:
*/
for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
s->blocks[i] = 0;
}
memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
for (i = 0; i < existing_v->nr_blocks; i++) {
if (stripe_blockcount_get(existing_v, i)) {
__set_bit(i, s->blocks_gotten);
__set_bit(i, s->blocks_allocated);
}
ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
}
bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
s->have_existing_stripe = true;
return 0;
}
static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
struct bch_fs *c = trans->c;
struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
struct bch_stripe *existing_v;
unsigned i;
s64 idx;
int ret;
@ -1939,45 +2040,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
return ret;
}
existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
h->s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
if (ret) {
bch2_stripe_close(c, h->s);
return ret;
}
BUG_ON(h->s->existing_stripe.size != h->blocksize);
BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
/*
* Free buckets we initially allocated - they might conflict with
* blocks from the stripe we're reusing:
*/
for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
h->s->blocks[i] = 0;
}
memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
for (i = 0; i < existing_v->nr_blocks; i++) {
if (stripe_blockcount_get(existing_v, i)) {
__set_bit(i, h->s->blocks_gotten);
__set_bit(i, h->s->blocks_allocated);
}
ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
}
bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
h->s->have_existing_stripe = true;
return 0;
return init_new_stripe_from_existing(c, h->s);
}
static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
@ -2046,9 +2109,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
bool waiting = false;
unsigned disk_label = 0;
struct target t = target_decode(target);
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
if (t.type == TARGET_GROUP) {
if (t.group > U8_MAX) {
bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
return NULL;
}
disk_label = t.group + 1; /* 0 == no label */
}
h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
if (IS_ERR_OR_NULL(h))
return h;
@ -2126,6 +2199,79 @@ err:
return ERR_PTR(ret);
}
/* device removal */
static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
{
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
if (!a->stripe)
return 0;
if (a->stripe_sectors) {
bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
return -BCH_ERR_invalidate_stripe_to_dev;
}
struct btree_iter iter;
struct bkey_i_stripe *s =
bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
BTREE_ITER_slots, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_replicas,
};
s64 sectors = 0;
for (unsigned i = 0; i < s->v.nr_blocks; i++)
sectors -= stripe_blockcount_get(&s->v, i);
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
if (ret)
goto err;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
bkey_for_each_ptr(ptrs, ptr)
if (ptr->dev == k_a.k->p.inode) {
if (stripe_blockcount_get(&s->v, ptr - &ptrs.start->ptr)) {
bch_err(trans->c, "trying to invalidate device in stripe when stripe block not empty");
ret = -BCH_ERR_invalidate_stripe_to_dev;
goto err;
}
ptr->dev = BCH_SB_MEMBER_INVALID;
}
sectors = -sectors;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
if (ret)
goto err;
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_dev_remove_stripes(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_trans_run(c,
for_each_btree_key_upto_commit(trans, iter,
BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX),
BTREE_ITER_intent, k,
NULL, NULL, 0, ({
bch2_invalidate_stripe_to_dev(trans, k);
})));
}
/* startup/shutdown */
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
@ -2151,8 +2297,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
}
goto unlock;
found:
h->s->err = -BCH_ERR_erofs_no_writes;
ec_stripe_set_pending(c, h);
ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
unlock:
mutex_unlock(&h->lock);
}
@ -2193,21 +2338,13 @@ int bch2_stripes_read(struct bch_fs *c)
if (k.k->type != KEY_TYPE_stripe)
continue;
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
ret = __ec_stripe_mem_alloc(c, k.k->p.offset);
if (ret)
break;
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
m->nr_redundant = s->nr_redundant;
m->blocks_nonempty = 0;
for (unsigned i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
bch2_stripes_heap_insert(c, m, k.k->p.offset);
0;
@ -2252,6 +2389,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
prt_printf(out, " %u", s->blocks[i]);
prt_newline(out);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
prt_newline(out);
}
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
@ -2261,9 +2400,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
prt_printf(out, "target %u algo %u redundancy %u %s:\n",
h->target, h->algo, h->redundancy,
bch2_watermarks[h->watermark]);
prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
h->disk_label, h->algo, h->redundancy,
bch2_watermarks[h->watermark],
h->nr_created);
if (h->s)
bch2_new_stripe_to_text(out, c, h->s);

View File

@ -97,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
{
return data_ptr->dev == stripe_ptr->dev &&
return (data_ptr->dev == stripe_ptr->dev ||
data_ptr->dev == BCH_SB_MEMBER_INVALID ||
stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
data_ptr->gen == stripe_ptr->gen &&
data_ptr->offset >= stripe_ptr->offset &&
data_ptr->offset < stripe_ptr->offset + sectors;
@ -186,10 +188,15 @@ struct ec_stripe_head {
struct list_head list;
struct mutex lock;
unsigned target;
unsigned disk_label;
unsigned algo;
unsigned redundancy;
enum bch_watermark watermark;
bool insufficient_devs;
unsigned long rw_devs_change_count;
u64 nr_created;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@ -202,7 +209,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
@ -247,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
int bch2_dev_remove_stripes(struct bch_fs *, struct bch_dev *);
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
void bch2_fs_ec_flush(struct bch_fs *);

View File

@ -11,7 +11,14 @@ struct bch_stripe {
__u8 csum_granularity_bits;
__u8 csum_type;
__u8 pad;
/*
* XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
*
* we can manage with this because this only needs to point to a
* disk label, not a target:
*/
__u8 disk_label;
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);

View File

@ -16,6 +16,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
u8 blocks_nonempty;
u8 disk_label;
};
struct gc_stripe {

View File

@ -119,8 +119,8 @@
x(EEXIST, EEXIST_str_hash_set) \
x(EEXIST, EEXIST_discard_in_flight_add) \
x(EEXIST, EEXIST_subvolume_create) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(ENOSPC, open_buckets_empty) \
x(ENOSPC, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
x(0, transaction_restart) \
x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
@ -244,6 +244,16 @@
x(EIO, btree_node_read_error) \
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
x(EIO, trigger_pointer) \
x(EIO, trigger_stripe_pointer) \
x(EIO, metadata_bucket_inconsistency) \
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@ -257,7 +267,6 @@
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, need_inode_lock) \
x(0, invalid_snapshot_node) \
x(0, option_needs_open_fs)

View File

@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
if (k.k->type == KEY_TYPE_error)
return -EIO;
return -BCH_ERR_key_type_error;
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
* read:
*/
if (!ret && !p.ptr.cached)
ret = -EIO;
ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
? f->idx
: f->idx + 1;
if (!p.idx && !ca)
if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
p.idx++;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
if (!p.idx && !bch2_dev_is_readable(ca))
p.idx++;
if (p.idx >= (unsigned) p.has_ec + 1)
if (p.idx > (unsigned) p.has_ec)
continue;
if (ret > 0 && !ptr_better(c, p, *pick))
@ -781,14 +778,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
/*
* Returns pointer to the next entry after the one being dropped:
*/
union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
struct bch_extent_ptr *ptr)
void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
union bch_extent_entry *ret = entry;
bool drop_crc = true;
if (k.k->type == KEY_TYPE_stripe) {
ptr->dev = BCH_SB_MEMBER_INVALID;
return;
}
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
@ -811,21 +811,28 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
break;
if ((extent_entry_is_crc(entry) && drop_crc) ||
extent_entry_is_stripe_ptr(entry)) {
ret = (void *) ret - extent_entry_bytes(entry);
extent_entry_is_stripe_ptr(entry))
extent_entry_drop(k, entry);
}
}
return ret;
}
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
struct bch_extent_ptr *ptr)
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
if (k.k->type != KEY_TYPE_stripe) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev == ptr->dev && p.has_ec) {
ptr->dev = BCH_SB_MEMBER_INVALID;
return;
}
}
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
union bch_extent_entry *ret =
bch2_bkey_drop_ptr_noerror(k, ptr);
bch2_bkey_drop_ptr_noerror(k, ptr);
/*
* If we deleted all the dirty pointers and there's still cached
@ -837,14 +844,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
!bch2_bkey_dirty_devs(k.s_c).nr) {
k.k->type = KEY_TYPE_error;
set_bkey_val_u64s(k.k, 0);
ret = NULL;
} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
k.k->type = KEY_TYPE_deleted;
set_bkey_val_u64s(k.k, 0);
ret = NULL;
}
return ret;
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
@ -854,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
if (ptr)
bch2_bkey_drop_ptr_noerror(k, ptr);
bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
if (p1.ptr.dev == p2.ptr.dev &&
p1.ptr.gen == p2.ptr.gen &&
/*
* This checks that the two pointers point
* to the same region on disk - adjusting
* for the difference in where the extents
* start, since one may have been trimmed:
*/
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
/*
* This additionally checks that the
* extents overlap on disk, since the
* previous check may trigger spuriously
* when one extent is immediately partially
* overwritten with another extent (so that
* on disk they are adjacent) and
* compression is in use:
*/
((p1.ptr.offset >= p2.ptr.offset &&
p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
(p2.ptr.offset >= p1.ptr.offset &&
p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
return true;
return false;

View File

@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d
unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
void bch2_bkey_drop_device(struct bkey_s, unsigned);
void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
@ -649,26 +646,38 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
void bch2_bkey_drop_device(struct bkey_s, unsigned);
#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
do { \
__label__ _again; \
struct bkey_ptrs _ptrs; \
_again: \
_ptrs = bch2_bkey_ptrs(_k); \
\
bkey_for_each_ptr(_ptrs, _ptr) \
if (_cond) { \
bch2_bkey_drop_ptr_noerror(_k, _ptr); \
goto _again; \
} \
} while (0)
#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
do { \
struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
__label__ _again; \
struct bkey_ptrs _ptrs; \
_again: \
_ptrs = bch2_bkey_ptrs(_k); \
\
struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \
\
while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
bkey_for_each_ptr(_ptrs, _ptr) \
if (_cond) { \
_ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
_ptrs = bch2_bkey_ptrs(_k); \
continue; \
bch2_bkey_drop_ptr(_k, _ptr); \
goto _again; \
} \
\
(_ptr)++; \
} \
} while (0)
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,

View File

@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
BTREE_ITER_intent|BTREE_ITER_with_updates);
if (ret)
goto err;
@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
STR_HASH_must_create);
STR_HASH_must_create|BTREE_ITER_with_updates);
if (ret)
goto err;

View File

@ -791,8 +791,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
loff_t pos, unsigned len,
bool inode_locked)
loff_t pos, unsigned len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@ -816,15 +815,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
/*
* If we're not using the inode lock, we need to lock all the folios for
* atomiticity of writes vs. other writes:
*/
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
ret = -BCH_ERR_need_inode_lock;
goto out;
}
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@ -921,10 +911,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
if (end > inode->v.i_size) {
BUG_ON(!inode_locked);
if (end > inode->v.i_size)
i_size_write(&inode->v, end);
}
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@ -968,68 +956,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
loff_t pos;
bool inode_locked = false;
ssize_t written = 0, written2 = 0, ret = 0;
/*
* We don't take the inode lock unless i_size will be changing. Folio
* locks provide exclusion with other writes, and the pagecache add lock
* provides exclusion with truncate and hole punching.
*
* There is one nasty corner case where atomicity would be broken
* without great care: when copying data from userspace to the page
* cache, we do that with faults disable - a page fault would recurse
* back into the filesystem, taking filesystem locks again, and
* deadlock; so it's done with faults disabled, and we fault in the user
* buffer when we aren't holding locks.
*
* If we do part of the write, but we then race and in the userspace
* buffer have been evicted and are no longer resident, then we have to
* drop our folio locks to re-fault them in, breaking write atomicity.
*
* To fix this, we restart the write from the start, if we weren't
* holding the inode lock.
*
* There is another wrinkle after that; if we restart the write from the
* start, and then get an unrecoverable error, we _cannot_ claim to
* userspace that we did not write data we actually did - so we must
* track (written2) the most we ever wrote.
*/
if ((iocb->ki_flags & IOCB_APPEND) ||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
inode_lock(&inode->v);
inode_locked = true;
}
ret = generic_write_checks(iocb, iter);
if (ret <= 0)
goto unlock;
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
if (ret) {
if (!inode_locked) {
inode_lock(&inode->v);
inode_locked = true;
ret = file_remove_privs_flags(file, 0);
}
if (ret)
goto unlock;
}
ret = file_update_time(file);
if (ret)
goto unlock;
pos = iocb->ki_pos;
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
int ret = 0;
bch2_pagecache_add_get(inode);
if (!inode_locked &&
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
goto get_inode_lock;
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@ -1054,17 +986,12 @@ again:
}
}
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
goto get_inode_lock;
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
if (ret == -BCH_ERR_need_inode_lock)
goto get_inode_lock;
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
if (unlikely(ret < 0))
break;
@ -1085,46 +1012,50 @@ again:
}
pos += ret;
written += ret;
written2 = max(written, written2);
if (ret != bytes && !inode_locked)
goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
if (0) {
get_inode_lock:
bch2_pagecache_add_put(inode);
inode_lock(&inode->v);
inode_locked = true;
bch2_pagecache_add_get(inode);
iov_iter_revert(iter, written);
pos -= written;
written = 0;
ret = 0;
}
} while (iov_iter_count(iter));
bch2_pagecache_add_put(inode);
unlock:
if (inode_locked)
inode_unlock(&inode->v);
iocb->ki_pos += written;
ret = max(written, written2) ?: ret;
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
return written ? written : ret;
}
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, iter)
: bch2_buffered_write(iocb, iter);
struct file *file = iocb->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
ssize_t ret;
if (iocb->ki_flags & IOCB_DIRECT) {
ret = bch2_direct_write(iocb, from);
goto out;
}
inode_lock(&inode->v);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto unlock;
ret = file_remove_privs(file);
if (ret)
goto unlock;
ret = file_update_time(file);
if (ret)
goto unlock;
ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
unlock:
inode_unlock(&inode->v);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
out:
return bch2_err_class(ret);
}

View File

@ -273,14 +273,6 @@ retry:
}
}
#define memalloc_flags_do(_flags, _do) \
({ \
unsigned _saved_flags = memalloc_flags_save(_flags); \
typeof(_do) _ret = _do; \
memalloc_noreclaim_restore(_saved_flags); \
_ret; \
})
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
BUG();
@ -380,6 +372,8 @@ __bch2_create(struct mnt_idmap *idmap,
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
kuid_t kuid;
kgid_t kgid;
int ret;
/*
@ -406,13 +400,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
from_kuid(i_user_ns(&dir->v), current_fsuid()),
from_kgid(i_user_ns(&dir->v), current_fsgid()),
from_kuid(i_user_ns(&dir->v), kuid),
from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@ -727,15 +723,16 @@ static int bch2_rename2(struct mnt_idmap *idmap,
struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
struct bch_inode_unpacked src_inode_u, dst_inode_u;
struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;
if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;
if (mode == BCH_RENAME_OVERWRITE) {
@ -776,18 +773,48 @@ static int bch2_rename2(struct mnt_idmap *idmap,
if (ret)
goto err;
}
retry:
bch2_trans_begin(trans);
ret = commit_do(trans, NULL, NULL, 0,
bch2_rename_trans(trans,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
&dst_dentry->d_name,
mode));
ret = bch2_rename_trans(trans,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
&dst_dentry->d_name,
mode);
if (unlikely(ret))
goto err_tx_restart;
if (whiteout) {
whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
if (unlikely(ret))
goto err_tx_restart;
bch2_inode_init_early(c, whiteout_inode_u);
ret = bch2_create_trans(trans,
inode_inum(src_dir), &src_dir_u,
whiteout_inode_u,
&src_dentry->d_name,
from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
S_IFCHR|WHITEOUT_MODE, 0,
NULL, NULL, (subvol_inum) { 0 }, 0) ?:
bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
goto err_tx_restart;
}
ret = bch2_trans_commit(trans, NULL, NULL, 0);
if (unlikely(ret)) {
err_tx_restart:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
goto err;
}
BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
BUG_ON(dst_inode &&
@ -835,11 +862,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
kuid_t kuid;
kgid_t kgid;
if (ia_valid & ATTR_UID)
bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
if (ia_valid & ATTR_GID)
bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
if (ia_valid & ATTR_UID) {
kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
}
if (ia_valid & ATTR_GID) {
kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
}
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@ -854,11 +887,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
? attr->ia_gid
? kgid
: inode->v.i_gid;
if (!in_group_p(gid) &&
!capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
if (!in_group_or_capable(idmap, &inode->v,
make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
@ -874,17 +907,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
kuid_t kuid;
kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
qid = inode->ei_qid;
if (attr->ia_valid & ATTR_UID)
qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
if (attr->ia_valid & ATTR_UID) {
kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
}
if (attr->ia_valid & ATTR_GID)
qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
if (attr->ia_valid & ATTR_GID) {
kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
}
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@ -940,13 +979,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
stat->uid = inode->v.i_uid;
stat->gid = inode->v.i_gid;
stat->uid = vfsuid_into_kuid(vfsuid);
stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@ -1865,30 +1906,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
enum bch_opt_id i;
struct printbuf buf = PRINTBUF;
int ret = 0;
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
seq_puts(seq, buf.buf);
if ((opt->flags & OPT_HIDDEN) ||
!(opt->flags & OPT_MOUNT))
continue;
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;
printbuf_reset(&buf);
bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
OPT_SHOW_MOUNT_STYLE);
seq_putc(seq, ',');
seq_puts(seq, buf.buf);
}
if (buf.allocation_failure)
ret = -ENOMEM;
int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
return ret;
}
@ -2209,7 +2233,7 @@ static struct file_system_type bcache_fs_type = {
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");

View File

@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
orig_k->k->k.size,
reflink_offset);
bch2_inconsistent_error(trans->c);
ret = -EIO;
ret = -BCH_ERR_missing_indirect_extent;
goto err;
}
@ -869,9 +869,15 @@ retry_pick:
goto hole;
if (pick_ret < 0) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err_inum_offset_ratelimited(c,
read_pos.inode, read_pos.offset << 9,
"no device to read from");
"no device to read from: %s\n %s",
bch2_err_str(pick_ret),
buf.buf);
printbuf_exit(&buf);
goto err;
}
@ -1086,7 +1092,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio)) {
if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}

View File

@ -1447,9 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
BCH_WRITE_ONLY_SPECIFIED_DEVS))
? NULL : &op->cl, &wp));
&op->cl, &wp));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
op->flags |= BCH_WRITE_ALLOC_NOWAIT;
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);

View File

@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_devs = 0,
.e.nr_required = 1,
};
@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
goto err;
darray_for_each(i->ptrs, ptr)
replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);

View File

@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));

View File

@ -432,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out,
else
prt_str(out, opt->choices[v]);
break;
case BCH_OPT_BITFIELD:
prt_bitflags(out, opt->choices, v);
break;
case BCH_OPT_FN:
opt->fn.to_text(out, c, sb, v);
break;
@ -440,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out,
}
}
void bch2_opts_to_text(struct printbuf *out,
struct bch_opts opts,
struct bch_fs *c, struct bch_sb *sb,
unsigned show_mask, unsigned hide_mask,
unsigned flags)
{
bool first = true;
for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
continue;
u64 v = bch2_opt_get_by_id(&opts, i);
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;
if (!first)
prt_char(out, ',');
first = false;
bch2_opt_to_text(out, c, sb, opt, v, flags);
}
}
int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
{
int ret = 0;

View File

@ -373,6 +373,16 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
x(recovery_passes, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
BCH2_NO_SB_OPT, 0, \
NULL, "Recovery passes to run explicitly") \
x(recovery_passes_exclude, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
BCH2_NO_SB_OPT, 0, \
NULL, "Recovery passes to exclude") \
x(recovery_pass_last, u8, \
OPT_FS|OPT_MOUNT, \
OPT_STR_NOLIMIT(bch2_recovery_passes), \
@ -595,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
void bch2_opts_to_text(struct printbuf *,
struct bch_opts,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);

View File

@ -219,9 +219,9 @@ static noinline void __process_finished_items(struct rcu_pending *pending,
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
kvfree(ptr);
bool free_head = ((unsigned long) obj->func) & 1UL;
kvfree(ptr);
if (free_head)
kfree(obj);
}

View File

@ -13,6 +13,7 @@
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
if (!data_opts->rewrite_ptrs) {
/*
@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}

View File

@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s l=%u: %s",
bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
if (btree_id_is_alloc(i)) {
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
r->error = 0;
} else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
} else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
bch_info(c, "will run btree node scan");
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
}
ret = 0;
@ -706,14 +706,14 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);

View File

@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
return bch2_fs_read_write_early(c);
return 0;
}
@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v)
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass)
{
if (c->recovery_passes_explicit & BIT_ULL(pass))
if (c->opts.recovery_passes & BIT_ULL(pass))
return 0;
bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
bch2_recovery_passes[pass], pass,
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
c->recovery_passes_explicit |= BIT_ULL(pass);
c->opts.recovery_passes |= BIT_ULL(pass);
if (c->curr_recovery_pass >= pass) {
c->curr_recovery_pass = pass;
@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
{
struct recovery_pass_fn *p = recovery_pass_fns + pass;
if (c->recovery_passes_explicit & BIT_ULL(pass))
if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
return false;
if (c->opts.recovery_passes & BIT_ULL(pass))
return true;
if ((p->when & PASS_FSCK) && c->opts.fsck)
return true;

View File

@ -82,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}
for (unsigned i = 0; i < r->nr_devs; i++)
if (!bch2_member_exists(sb, r->devs[i])) {
if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
!bch2_member_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@ -122,7 +123,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (!p.has_ec)
r->devs[r->nr_devs++] = p.ptr.dev;
replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@ -139,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
r->devs[r->nr_devs++] = ptr->dev;
replicas_entry_add_dev(r, ptr->dev);
}
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@ -180,7 +181,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;
darray_for_each(devs, i)
e->devs[e->nr_devs++] = *i;
replicas_entry_add_dev(e, *i);
bch2_replicas_entry_sort(e);
}
@ -795,11 +796,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
nr_online += test_bit(e->devs[i], devs.d);
struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
rcu_read_unlock();
if (nr_failed == e->nr_devs)
if (nr_online + nr_failed == e->nr_devs)
continue;
if (nr_online < e->nr_required)

View File

@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
__u8 devs[];
__u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas_v0 {
@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
__u8 devs[];
__u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas {
@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
#define replicas_entry_add_dev(e, d) ({ \
(e)->nr_devs++; \
(e)->devs[(e)->nr_devs - 1] = (d); \
})
#endif /* _BCACHEFS_REPLICAS_FORMAT_H */

View File

@ -288,10 +288,10 @@ enum bch_fsck_flags {
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
x(accounting_key_junk_at_end, 277, 0) \
x(accounting_key_replicas_nr_devs_0, 278, 0) \
x(accounting_key_replicas_nr_required_bad, 279, 0) \
x(accounting_key_replicas_devs_unsorted, 280, 0) \
x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

View File

@ -11,7 +11,8 @@
void bch2_dev_missing(struct bch_fs *c, unsigned dev)
{
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
if (dev != BCH_SB_MEMBER_INVALID)
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
}
void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
@ -473,3 +474,51 @@ unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
nr += bch2_member_exists((struct bch_sb *) sb, i);
return nr;
}
int bch2_sb_member_alloc(struct bch_fs *c)
{
unsigned dev_idx = c->sb.nr_devices;
struct bch_sb_field_members_v2 *mi;
unsigned nr_devices;
unsigned u64s;
int best = -1;
u64 best_last_mount = 0;
if (dev_idx < BCH_SB_MEMBERS_MAX)
goto have_slot;
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
/* eventually BCH_SB_MEMBERS_MAX will be raised */
if (dev_idx == BCH_SB_MEMBER_INVALID)
continue;
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
if (bch2_member_alive(&m))
continue;
u64 last_mount = le64_to_cpu(m.last_mount);
if (best < 0 || last_mount < best_last_mount) {
best = dev_idx;
best_last_mount = last_mount;
}
}
if (best >= 0) {
dev_idx = best;
goto have_slot;
}
return -BCH_ERR_ENOSPC_sb_members;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi)
return -BCH_ERR_ENOSPC_sb_members;
c->disk_sb.sb->nr_devices = nr_devices;
return dev_idx;
}

View File

@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
lockdep_is_held(&c->state_lock));
}
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
{
return c && dev < c->sb.nr_devices
? rcu_dereference(c->devs[dev])
: NULL;
}
void bch2_dev_missing(struct bch_fs *, unsigned);
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
rcu_read_unlock();
return ca;
}
void bch2_dev_missing(struct bch_fs *, unsigned);
static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}
@ -354,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
int bch2_sb_member_alloc(struct bch_fs *);
#endif /* _BCACHEFS_SB_MEMBERS_H */

View File

@ -8,6 +8,11 @@
*/
#define BCH_SB_MEMBERS_MAX 64
/*
* Sentinal value - indicates a device that does not exist
*/
#define BCH_SB_MEMBER_INVALID 255
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \

View File

@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;

View File

@ -524,7 +524,7 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
/* XXX this is wrong, we need a 96 or 128 bit integer type */
c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
c->sb.time_base_lo = div64_u64(le64_to_cpu(src->time_base_lo),
c->sb.nsec_per_time_unit);
c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);

View File

@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
@ -1592,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
/* Device add/removal: */
static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
struct bpos start = POS(ca->dev_idx, 0);
struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
/*
* We clear the LRU and need_discard btrees first so that we don't race
* with bch2_do_invalidates() and bch2_do_discards()
*/
ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_dev_usage_remove(c, ca->dev_idx);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
@ -1730,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
struct bch_sb_field_members_v2 *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
@ -1742,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
@ -1780,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
goto err_unlock;
if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
dev_idx = c->sb.nr_devices;
goto have_slot;
}
int best = -1;
u64 best_last_mount = 0;
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
if (bch2_member_alive(&m))
continue;
u64 last_mount = le64_to_cpu(m.last_mount);
if (best < 0 || last_mount < best_last_mount) {
best = dev_idx;
best_last_mount = last_mount;
}
}
if (best >= 0) {
dev_idx = best;
goto have_slot;
}
no_slot:
ret = -BCH_ERR_ENOSPC_sb_members;
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi) {
ret = -BCH_ERR_ENOSPC_sb_members;
ret = bch2_sb_member_alloc(c);
if (ret < 0) {
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
unsigned dev_idx = ret;
/* success: */
*m = dev_mi;
m->last_mount = cpu_to_le64(ktime_get_real_seconds());
c->disk_sb.sb->nr_devices = nr_devices;
dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);

View File

@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
mutex_lock(&bc->lock);
list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
list_for_each_entry(b, &bc->live[1].list, list)
ret += btree_buf_bytes(b);
list_for_each_entry(b, &bc->freeable, list)
ret += btree_buf_bytes(b);
mutex_unlock(&bc->lock);
return ret;
}
@ -287,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_tab_rjust(out);
prt_human_readable_u64(out, nr_extents
? div_u64(sectors_uncompressed << 9, nr_extents)
? div64_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
@ -444,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {
@ -456,7 +461,7 @@ STORE(bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
if (attr == &sysfs_trigger_gc)

View File

@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res)
*res = 1;
while (p--) {
if (*res > div_u64(U64_MAX, n))
if (*res > div64_u64(U64_MAX, n))
return -ERANGE;
*res *= n;
}
@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)
parse_or_ret(cp, parse_unit_suffix(cp, &b));
if (v > div_u64(U64_MAX, b))
if (v > div64_u64(U64_MAX, b))
return -ERANGE;
v *= b;
if (f_n > div_u64(U64_MAX, b))
if (f_n > div64_u64(U64_MAX, b))
return -ERANGE;
f_n = div_u64(f_n * b, f_d);
f_n = div64_u64(f_n * b, f_d);
if (v + f_n < v)
return -ERANGE;
v += f_n;
@ -214,7 +214,7 @@ u64 bch2_read_flag_list(const char *opt, const char * const list[])
s = strim(d);
while ((p = strsep(&s, ","))) {
while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);
if (flag < 0) {
@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
prt_printf(out, "%llu ", div_u64(q, u->nsecs));
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;

View File

@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
__u8 x_name[];
__u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */