Mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-02-22 00:00:03 +03:00)

commit cd35891eb9 (parent f9ec00d5ca)

    Update bcachefs sources to ec2ddb95112b bcachefs: bch2_opts_to_text()

    Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
@@ -1 +1 @@
-22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d
+ec2ddb95112b8967753591b16e2e439eee76c5b1
@@ -65,6 +65,8 @@
 #define PF_LESS_THROTTLE	0x00100000	/* Throttle me less: I clean memory */
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* randomize virtual address space */
+#define PF_MEMALLOC_NORECLAIM	0x00800000	/* All allocation requests will clear __GFP_DIRECT_RECLAIM */
+#define PF_MEMALLOC_NOWARN	0x01000000	/* All allocation requests will inherit __GFP_NOWARN */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
include/linux/swap.h (new file, 7 lines)

@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SWAP_H
+#define _LINUX_SWAP_H
+
+static inline void mm_account_reclaimed_pages(unsigned long pages) {}
+
+#endif /* _LINUX_SWAP_H */
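The bcachefs-tools tree compiles the kernel fs/bcachefs sources in userspace, so kernel-only MM hooks get stubbed out in shim headers like the new include/linux/swap.h above; mm_account_reclaimed_pages() becomes a no-op purely so shared code keeps compiling. A minimal standalone sketch of that shim pattern (hypothetical example, not part of the tree):

/* Hypothetical standalone sketch of the shim-header pattern. */
#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-in for the kernel-only hook, as in the new swap.h above: */
static inline void mm_account_reclaimed_pages(unsigned long pages) {}

static void free_buffer(void *buf, unsigned long bytes)
{
        /* In-kernel this credits reclaimed pages to vmscan statistics;
         * under the shim it compiles to nothing. */
        mm_account_reclaimed_pages(bytes / 4096);
        free(buf);
}

int main(void)
{
        free_buffer(malloc(65536), 65536);
        printf("shimmed call compiled and ran as a no-op\n");
        return 0;
}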
@@ -44,6 +44,20 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
 	return t;
 }
 
+static inline void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
 #define ns_to_timespec64	ns_to_timespec
 #define timespec64_to_ns	timespec_to_ns
 #define timespec64_trunc	timespec_trunc
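The helper added above folds an out-of-range nanosecond count back into [0, NSEC_PER_SEC), borrowing from or carrying into the seconds field. A small standalone check of the same logic (kernel types stubbed for userspace):

/* Standalone check of the normalization logic above (types stubbed for userspace). */
#include <assert.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000LL
typedef int64_t time64_t;
typedef int64_t s64;
struct timespec64 { time64_t tv_sec; long tv_nsec; };

static inline void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
        while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; ++sec; }
        while (nsec < 0)             { nsec += NSEC_PER_SEC; --sec; }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}

int main(void)
{
        struct timespec64 ts;

        set_normalized_timespec64(&ts, 1, 1500000000LL);   /* 1.5s worth of nsec */
        assert(ts.tv_sec == 2 && ts.tv_nsec == 500000000);

        set_normalized_timespec64(&ts, 1, -250000000LL);   /* negative nsec */
        assert(ts.tv_sec == 0 && ts.tv_nsec == 750000000);
        return 0;
}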
@@ -37,6 +37,8 @@ typedef unsigned gfp_t;
 #define __GFP_NOWARN		0
 #define __GFP_NORETRY		0
 #define __GFP_NOFAIL		0
+#define __GFP_ACCOUNT		0
+#define __GFP_RECLAIMABLE	0
 #define __GFP_ZERO		1
 #define GFP_KERNEL		2
 
@@ -137,7 +137,7 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
 		return NULL;
 
 	acl = allocate_dropping_locks(trans, ret,
-			posix_acl_alloc(count, _gfp));
+			posix_acl_alloc(count, GFP_KERNEL));
 	if (!acl)
 		return ERR_PTR(-ENOMEM);
 	if (ret) {
@@ -427,7 +427,8 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
 	if (ret)
 		goto err;
 
-	ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
+	ret = allocate_dropping_locks_errcode(trans,
+			__posix_acl_chmod(&acl, GFP_KERNEL, mode));
 	if (ret)
 		goto err;
 
@@ -1969,8 +1969,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
 			break;
 	}
 
-	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 	percpu_ref_put(&ca->io_ref);
+	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 }
 
 static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
@@ -1980,18 +1980,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
 	if (discard_in_flight_add(ca, bucket, false))
 		return;
 
-	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
 		return;
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
-		goto put_ioref;
+	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+		goto put_ref;
 
 	if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
 		return;
 
-	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-put_ioref:
 	percpu_ref_put(&ca->io_ref);
+put_ref:
+	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 }
 
 static int invalidate_one_bucket(struct btree_trans *trans,
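Both the discard and invalidate paths now take the filesystem-level write ref before the per-device io ref and release them in reverse order, with one unwind label per acquired resource. A generic standalone sketch of that acquire-in-order / release-in-reverse pattern (function names here are illustrative, not bcachefs APIs):

/* Illustrative sketch of the unwind pattern used above; not bcachefs code. */
#include <stdbool.h>
#include <stdio.h>

static bool get_write_ref(void)  { return true; }
static void put_write_ref(void)  { puts("put write ref"); }
static bool get_io_ref(void)     { return true; }
static void put_io_ref(void)     { puts("put io ref"); }
static bool queue_the_work(void) { return false; } /* pretend it was already queued */

static void kick_off_work(void)
{
        if (!get_write_ref())                   /* outermost ref first */
                return;
        if (!get_io_ref())
                goto put_ref;
        if (queue_the_work())
                return;                         /* the work item now owns both refs */

        put_io_ref();                           /* unwind in reverse acquisition order */
put_ref:
        put_write_ref();
}

int main(void) { kick_off_work(); return 0; }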
@@ -2133,26 +2133,26 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 	bch2_trans_iter_exit(trans, &iter);
 err:
 	bch2_trans_put(trans);
-	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 	percpu_ref_put(&ca->io_ref);
+	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
 void bch2_dev_do_invalidates(struct bch_dev *ca)
 {
 	struct bch_fs *c = ca->fs;
 
-	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
 		return;
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
-		goto put_ioref;
+	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+		goto put_ref;
 
 	if (queue_work(c->write_ref_wq, &ca->invalidate_work))
 		return;
 
-	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-put_ioref:
 	percpu_ref_put(&ca->io_ref);
+put_ref:
+	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
 void bch2_do_invalidates(struct bch_fs *c)
@ -2298,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* device removal */
|
||||
|
||||
int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct bpos start = POS(ca->dev_idx, 0);
|
||||
struct bpos end = POS(ca->dev_idx, U64_MAX);
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* We clear the LRU and need_discard btrees first so that we don't race
|
||||
* with bch2_do_invalidates() and bch2_do_discards()
|
||||
*/
|
||||
ret = bch2_dev_remove_stripes(c, ca) ?:
|
||||
bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
|
||||
BTREE_TRIGGER_norun, NULL) ?:
|
||||
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
|
||||
BTREE_TRIGGER_norun, NULL) ?:
|
||||
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
|
||||
BTREE_TRIGGER_norun, NULL) ?:
|
||||
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
|
||||
BTREE_TRIGGER_norun, NULL) ?:
|
||||
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
|
||||
BTREE_TRIGGER_norun, NULL) ?:
|
||||
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
|
||||
BTREE_TRIGGER_norun, NULL) ?:
|
||||
bch2_dev_usage_remove(c, ca->dev_idx);
|
||||
bch_err_msg(c, ret, "removing dev alloc info");
|
||||
return ret;
|
||||
}
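bch2_dev_remove_alloc() chains the per-btree deletions with the GNU C `?:` operator, so the first call that returns a nonzero error short-circuits everything after it. A tiny standalone illustration of the idiom (requires GNU C):

/* Standalone illustration of the `a ?: b` error-chaining idiom used above. */
#include <assert.h>

static int step_ok(void)    { return 0; }
static int step_fail(void)  { return -5; }
static int step_never(void) { assert(0); return 0; }   /* must not run */

int main(void)
{
        /* GNU C: `x ?: y` evaluates to x when x is nonzero, else to y,
         * so the chain stops at the first step that reports an error. */
        int ret = step_ok() ?:
                  step_fail() ?:
                  step_never();
        assert(ret == -5);
        return 0;
}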
|
||||
|
||||
/* Bucket IO clocks: */
|
||||
|
||||
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
|
||||
@ -2433,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
|
||||
/* device goes ro: */
|
||||
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
unsigned i;
|
||||
lockdep_assert_held(&c->state_lock);
|
||||
|
||||
/* First, remove device from allocation groups: */
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
||||
clear_bit(ca->dev_idx, c->rw_devs[i].d);
|
||||
|
||||
c->rw_devs_change_count++;
|
||||
|
||||
/*
|
||||
* Capacity is calculated based off of devices in allocation groups:
|
||||
*/
|
||||
@ -2468,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
|
||||
/* device goes rw: */
|
||||
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
unsigned i;
|
||||
lockdep_assert_held(&c->state_lock);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
||||
if (ca->mi.data_allowed & (1 << i))
|
||||
set_bit(ca->dev_idx, c->rw_devs[i].d);
|
||||
|
||||
c->rw_devs_change_count++;
|
||||
}
|
||||
|
||||
void bch2_dev_allocator_background_exit(struct bch_dev *ca)
|
||||
|
@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
|
||||
|
||||
int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
|
||||
int bch2_fs_freespace_init(struct bch_fs *);
|
||||
int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
void bch2_recalc_capacity(struct bch_fs *);
|
||||
u64 bch2_min_rw_member_capacity(struct bch_fs *);
|
||||
|
@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
|
||||
enum bch_watermark watermark,
|
||||
enum bch_data_type data_type,
|
||||
struct closure *cl,
|
||||
bool nowait,
|
||||
struct bch_dev_usage *usage)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
|
||||
struct bucket_alloc_state s = {
|
||||
.btree_bitmap = data_type == BCH_DATA_btree,
|
||||
};
|
||||
bool waiting = false;
|
||||
bool waiting = nowait;
|
||||
again:
|
||||
bch2_dev_usage_read_fast(ca, usage);
|
||||
avail = dev_buckets_free(ca, *usage, watermark);
|
||||
@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
|
||||
|
||||
bch2_trans_do(c, NULL, NULL, 0,
|
||||
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
|
||||
data_type, cl, &usage)));
|
||||
data_type, cl, false, &usage)));
|
||||
return ob;
|
||||
}
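bch2_bucket_alloc_trans() now takes an explicit nowait argument (derived from BCH_WRITE_ALLOC_NOWAIT at the bch2_bucket_alloc_set_trans() call site further down); seeding `waiting = nowait` means a nowait caller never parks itself on the freelist waitqueue and just gets the error back on the first pass. A hedged sketch of that control flow, with the real allocator replaced by stubs:

/* Hedged sketch of the waiting/nowait shape; names are illustrative, not bcachefs code. */
#include <stdbool.h>
#include <stdio.h>

static bool try_take_bucket(void)  { return false; }   /* pretend the freelist is empty */
static void add_to_waitqueue(void) { puts("parked on freelist waitqueue"); }

static int alloc_bucket(bool nowait)
{
        bool waiting = nowait;  /* a nowait caller behaves as if it had already waited */
again:
        if (try_take_bucket())
                return 0;

        if (!waiting) {
                add_to_waitqueue();
                waiting = true;
                goto again;     /* the real code re-checks after queuing to close races */
        }
        return -1;              /* freelist_empty / bucket_alloc_blocked in the real code */
}

int main(void)
{
        printf("nowait caller:  %d\n", alloc_bucket(true));
        printf("waiting caller: %d\n", alloc_bucket(false));
        return 0;
}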
|
||||
|
||||
@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c,
|
||||
unsigned nr_replicas,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache,
|
||||
unsigned flags,
|
||||
struct open_bucket *ob)
|
||||
{
|
||||
unsigned durability = ob_dev(c, ob)->mi.durability;
|
||||
@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
|
||||
unsigned nr_replicas,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache,
|
||||
unsigned flags,
|
||||
enum bch_write_flags flags,
|
||||
enum bch_data_type data_type,
|
||||
enum bch_watermark watermark,
|
||||
struct closure *cl)
|
||||
@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
|
||||
continue;
|
||||
}
|
||||
|
||||
ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
|
||||
ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
|
||||
cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
|
||||
if (!IS_ERR(ob))
|
||||
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
|
||||
bch2_dev_put(ca);
|
||||
@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
|
||||
|
||||
if (add_new_bucket(c, ptrs, devs_may_alloc,
|
||||
nr_replicas, nr_effective,
|
||||
have_cache, flags, ob)) {
|
||||
have_cache, ob)) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache,
|
||||
enum bch_watermark watermark,
|
||||
unsigned flags,
|
||||
enum bch_write_flags flags,
|
||||
struct closure *cl)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
@ -883,7 +884,7 @@ got_bucket:
|
||||
|
||||
ret = add_new_bucket(c, ptrs, devs_may_alloc,
|
||||
nr_replicas, nr_effective,
|
||||
have_cache, flags, ob);
|
||||
have_cache, ob);
|
||||
out_put_head:
|
||||
bch2_ec_stripe_head_put(c, h);
|
||||
return ret;
|
||||
@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
|
||||
unsigned nr_replicas,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache,
|
||||
bool ec, unsigned flags)
|
||||
bool ec)
|
||||
{
|
||||
struct open_buckets ptrs_skip = { .nr = 0 };
|
||||
struct open_bucket *ob;
|
||||
@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
|
||||
have_cache, ec, ob))
|
||||
ret = add_new_bucket(c, ptrs, devs_may_alloc,
|
||||
nr_replicas, nr_effective,
|
||||
have_cache, flags, ob);
|
||||
have_cache, ob);
|
||||
else
|
||||
ob_push(c, &ptrs_skip, ob);
|
||||
}
|
||||
@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
|
||||
unsigned nr_replicas,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache, bool ec,
|
||||
enum bch_watermark watermark,
|
||||
unsigned flags)
|
||||
enum bch_watermark watermark)
|
||||
{
|
||||
int i, ret = 0;
|
||||
|
||||
@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
|
||||
|
||||
ret = add_new_bucket(c, ptrs, devs_may_alloc,
|
||||
nr_replicas, nr_effective,
|
||||
have_cache, flags, ob);
|
||||
have_cache, ob);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache,
|
||||
enum bch_watermark watermark,
|
||||
unsigned flags,
|
||||
enum bch_write_flags flags,
|
||||
struct closure *_cl)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
@ -1024,13 +1024,13 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
|
||||
|
||||
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
|
||||
nr_replicas, nr_effective,
|
||||
have_cache, erasure_code, flags);
|
||||
have_cache, erasure_code);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
|
||||
nr_replicas, nr_effective,
|
||||
have_cache, erasure_code, watermark, flags);
|
||||
have_cache, erasure_code, watermark);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -1071,7 +1071,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
|
||||
unsigned *nr_effective,
|
||||
bool *have_cache,
|
||||
enum bch_watermark watermark,
|
||||
unsigned flags,
|
||||
enum bch_write_flags flags,
|
||||
struct closure *cl)
|
||||
{
|
||||
int ret;
|
||||
@ -1373,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
|
||||
unsigned nr_replicas,
|
||||
unsigned nr_replicas_required,
|
||||
enum bch_watermark watermark,
|
||||
unsigned flags,
|
||||
enum bch_write_flags flags,
|
||||
struct closure *cl,
|
||||
struct write_point **wp_ret)
|
||||
{
|
||||
@ -1389,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
|
||||
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
|
||||
erasure_code = false;
|
||||
|
||||
BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
|
||||
|
||||
BUG_ON(!nr_replicas || !nr_replicas_required);
|
||||
retry:
|
||||
ptrs.nr = 0;
|
||||
@ -1495,11 +1493,12 @@ err:
|
||||
try_decrease_writepoints(trans, write_points_nr))
|
||||
goto retry;
|
||||
|
||||
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
|
||||
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
|
||||
ret = -BCH_ERR_bucket_alloc_blocked;
|
||||
|
||||
if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
|
||||
bch2_err_matches(ret, BCH_ERR_freelist_empty))
|
||||
return cl
|
||||
? -BCH_ERR_bucket_alloc_blocked
|
||||
: -BCH_ERR_ENOSPC_bucket_alloc;
|
||||
ret = -BCH_ERR_bucket_alloc_blocked;
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -1730,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
|
||||
nr[c->open_buckets[i].data_type]++;
|
||||
|
||||
printbuf_tabstops_reset(out);
|
||||
printbuf_tabstop_push(out, 12);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
|
||||
bch2_dev_usage_to_text(out, ca, &stats);
|
||||
|
||||
prt_newline(out);
|
||||
|
@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
|
||||
return ret;
|
||||
}
|
||||
|
||||
enum bch_write_flags;
|
||||
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
|
||||
struct dev_stripe_state *, struct bch_devs_mask *,
|
||||
unsigned, unsigned *, bool *, unsigned,
|
||||
unsigned, unsigned *, bool *, enum bch_write_flags,
|
||||
enum bch_data_type, enum bch_watermark,
|
||||
struct closure *);
|
||||
|
||||
@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
|
||||
struct bch_devs_list *,
|
||||
unsigned, unsigned,
|
||||
enum bch_watermark,
|
||||
unsigned,
|
||||
enum bch_write_flags,
|
||||
struct closure *,
|
||||
struct write_point **);
|
||||
|
||||
|
@ -3,12 +3,14 @@
|
||||
#include "bbpos.h"
|
||||
#include "alloc_background.h"
|
||||
#include "backpointers.h"
|
||||
#include "bbpos.h"
|
||||
#include "bkey_buf.h"
|
||||
#include "btree_cache.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_write_buffer.h"
|
||||
#include "checksum.h"
|
||||
#include "disk_accounting.h"
|
||||
#include "error.h"
|
||||
|
||||
#include <linux/mm.h>
|
||||
@ -750,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
|
||||
s64 mem_may_pin = mem_may_pin_bytes(c);
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_cache_unpin(c);
|
||||
|
||||
btree_interior_mask |= btree_leaf_mask;
|
||||
|
||||
c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
|
||||
c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
|
||||
c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
|
||||
c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
|
||||
c->btree_cache.pinned_nodes_start = start;
|
||||
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
|
||||
|
||||
@ -775,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
|
||||
BBPOS(btree, b->key.k.p);
|
||||
break;
|
||||
}
|
||||
bch2_node_pin(c, b);
|
||||
0;
|
||||
}));
|
||||
}
|
||||
@ -782,12 +787,80 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct progress_indicator_state {
|
||||
unsigned long next_print;
|
||||
u64 nodes_seen;
|
||||
u64 nodes_total;
|
||||
struct btree *last_node;
|
||||
};
|
||||
|
||||
static inline void progress_init(struct progress_indicator_state *s,
|
||||
struct bch_fs *c,
|
||||
u64 btree_id_mask)
|
||||
{
|
||||
memset(s, 0, sizeof(*s));
|
||||
|
||||
s->next_print = jiffies + HZ * 10;
|
||||
|
||||
for (unsigned i = 0; i < BTREE_ID_NR; i++) {
|
||||
if (!(btree_id_mask & BIT_ULL(i)))
|
||||
continue;
|
||||
|
||||
struct disk_accounting_pos acc = {
|
||||
.type = BCH_DISK_ACCOUNTING_btree,
|
||||
.btree.id = i,
|
||||
};
|
||||
|
||||
u64 v;
|
||||
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
|
||||
s->nodes_total += div64_ul(v, btree_sectors(c));
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool progress_update_p(struct progress_indicator_state *s)
|
||||
{
|
||||
bool ret = time_after_eq(jiffies, s->next_print);
|
||||
|
||||
if (ret)
|
||||
s->next_print = jiffies + HZ * 10;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void progress_update_iter(struct btree_trans *trans,
|
||||
struct progress_indicator_state *s,
|
||||
struct btree_iter *iter,
|
||||
const char *msg)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree *b = path_l(btree_iter_path(trans, iter))->b;
|
||||
|
||||
s->nodes_seen += b != s->last_node;
|
||||
s->last_node = b;
|
||||
|
||||
if (progress_update_p(s)) {
|
||||
struct printbuf buf = PRINTBUF;
|
||||
unsigned percent = s->nodes_total
|
||||
? div64_u64(s->nodes_seen * 100, s->nodes_total)
|
||||
: 0;
|
||||
|
||||
prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
|
||||
msg, percent, s->nodes_seen, s->nodes_total);
|
||||
bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
|
||||
|
||||
bch_info(c, "%s", buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
}
|
||||
}
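The progress indicator added here is driven from the check passes below: initialize it with a bitmask of the btrees being walked, call progress_update_iter() once per key, and it logs at most every ten seconds. A standalone sketch of the same rate-limited-progress idea, using plain time(2) in place of jiffies:

/* Standalone sketch of the rate-limited progress idea (time(2) instead of jiffies). */
#include <stdio.h>
#include <time.h>

struct progress {
        time_t next_print;
        unsigned long seen, total;
};

static void progress_init(struct progress *p, unsigned long total)
{
        p->next_print = time(NULL) + 10;        /* like jiffies + HZ * 10 above */
        p->seen = 0;
        p->total = total;
}

static void progress_update(struct progress *p)
{
        p->seen++;
        if (time(NULL) < p->next_print)
                return;
        p->next_print = time(NULL) + 10;
        printf("%lu%%, done %lu/%lu nodes\n",
               p->total ? p->seen * 100 / p->total : 0, p->seen, p->total);
}

int main(void)
{
        struct progress p;
        progress_init(&p, 1000);
        for (unsigned long i = 0; i < 1000; i++)
                progress_update(&p);
        return 0;
}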
|
||||
|
||||
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
|
||||
struct extents_to_bp_state *s)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct progress_indicator_state progress;
|
||||
int ret = 0;
|
||||
|
||||
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
|
||||
|
||||
for (enum btree_id btree_id = 0;
|
||||
btree_id < btree_id_nr_alive(c);
|
||||
btree_id++) {
|
||||
@ -805,6 +878,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
|
||||
BTREE_ITER_prefetch);
|
||||
|
||||
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
|
||||
progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
|
||||
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
|
||||
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
|
||||
}));
|
||||
@ -865,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
|
||||
bch2_trans_put(trans);
|
||||
bch2_bkey_buf_exit(&s.last_flushed, c);
|
||||
|
||||
c->btree_cache.pinned_nodes_leaf_mask = 0;
|
||||
c->btree_cache.pinned_nodes_interior_mask = 0;
|
||||
bch2_btree_cache_unpin(c);
|
||||
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
@ -920,19 +993,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
|
||||
struct bbpos start,
|
||||
struct bbpos end)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_buf last_flushed;
|
||||
struct progress_indicator_state progress;
|
||||
|
||||
bch2_bkey_buf_init(&last_flushed);
|
||||
bkey_init(&last_flushed.k->k);
|
||||
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
|
||||
|
||||
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
|
||||
POS_MIN, BTREE_ITER_prefetch, k,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
||||
progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
|
||||
check_one_backpointer(trans, start, end,
|
||||
bkey_s_c_to_backpointer(k),
|
||||
&last_flushed));
|
||||
&last_flushed);
|
||||
}));
|
||||
|
||||
bch2_bkey_buf_exit(&last_flushed, trans->c);
|
||||
bch2_bkey_buf_exit(&last_flushed, c);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -977,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
|
||||
}
|
||||
bch2_trans_put(trans);
|
||||
|
||||
c->btree_cache.pinned_nodes_leaf_mask = 0;
|
||||
c->btree_cache.pinned_nodes_interior_mask = 0;
|
||||
bch2_btree_cache_unpin(c);
|
||||
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
|
@ -542,7 +542,7 @@ struct bch_dev {
|
||||
* gc_gens_lock, for device resize - holding any is sufficient for
|
||||
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
|
||||
*/
|
||||
struct bucket_array __rcu *buckets_gc;
|
||||
GENRADIX(struct bucket) buckets_gc;
|
||||
struct bucket_gens __rcu *bucket_gens;
|
||||
u8 *oldest_gen;
|
||||
unsigned long *buckets_nouse;
|
||||
@ -871,6 +871,7 @@ struct bch_fs {
|
||||
|
||||
/* ALLOCATION */
|
||||
struct bch_devs_mask rw_devs[BCH_DATA_NR];
|
||||
unsigned long rw_devs_change_count;
|
||||
|
||||
u64 capacity; /* sectors */
|
||||
u64 reserved; /* sectors */
|
||||
@ -1045,8 +1046,6 @@ struct bch_fs {
|
||||
* for signaling to the toplevel code which pass we want to run now.
|
||||
*/
|
||||
enum bch_recovery_pass curr_recovery_pass;
|
||||
/* bitmap of explicitly enabled recovery passes: */
|
||||
u64 recovery_passes_explicit;
|
||||
/* bitmask of recovery passes that we actually ran */
|
||||
u64 recovery_passes_complete;
|
||||
/* never rewinds version of curr_recovery_pass */
|
||||
@ -1195,12 +1194,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
|
||||
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
|
||||
{
|
||||
struct timespec64 t;
|
||||
s64 sec;
|
||||
s32 rem;
|
||||
|
||||
time += c->sb.time_base_lo;
|
||||
|
||||
t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
|
||||
t.tv_nsec = rem * c->sb.nsec_per_time_unit;
|
||||
sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
|
||||
|
||||
set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
|
||||
|
||||
return t;
|
||||
}
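Routing the conversion through set_normalized_timespec64() (with the multiply widened to s64) matters mainly, as far as I can tell, for times that fall before the superblock time base: the signed division then yields a negative remainder, and the old direct assignment would have left tv_nsec negative. A small standalone illustration of that case:

/* Hedged illustration of the negative-remainder case fixed by normalizing. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        int64_t time_units_per_sec = 1000000;           /* e.g. 1us time units */
        int64_t nsec_per_time_unit = 1000;
        int64_t time = -1;                              /* one unit before the time base */

        int64_t sec = time / time_units_per_sec;        /* 0: truncating division */
        int64_t rem = time % time_units_per_sec;        /* -1 */
        (void)sec;

        assert(rem * nsec_per_time_unit == -1000);      /* old code: tv_nsec = -1000 */

        /* set_normalized_timespec64(&t, sec, rem * (s64)nsec_per_time_unit)
         * instead yields tv_sec = -1, tv_nsec = 999999000. */
        return 0;
}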
|
||||
|
||||
|
@ -15,11 +15,12 @@
|
||||
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
|
||||
do { \
|
||||
if (shrinker_counter) \
|
||||
bc->not_freed_##counter++; \
|
||||
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
|
||||
} while (0)
|
||||
|
||||
const char * const bch2_btree_node_flags[] = {
|
||||
@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
|
||||
|
||||
void bch2_recalc_btree_reserve(struct bch_fs *c)
|
||||
{
|
||||
unsigned i, reserve = 16;
|
||||
unsigned reserve = 16;
|
||||
|
||||
if (!c->btree_roots_known[0].b)
|
||||
reserve += 8;
|
||||
|
||||
for (i = 0; i < btree_id_nr_alive(c); i++) {
|
||||
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
|
||||
struct btree_root *r = bch2_btree_id_root(c, i);
|
||||
|
||||
if (r->b)
|
||||
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
|
||||
}
|
||||
|
||||
c->btree_cache.reserve = reserve;
|
||||
c->btree_cache.nr_reserve = reserve;
|
||||
}
|
||||
|
||||
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
|
||||
static inline size_t btree_cache_can_free(struct btree_cache_list *list)
|
||||
{
|
||||
return max_t(int, 0, bc->used - bc->reserve);
|
||||
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
|
||||
|
||||
size_t can_free = list->nr;
|
||||
if (!list->idx)
|
||||
can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
|
||||
return can_free;
|
||||
}
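Only the regular live list (idx 0) holds nr_reserve nodes back from the shrinker; the pinned list (idx 1) reports everything it holds as freeable and relies on its higher seeks value (set below) to be reclaimed reluctantly. A standalone recomputation of the logic with made-up numbers:

/* Standalone recomputation of btree_cache_can_free() with made-up numbers. */
#include <assert.h>
#include <stddef.h>
#include <sys/types.h>

static size_t can_free(size_t list_nr, unsigned list_idx, size_t nr_reserve)
{
        size_t nr = list_nr;

        if (!list_idx)          /* only live[0] holds back the reserve */
                nr = (ssize_t)(nr - nr_reserve) > 0 ? nr - nr_reserve : 0;
        return nr;
}

int main(void)
{
        assert(can_free(100, 0, 16) == 84);     /* regular live list */
        assert(can_free(10,  0, 16) == 0);      /* below the reserve: nothing offered */
        assert(can_free(10,  1, 16) == 10);     /* pinned list ignores the reserve */
        return 0;
}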
|
||||
|
||||
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
|
||||
@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
BUG_ON(btree_node_hashed(b));
|
||||
|
||||
/*
|
||||
* This should really be done in slub/vmalloc, but we're using the
|
||||
* kmalloc_large() path, so we're working around a slub bug by doing
|
||||
* this here:
|
||||
*/
|
||||
if (b->data)
|
||||
mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
|
||||
if (b->aux_data)
|
||||
mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
|
||||
|
||||
EBUG_ON(btree_node_write_in_flight(b));
|
||||
|
||||
clear_btree_node_just_written(b);
|
||||
@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
|
||||
#endif
|
||||
b->aux_data = NULL;
|
||||
|
||||
bc->used--;
|
||||
bc->nr_freeable--;
|
||||
|
||||
btree_node_to_freedlist(bc, b);
|
||||
}
|
||||
@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
|
||||
{
|
||||
BUG_ON(b->data || b->aux_data);
|
||||
|
||||
gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
|
||||
|
||||
b->data = kvmalloc(btree_buf_bytes(b), gfp);
|
||||
if (!b->data)
|
||||
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
|
||||
@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
|
||||
|
||||
bch2_btree_lock_init(&b->c, 0);
|
||||
|
||||
bc->used++;
|
||||
bc->nr_freeable++;
|
||||
list_add(&b->list, &bc->freeable);
|
||||
return b;
|
||||
}
|
||||
@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
|
||||
six_unlock_intent(&b->c.lock);
|
||||
}
|
||||
|
||||
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
|
||||
{
|
||||
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
|
||||
|
||||
u64 mask = bc->pinned_nodes_mask[!!b->c.level];
|
||||
|
||||
return ((mask & BIT_ULL(b->c.btree_id)) &&
|
||||
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
|
||||
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
|
||||
}
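The old pinned_nodes_leaf_mask/pinned_nodes_interior_mask pair becomes a two-element array indexed by `!!b->c.level`, so leaves use slot 0 and every interior level uses slot 1, combined with the same per-btree-ID bit and bbpos range checks as before. A tiny standalone illustration of the indexing:

/* Tiny standalone illustration of the mask indexing used above. */
#include <assert.h>
#include <stdint.h>

#define BIT_ULL(n)	(1ULL << (n))

int main(void)
{
        uint64_t pinned_nodes_mask[2] = {
                BIT_ULL(2),     /* [0]: leaf nodes of btree id 2 are pinned */
                0,              /* [1]: no interior nodes pinned */
        };

        unsigned level = 3, btree_id = 2;

        /* any level > 0 selects the interior mask */
        assert(!(pinned_nodes_mask[!!level] & BIT_ULL(btree_id)));

        level = 0;
        assert(pinned_nodes_mask[!!level] & BIT_ULL(btree_id));
        return 0;
}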
|
||||
|
||||
void bch2_node_pin(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
BUG_ON(!__btree_node_pinned(bc, b));
|
||||
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
|
||||
set_btree_node_pinned(b);
|
||||
list_move(&b->list, &bc->live[1].list);
|
||||
bc->live[0].nr--;
|
||||
bc->live[1].nr++;
|
||||
}
|
||||
mutex_unlock(&bc->lock);
|
||||
}
|
||||
|
||||
void bch2_btree_cache_unpin(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b, *n;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
c->btree_cache.pinned_nodes_mask[0] = 0;
|
||||
c->btree_cache.pinned_nodes_mask[1] = 0;
|
||||
|
||||
list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
|
||||
clear_btree_node_pinned(b);
|
||||
list_move(&b->list, &bc->live[0].list);
|
||||
bc->live[0].nr++;
|
||||
bc->live[1].nr--;
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
}
|
||||
|
||||
/* Btree in memory cache - hash table */
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
||||
{
|
||||
lockdep_assert_held(&bc->lock);
|
||||
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
|
||||
|
||||
BUG_ON(ret);
|
||||
@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
||||
b->hash_val = 0;
|
||||
|
||||
if (b->c.btree_id < BTREE_ID_NR)
|
||||
--bc->used_by_btree[b->c.btree_id];
|
||||
--bc->nr_by_btree[b->c.btree_id];
|
||||
|
||||
bc->live[btree_node_pinned(b)].nr--;
|
||||
bc->nr_freeable++;
|
||||
list_move(&b->list, &bc->freeable);
|
||||
}
|
||||
|
||||
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
|
||||
@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
|
||||
|
||||
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
|
||||
bch_btree_cache_params);
|
||||
if (!ret && b->c.btree_id < BTREE_ID_NR)
|
||||
bc->used_by_btree[b->c.btree_id]++;
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (b->c.btree_id < BTREE_ID_NR)
|
||||
bc->nr_by_btree[b->c.btree_id]++;
|
||||
|
||||
bool p = __btree_node_pinned(bc, b);
|
||||
mod_bit(BTREE_NODE_pinned, &b->flags, p);
|
||||
|
||||
list_move_tail(&b->list, &bc->live[p].list);
|
||||
bc->live[p].nr++;
|
||||
|
||||
bc->nr_freeable--;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
|
||||
unsigned level, enum btree_id id)
|
||||
{
|
||||
int ret;
|
||||
|
||||
b->c.level = level;
|
||||
b->c.btree_id = id;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
ret = __bch2_btree_node_hash_insert(bc, b);
|
||||
if (!ret)
|
||||
list_add_tail(&b->list, &bc->live);
|
||||
int ret = __bch2_btree_node_hash_insert(bc, b);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
return ret;
|
||||
@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
|
||||
int ret = 0;
|
||||
|
||||
lockdep_assert_held(&bc->lock);
|
||||
|
||||
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
|
||||
|
||||
u64 mask = b->c.level
|
||||
? bc->pinned_nodes_interior_mask
|
||||
: bc->pinned_nodes_leaf_mask;
|
||||
|
||||
if ((mask & BIT_ULL(b->c.btree_id)) &&
|
||||
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
|
||||
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
|
||||
return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
||||
|
||||
wait_on_io:
|
||||
if (b->flags & ((1U << BTREE_NODE_dirty)|
|
||||
(1U << BTREE_NODE_read_in_flight)|
|
||||
@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
|
||||
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct bch_fs *c = shrink->private_data;
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree_cache_list *list = shrink->private_data;
|
||||
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
|
||||
struct btree *b, *t;
|
||||
unsigned long nr = sc->nr_to_scan;
|
||||
unsigned long can_free = 0;
|
||||
@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
||||
unsigned long touched = 0;
|
||||
unsigned i, flags;
|
||||
unsigned long ret = SHRINK_STOP;
|
||||
bool trigger_writes = atomic_read(&bc->dirty) + nr >=
|
||||
bc->used * 3 / 4;
|
||||
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
|
||||
|
||||
if (bch2_btree_shrinker_disabled)
|
||||
return SHRINK_STOP;
|
||||
@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
||||
* succeed, so that inserting keys into the btree can always succeed and
|
||||
* IO can always make forward progress:
|
||||
*/
|
||||
can_free = btree_cache_can_free(bc);
|
||||
can_free = btree_cache_can_free(list);
|
||||
nr = min_t(unsigned long, nr, can_free);
|
||||
|
||||
i = 0;
|
||||
@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
||||
six_unlock_write(&b->c.lock);
|
||||
six_unlock_intent(&b->c.lock);
|
||||
freed++;
|
||||
bc->freed++;
|
||||
bc->nr_freed++;
|
||||
}
|
||||
}
|
||||
restart:
|
||||
list_for_each_entry_safe(b, t, &bc->live, list) {
|
||||
list_for_each_entry_safe(b, t, &list->list, list) {
|
||||
touched++;
|
||||
|
||||
if (btree_node_accessed(b)) {
|
||||
clear_btree_node_accessed(b);
|
||||
bc->not_freed_access_bit++;
|
||||
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
|
||||
--touched;;
|
||||
} else if (!btree_node_reclaim(c, b, true)) {
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
|
||||
freed++;
|
||||
btree_node_data_free(c, b);
|
||||
bc->freed++;
|
||||
bc->nr_freed++;
|
||||
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
six_unlock_write(&b->c.lock);
|
||||
six_unlock_intent(&b->c.lock);
|
||||
|
||||
@ -450,7 +517,7 @@ restart:
|
||||
!btree_node_will_make_reachable(b) &&
|
||||
!btree_node_write_blocked(b) &&
|
||||
six_trylock_read(&b->c.lock)) {
|
||||
list_move(&bc->live, &b->list);
|
||||
list_move(&list->list, &b->list);
|
||||
mutex_unlock(&bc->lock);
|
||||
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
|
||||
six_unlock_read(&b->c.lock);
|
||||
@ -464,8 +531,8 @@ restart:
|
||||
break;
|
||||
}
|
||||
out_rotate:
|
||||
if (&t->list != &bc->live)
|
||||
list_move_tail(&bc->live, &t->list);
|
||||
if (&t->list != &list->list)
|
||||
list_move_tail(&list->list, &t->list);
|
||||
out:
|
||||
mutex_unlock(&bc->lock);
|
||||
out_nounlock:
|
||||
@ -478,44 +545,45 @@ out_nounlock:
|
||||
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct bch_fs *c = shrink->private_data;
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree_cache_list *list = shrink->private_data;
|
||||
|
||||
if (bch2_btree_shrinker_disabled)
|
||||
return 0;
|
||||
|
||||
return btree_cache_can_free(bc);
|
||||
return btree_cache_can_free(list);
|
||||
}
|
||||
|
||||
void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
unsigned i, flags;
|
||||
struct btree *b, *t;
|
||||
unsigned long flags;
|
||||
|
||||
shrinker_free(bc->shrink);
|
||||
shrinker_free(bc->live[1].shrink);
|
||||
shrinker_free(bc->live[0].shrink);
|
||||
|
||||
/* vfree() can allocate memory: */
|
||||
flags = memalloc_nofs_save();
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
if (c->verify_data)
|
||||
list_move(&c->verify_data->list, &bc->live);
|
||||
list_move(&c->verify_data->list, &bc->live[0].list);
|
||||
|
||||
kvfree(c->verify_ondisk);
|
||||
|
||||
for (i = 0; i < btree_id_nr_alive(c); i++) {
|
||||
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
|
||||
struct btree_root *r = bch2_btree_id_root(c, i);
|
||||
|
||||
if (r->b)
|
||||
list_add(&r->b->list, &bc->live);
|
||||
list_add(&r->b->list, &bc->live[0].list);
|
||||
}
|
||||
|
||||
list_splice(&bc->freeable, &bc->live);
|
||||
|
||||
while (!list_empty(&bc->live)) {
|
||||
b = list_first_entry(&bc->live, struct btree, list);
|
||||
list_for_each_entry_safe(b, t, &bc->live[1].list, list)
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
list_for_each_entry_safe(b, t, &bc->live[0].list, list)
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
|
||||
list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
||||
BUG_ON(btree_node_read_in_flight(b) ||
|
||||
btree_node_write_in_flight(b));
|
||||
|
||||
@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
||||
}
|
||||
|
||||
BUG_ON(!bch2_journal_error(&c->journal) &&
|
||||
atomic_read(&c->btree_cache.dirty));
|
||||
atomic_long_read(&c->btree_cache.nr_dirty));
|
||||
|
||||
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
|
||||
|
||||
while (!list_empty(&bc->freed_nonpcpu)) {
|
||||
b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
|
||||
list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
|
||||
list_del(&b->list);
|
||||
six_lock_exit(&b->c.lock);
|
||||
kfree(b);
|
||||
@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
||||
mutex_unlock(&bc->lock);
|
||||
memalloc_nofs_restore(flags);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
|
||||
BUG_ON(bc->nr_by_btree[i]);
|
||||
BUG_ON(bc->live[0].nr);
|
||||
BUG_ON(bc->live[1].nr);
|
||||
BUG_ON(bc->nr_freeable);
|
||||
|
||||
if (bc->table_init_done)
|
||||
rhashtable_destroy(&bc->table);
|
||||
}
|
||||
@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
|
||||
|
||||
bch2_recalc_btree_reserve(c);
|
||||
|
||||
for (i = 0; i < bc->reserve; i++)
|
||||
for (i = 0; i < bc->nr_reserve; i++)
|
||||
if (!__bch2_btree_node_mem_alloc(c))
|
||||
goto err;
|
||||
|
||||
list_splice_init(&bc->live, &bc->freeable);
|
||||
list_splice_init(&bc->live[0].list, &bc->freeable);
|
||||
|
||||
mutex_init(&c->verify_lock);
|
||||
|
||||
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
|
||||
if (!shrink)
|
||||
goto err;
|
||||
bc->shrink = shrink;
|
||||
bc->live[0].shrink = shrink;
|
||||
shrink->count_objects = bch2_btree_cache_count;
|
||||
shrink->scan_objects = bch2_btree_cache_scan;
|
||||
shrink->seeks = 4;
|
||||
shrink->private_data = c;
|
||||
shrink->seeks = 2;
|
||||
shrink->private_data = &bc->live[0];
|
||||
shrinker_register(shrink);
|
||||
|
||||
shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
|
||||
if (!shrink)
|
||||
goto err;
|
||||
bc->live[1].shrink = shrink;
|
||||
shrink->count_objects = bch2_btree_cache_count;
|
||||
shrink->scan_objects = bch2_btree_cache_scan;
|
||||
shrink->seeks = 8;
|
||||
shrink->private_data = &bc->live[1];
|
||||
shrinker_register(shrink);
|
||||
|
||||
return 0;
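Each list now registers its own shrinker: seeks drops from 4 to 2 for the regular list and is 8 for the pinned list. Since the VM scales a shrinker's scan target inversely with seeks (roughly delta being freeable * 4 / seeks in mm/vmscan.c, if I remember the formula right), the pinned cache ends up under about a quarter of the pressure of the regular one. A rough, hedged illustration of the relative scaling:

/* Rough, hedged illustration of how `seeks` scales shrinker pressure;
 * modeled loosely on do_shrink_slab(), not a faithful reimplementation. */
#include <stdio.h>

static unsigned long scan_delta(unsigned long freeable, unsigned seeks)
{
        return freeable * 4 / seeks;
}

int main(void)
{
        unsigned long freeable = 1000;

        printf("live list   (seeks=2): scan ~%lu\n", scan_delta(freeable, 2));
        printf("pinned list (seeks=8): scan ~%lu\n", scan_delta(freeable, 8));
        return 0;
}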
|
||||
@ -582,7 +665,10 @@ err:
|
||||
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
|
||||
{
|
||||
mutex_init(&bc->lock);
|
||||
INIT_LIST_HEAD(&bc->live);
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
|
||||
bc->live[i].idx = i;
|
||||
INIT_LIST_HEAD(&bc->live[i].list);
|
||||
}
|
||||
INIT_LIST_HEAD(&bc->freeable);
|
||||
INIT_LIST_HEAD(&bc->freed_pcpu);
|
||||
INIT_LIST_HEAD(&bc->freed_nonpcpu);
|
||||
@ -644,12 +730,14 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
list_for_each_entry_reverse(b, &bc->live, list)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
|
||||
list_for_each_entry_reverse(b, &bc->live[i].list, list)
|
||||
if (!btree_node_reclaim(c, b, false))
|
||||
return b;
|
||||
|
||||
while (1) {
|
||||
list_for_each_entry_reverse(b, &bc->live, list)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
|
||||
list_for_each_entry_reverse(b, &bc->live[i].list, list)
|
||||
if (!btree_node_write_and_reclaim(c, b))
|
||||
return b;
|
||||
|
||||
@ -716,14 +804,15 @@ got_node:
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
|
||||
if (memalloc_flags_do(PF_MEMALLOC_NORECLAIM,
|
||||
btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))) {
|
||||
bch2_trans_unlock(trans);
|
||||
if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
|
||||
goto err;
|
||||
}
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
bc->used++;
|
||||
bc->nr_freeable++;
|
||||
got_mem:
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
@ -1264,8 +1353,8 @@ wait_on_io:
|
||||
BUG_ON(btree_node_dirty(b));
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
btree_node_data_free(c, b);
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
btree_node_data_free(c, b);
|
||||
mutex_unlock(&bc->lock);
|
||||
out:
|
||||
six_unlock_write(&b->c.lock);
|
||||
@ -1337,13 +1426,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
|
||||
}
|
||||
|
||||
static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
|
||||
const char *label, unsigned nr)
|
||||
const char *label, size_t nr)
|
||||
{
|
||||
prt_printf(out, "%s\t", label);
|
||||
prt_human_readable_u64(out, nr * c->opts.btree_node_size);
|
||||
prt_printf(out, " (%u)\n", nr);
|
||||
prt_printf(out, " (%zu)\n", nr);
|
||||
}
|
||||
|
||||
static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
|
||||
#define x(n) #n,
|
||||
BCH_BTREE_CACHE_NOT_FREED_REASONS()
|
||||
#undef x
|
||||
NULL
|
||||
};
|
||||
|
||||
void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
|
||||
{
|
||||
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
|
||||
@ -1351,24 +1447,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
|
||||
if (!out->nr_tabstops)
|
||||
printbuf_tabstop_push(out, 32);
|
||||
|
||||
prt_btree_cache_line(out, c, "total:", bc->used);
|
||||
prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
|
||||
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
|
||||
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
|
||||
prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
|
||||
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
|
||||
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
|
||||
prt_newline(out);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
|
||||
prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
|
||||
prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
|
||||
|
||||
prt_newline(out);
|
||||
prt_printf(out, "freed:\t%u\n", bc->freed);
|
||||
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
|
||||
prt_printf(out, "not freed:\n");
|
||||
prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
|
||||
prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
|
||||
prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
|
||||
prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
|
||||
prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
|
||||
prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
|
||||
prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
|
||||
prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
|
||||
prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
|
||||
prt_printf(out, " %s\t%llu\n",
|
||||
bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
|
||||
}
|
||||
|
@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
|
||||
unsigned, enum btree_id);
|
||||
|
||||
void bch2_node_pin(struct bch_fs *, struct btree *);
|
||||
void bch2_btree_cache_unpin(struct bch_fs *);
|
||||
|
||||
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
|
||||
struct bkey_s_c, struct bkey_i *);
|
||||
|
||||
|
@ -549,9 +549,8 @@ reconstruct_root:
|
||||
six_unlock_read(&b->c.lock);
|
||||
|
||||
if (ret == DROP_THIS_NODE) {
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
list_move(&b->list, &c->btree_cache.freeable);
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
r->b = NULL;
|
||||
@ -753,10 +752,8 @@ static void bch2_gc_free(struct bch_fs *c)
|
||||
genradix_free(&c->reflink_gc_table);
|
||||
genradix_free(&c->gc_stripes);
|
||||
|
||||
for_each_member_device(c, ca) {
|
||||
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
|
||||
ca->buckets_gc = NULL;
|
||||
}
|
||||
for_each_member_device(c, ca)
|
||||
genradix_free(&ca->buckets_gc);
|
||||
}
|
||||
|
||||
static int bch2_gc_start(struct bch_fs *c)
|
||||
@ -910,20 +907,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
|
||||
int ret = 0;
|
||||
|
||||
for_each_member_device(c, ca) {
|
||||
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
|
||||
ca->mi.nbuckets * sizeof(struct bucket),
|
||||
GFP_KERNEL|__GFP_ZERO);
|
||||
if (!buckets) {
|
||||
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
|
||||
if (ret) {
|
||||
bch2_dev_put(ca);
|
||||
ret = -BCH_ERR_ENOMEM_gc_alloc_start;
|
||||
break;
|
||||
}
|
||||
|
||||
buckets->first_bucket = ca->mi.first_bucket;
|
||||
buckets->nbuckets = ca->mi.nbuckets;
|
||||
buckets->nbuckets_minus_first =
|
||||
buckets->nbuckets - buckets->first_bucket;
|
||||
rcu_assign_pointer(ca->buckets_gc, buckets);
|
||||
}
|
||||
|
||||
bch_err_fn(c, ret);
|
||||
|
@ -1666,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
|
||||
bch2_btree_pos_to_text(&buf, c, b);
|
||||
bch_err_ratelimited(c, "%s", buf.buf);
|
||||
|
||||
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
|
||||
if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
|
||||
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
|
||||
bch2_fatal_error(c);
|
||||
|
||||
@ -1749,10 +1749,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
|
||||
bch2_btree_node_read(trans, b, true);
|
||||
|
||||
if (btree_node_read_error(b)) {
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
list_move(&b->list, &c->btree_cache.freeable);
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
ret = -BCH_ERR_btree_node_read_error;
|
||||
@ -2031,7 +2029,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
|
||||
do_write:
|
||||
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
|
||||
|
||||
atomic_dec(&c->btree_cache.dirty);
|
||||
atomic_long_dec(&c->btree_cache.nr_dirty);
|
||||
|
||||
BUG_ON(btree_node_fake(b));
|
||||
BUG_ON((b->will_make_reachable != 0) != !b->written);
|
||||
|
@ -18,13 +18,13 @@ struct btree_node_read_all;
|
||||
static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
|
||||
atomic_inc(&c->btree_cache.dirty);
|
||||
atomic_long_inc(&c->btree_cache.nr_dirty);
|
||||
}
|
||||
|
||||
static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
|
||||
atomic_dec(&c->btree_cache.dirty);
|
||||
atomic_long_dec(&c->btree_cache.nr_dirty);
|
||||
}
|
||||
|
||||
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
|
||||
|
@ -6,6 +6,8 @@
|
||||
#include "btree_types.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
|
||||
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
|
||||
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
|
||||
@ -529,6 +531,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *);
|
||||
|
||||
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
|
||||
|
||||
/**
|
||||
* bch2_trans_kmalloc - allocate memory for use by the current transaction
|
||||
*
|
||||
* Must be called after bch2_trans_begin, which on second and further calls
|
||||
* frees all memory allocated in this transaction
|
||||
*/
|
||||
static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
|
||||
{
|
||||
size = roundup(size, 8);
|
||||
@ -865,13 +873,19 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
|
||||
(_do) ?: bch2_trans_relock(_trans); \
|
||||
})
|
||||
|
||||
#define memalloc_flags_do(_flags, _do) \
|
||||
({ \
|
||||
unsigned _saved_flags = memalloc_flags_save(_flags); \
|
||||
typeof(_do) _ret = _do; \
|
||||
memalloc_noreclaim_restore(_saved_flags); \
|
||||
_ret; \
|
||||
})
|
||||
|
||||
#define allocate_dropping_locks_errcode(_trans, _do) \
|
||||
({ \
|
||||
gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
|
||||
int _ret = _do; \
|
||||
int _ret = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
|
||||
\
|
||||
if (bch2_err_matches(_ret, ENOMEM)) { \
|
||||
_gfp = GFP_KERNEL; \
|
||||
_ret = drop_locks_do(_trans, _do); \
|
||||
} \
|
||||
_ret; \
|
||||
@ -879,12 +893,10 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
|
||||
|
||||
#define allocate_dropping_locks(_trans, _ret, _do) \
|
||||
({ \
|
||||
gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
|
||||
typeof(_do) _p = _do; \
|
||||
typeof(_do) _p = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\
|
||||
\
|
||||
_ret = 0; \
|
||||
if (unlikely(!_p)) { \
|
||||
_gfp = GFP_KERNEL; \
|
||||
_ret = drop_locks_do(_trans, ((_p = _do), 0)); \
|
||||
} \
|
||||
_p; \
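With this rework, callers no longer receive a `_gfp` variable: the first attempt runs the allocation with PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN set, so a GFP_KERNEL request cannot recurse into reclaim while btree locks are held, and only if that fails are the locks dropped and the same expression retried with full GFP_KERNEL semantics. The acl.c hunks earlier in this commit show the new call shape (fragment, not a standalone program):

	acl = allocate_dropping_locks(trans, ret,
				      posix_acl_alloc(count, GFP_KERNEL));

	/* and the error-code variant: */
	ret = allocate_dropping_locks_errcode(trans,
				__posix_acl_chmod(&acl, GFP_KERNEL, mode));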
|
||||
|
@@ -530,6 +530,8 @@ static void __journal_keys_sort(struct journal_keys *keys)
 {
 	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
 
+	cond_resched();
+
 	struct journal_key *dst = keys->data;
 
 	darray_for_each(*keys, src) {
@ -116,8 +116,10 @@ static void bkey_cached_free(struct btree_key_cache *bc,
|
||||
this_cpu_inc(*bc->nr_pending);
|
||||
}
|
||||
|
||||
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
|
||||
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL|__GFP_ACCOUNT|__GFP_RECLAIMABLE;
|
||||
|
||||
struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
|
||||
if (unlikely(!ck))
|
||||
return NULL;
|
||||
@ -145,7 +147,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
|
||||
goto lock;
|
||||
|
||||
ck = allocate_dropping_locks(trans, ret,
|
||||
__bkey_cached_alloc(key_u64s, _gfp));
|
||||
__bkey_cached_alloc(key_u64s));
|
||||
if (ret) {
|
||||
if (ck)
|
||||
kfree(ck->k);
|
||||
@ -239,7 +241,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
|
||||
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
|
||||
|
||||
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
|
||||
kmalloc(key_u64s * sizeof(u64), _gfp));
|
||||
kmalloc(key_u64s * sizeof(u64), GFP_KERNEL));
|
||||
if (unlikely(!new_k)) {
|
||||
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
|
||||
bch2_btree_id_str(ck->key.btree_id), key_u64s);
|
||||
|
@ -138,6 +138,31 @@ struct btree {
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
|
||||
x(lock_intent) \
|
||||
x(lock_write) \
|
||||
x(dirty) \
|
||||
x(read_in_flight) \
|
||||
x(write_in_flight) \
|
||||
x(noevict) \
|
||||
x(write_blocked) \
|
||||
x(will_make_reachable) \
|
||||
x(access_bit)
|
||||
|
||||
enum bch_btree_cache_not_freed_reasons {
|
||||
#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
|
||||
BCH_BTREE_CACHE_NOT_FREED_REASONS()
|
||||
#undef x
|
||||
BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
|
||||
};
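The per-reason not_freed counters collapse into an array indexed by this enum, and the same x-macro list also generates the string table printed by bch2_btree_cache_to_text(). A standalone miniature of the x-macro technique:

/* Standalone miniature of the x-macro technique used above. */
#include <stdio.h>

#define REASONS()		\
	x(dirty)		\
	x(read_in_flight)	\
	x(access_bit)

enum reason {
#define x(n) REASON_##n,
	REASONS()
#undef x
	REASON_NR,
};

static const char * const reason_strs[] = {
#define x(n) #n,
	REASONS()
#undef x
	NULL
};

int main(void)
{
        unsigned long counters[REASON_NR] = { 0 };

        counters[REASON_dirty]++;
        for (unsigned i = 0; i < REASON_NR; i++)
                printf("%-16s %lu\n", reason_strs[i], counters[i]);
        return 0;
}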
|
||||
|
||||
struct btree_cache_list {
|
||||
unsigned idx;
|
||||
struct shrinker *shrink;
|
||||
struct list_head list;
|
||||
size_t nr;
|
||||
};
|
||||
|
||||
struct btree_cache {
|
||||
struct rhashtable table;
|
||||
bool table_init_done;
|
||||
@ -155,28 +180,19 @@ struct btree_cache {
|
||||
* should never grow past ~2-3 nodes in practice.
|
||||
*/
|
||||
struct mutex lock;
|
||||
struct list_head live;
|
||||
struct list_head freeable;
|
||||
struct list_head freed_pcpu;
|
||||
struct list_head freed_nonpcpu;
|
||||
struct btree_cache_list live[2];
|
||||
|
||||
/* Number of elements in live + freeable lists */
|
||||
unsigned used;
|
||||
unsigned reserve;
|
||||
unsigned freed;
|
||||
unsigned not_freed_lock_intent;
|
||||
unsigned not_freed_lock_write;
|
||||
unsigned not_freed_dirty;
|
||||
unsigned not_freed_read_in_flight;
|
||||
unsigned not_freed_write_in_flight;
|
||||
unsigned not_freed_noevict;
|
||||
unsigned not_freed_write_blocked;
|
||||
unsigned not_freed_will_make_reachable;
|
||||
unsigned not_freed_access_bit;
|
||||
atomic_t dirty;
|
||||
struct shrinker *shrink;
|
||||
size_t nr_freeable;
|
||||
size_t nr_reserve;
|
||||
size_t nr_by_btree[BTREE_ID_NR];
|
||||
atomic_long_t nr_dirty;
|
||||
|
||||
unsigned used_by_btree[BTREE_ID_NR];
|
||||
/* shrinker stats */
|
||||
size_t nr_freed;
|
||||
u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
|
||||
|
||||
/*
|
||||
* If we need to allocate memory for a new btree node and that
|
||||
@ -189,8 +205,8 @@ struct btree_cache {
|
||||
|
||||
struct bbpos pinned_nodes_start;
|
||||
struct bbpos pinned_nodes_end;
|
||||
u64 pinned_nodes_leaf_mask;
|
||||
u64 pinned_nodes_interior_mask;
|
||||
/* btree id mask: 0 for leaves, 1 for interior */
|
||||
u64 pinned_nodes_mask[2];
|
||||
};
|
||||
|
||||
struct btree_node_iter {
|
||||
@ -582,7 +598,8 @@ enum btree_write_type {
|
||||
x(dying) \
|
||||
x(fake) \
|
||||
x(need_rewrite) \
|
||||
x(never_write)
|
||||
x(never_write) \
|
||||
x(pinned)
|
||||
|
||||
enum btree_flags {
|
||||
/* First bits for btree node write type */
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "clock.h"
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
#include "io_write.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
@ -145,7 +146,7 @@ fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
topology_repair:
|
||||
if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
|
||||
if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
|
||||
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
|
||||
bch2_inconsistent_error(c);
|
||||
ret = -BCH_ERR_btree_need_topology_repair;
|
||||
@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
|
||||
unsigned i, level = b->c.level;
|
||||
|
||||
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
|
||||
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
__btree_node_free(trans, b);
|
||||
|
||||
six_unlock_write(&b->c.lock);
|
||||
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
|
||||
|
||||
@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
|
||||
clear_btree_node_need_write(b);
|
||||
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
list_del_init(&b->list);
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
|
||||
six_unlock_intent(&n->c.lock);
|
||||
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
list_add_tail(&b->list, &c->btree_cache.live);
|
||||
list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
|
||||
mutex_unlock(&c->btree_cache.lock);
|
||||
|
||||
bch2_trans_verify_locks(trans);
|
||||
|
@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out,
|
||||
struct bch_dev *ca,
|
||||
struct bch_dev_usage *usage)
|
||||
{
|
||||
if (out->nr_tabstops < 5) {
|
||||
printbuf_tabstops_reset(out);
|
||||
printbuf_tabstop_push(out, 12);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
printbuf_tabstop_push(out, 16);
|
||||
}
|
||||
|
||||
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
|
||||
|
||||
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
|
||||
@ -100,7 +109,8 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
|
||||
|
||||
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
|
||||
if (!ca) {
|
||||
if (fsck_err(trans, ptr_to_invalid_device,
|
||||
if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
|
||||
trans, ptr_to_invalid_device,
|
||||
"pointer to missing device %u\n"
|
||||
"while marking %s",
|
||||
p.ptr.dev,
|
||||
@ -476,7 +486,7 @@ out:
|
||||
return ret;
|
||||
err:
|
||||
bch2_dump_trans_updates(trans);
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_bucket_ref_update;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -562,8 +572,8 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
|
||||
if (unlikely(!ca)) {
|
||||
if (insert)
|
||||
ret = -EIO;
|
||||
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
|
||||
ret = -BCH_ERR_trigger_pointer;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -592,7 +602,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
|
||||
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
|
||||
p.ptr.dev,
|
||||
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_trigger_pointer;
|
||||
goto err_unlock;
|
||||
}
|
||||
|
||||
@ -637,7 +647,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
|
||||
bch2_trans_inconsistent(trans,
|
||||
"stripe pointer doesn't match stripe %llu",
|
||||
(u64) p.ec.idx);
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_trigger_stripe_pointer;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -676,7 +686,7 @@ err:
|
||||
(u64) p.ec.idx, buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
bch2_inconsistent_error(c);
|
||||
return -EIO;
|
||||
return -BCH_ERR_trigger_stripe_pointer;
|
||||
}
|
||||
|
||||
m->block_sectors[p.ec.block] += sectors;
|
||||
@ -740,7 +750,7 @@ static int __trigger_extent(struct btree_trans *trans,
|
||||
return ret;
|
||||
} else if (!p.has_ec) {
|
||||
*replicas_sectors += disk_sectors;
|
||||
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
|
||||
replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
|
||||
} else {
|
||||
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
|
||||
if (ret)
|
||||
@ -876,7 +886,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
|
||||
need_rebalance_delta -= s != 0;
|
||||
need_rebalance_sectors_delta -= s;
|
||||
|
||||
s = bch2_bkey_sectors_need_rebalance(c, old);
|
||||
s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
|
||||
need_rebalance_delta += s != 0;
|
||||
need_rebalance_sectors_delta += s;
|
||||
|
||||
@ -956,7 +966,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
|
||||
bch2_data_type_str(a->v.data_type),
|
||||
bch2_data_type_str(type),
|
||||
bch2_data_type_str(type));
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_metadata_bucket_inconsistency;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -1012,7 +1022,7 @@ err:
|
||||
bucket_unlock(g);
|
||||
err_unlock:
|
||||
percpu_up_read(&c->mark_lock);
|
||||
return -EIO;
|
||||
return -BCH_ERR_metadata_bucket_inconsistency;
|
||||
}
|
||||
|
||||
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
|
||||
|
@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
|
||||
static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
|
||||
{
|
||||
return rcu_dereference_check(ca->buckets_gc,
|
||||
!ca->fs ||
|
||||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
|
||||
lockdep_is_held(&ca->fs->state_lock) ||
|
||||
lockdep_is_held(&ca->bucket_lock));
|
||||
}
|
||||
|
||||
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
|
||||
{
|
||||
struct bucket_array *buckets = gc_bucket_array(ca);
|
||||
|
||||
if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
|
||||
return NULL;
|
||||
return buckets->b + b;
|
||||
return genradix_ptr(&ca->buckets_gc, b);
|
||||
}
|
||||
|
||||
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
|
||||
|
@ -19,14 +19,6 @@ struct bucket {
|
||||
u32 stripe_sectors;
|
||||
} __aligned(sizeof(long));
|
||||
|
||||
struct bucket_array {
|
||||
struct rcu_head rcu;
|
||||
u16 first_bucket;
|
||||
size_t nbuckets;
|
||||
size_t nbuckets_minus_first;
|
||||
struct bucket b[] __counted_by(nbuckets);
|
||||
};
|
||||
|
||||
struct bucket_gens {
|
||||
struct rcu_head rcu;
|
||||
u16 first_bucket;
|
||||
|
@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
|
||||
struct scatterlist *sg, size_t len)
|
||||
{
|
||||
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
|
||||
int ret;
|
||||
|
||||
skcipher_request_set_sync_tfm(req, tfm);
|
||||
skcipher_request_set_callback(req, 0, NULL, NULL);
|
||||
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
|
||||
|
||||
ret = crypto_skcipher_encrypt(req);
|
||||
int ret = crypto_skcipher_encrypt(req);
|
||||
if (ret)
|
||||
pr_err("got error %i from crypto_skcipher_encrypt()", ret);
|
||||
|
||||
@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
|
||||
void *buf, size_t len)
|
||||
{
|
||||
if (!is_vmalloc_addr(buf)) {
|
||||
struct scatterlist sg;
|
||||
struct scatterlist sg = {};
|
||||
|
||||
sg_init_table(&sg, 1);
|
||||
sg_set_page(&sg,
|
||||
is_vmalloc_addr(buf)
|
||||
? vmalloc_to_page(buf)
|
||||
: virt_to_page(buf),
|
||||
len, offset_in_page(buf));
|
||||
sg_mark_end(&sg);
|
||||
sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
|
||||
return do_encrypt_sg(tfm, nonce, &sg, len);
|
||||
} else {
|
||||
unsigned pages = buf_pages(buf, len);
|
||||
struct scatterlist *sg;
|
||||
size_t orig_len = len;
|
||||
int ret, i;
|
||||
DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
|
||||
size_t sgl_len = 0;
|
||||
int ret;
|
||||
|
||||
sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
|
||||
if (!sg)
|
||||
return -BCH_ERR_ENOMEM_do_encrypt;
|
||||
darray_init(&sgl);
|
||||
|
||||
sg_init_table(sg, pages);
|
||||
|
||||
for (i = 0; i < pages; i++) {
|
||||
while (len) {
|
||||
unsigned offset = offset_in_page(buf);
|
||||
unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
|
||||
struct scatterlist sg = {
|
||||
.page_link = (unsigned long) vmalloc_to_page(buf),
|
||||
.offset = offset,
|
||||
.length = min(len, PAGE_SIZE - offset),
|
||||
};
|
||||
|
||||
sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
|
||||
buf += pg_len;
|
||||
len -= pg_len;
|
||||
if (darray_push(&sgl, sg)) {
|
||||
sg_mark_end(&darray_last(sgl));
|
||||
ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
nonce = nonce_add(nonce, sgl_len);
|
||||
sgl_len = 0;
|
||||
sgl.nr = 0;
|
||||
BUG_ON(darray_push(&sgl, sg));
|
||||
}
|
||||
|
||||
ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
|
||||
kfree(sg);
|
||||
buf += sg.length;
|
||||
len -= sg.length;
|
||||
sgl_len += sg.length;
|
||||
}
|
||||
|
||||
sg_mark_end(&darray_last(sgl));
|
||||
ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
|
||||
err:
|
||||
darray_exit(&sgl);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
{
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
struct scatterlist sgl[16], *sg = sgl;
|
||||
size_t bytes = 0;
|
||||
DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
|
||||
size_t sgl_len = 0;
|
||||
int ret = 0;
|
||||
|
||||
if (!bch2_csum_type_is_encryption(type))
|
||||
return 0;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
darray_init(&sgl);
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
if (sg == sgl + ARRAY_SIZE(sgl)) {
|
||||
sg_mark_end(sg - 1);
|
||||
struct scatterlist sg = {
|
||||
.page_link = (unsigned long) bv.bv_page,
|
||||
.offset = bv.bv_offset,
|
||||
.length = bv.bv_len,
|
||||
};
|
||||
|
||||
ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
if (darray_push(&sgl, sg)) {
|
||||
sg_mark_end(&darray_last(sgl));
|
||||
ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto err;
|
||||
|
||||
nonce = nonce_add(nonce, bytes);
|
||||
bytes = 0;
|
||||
nonce = nonce_add(nonce, sgl_len);
|
||||
sgl_len = 0;
|
||||
sgl.nr = 0;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
sg = sgl;
|
||||
BUG_ON(darray_push(&sgl, sg));
|
||||
}
|
||||
|
||||
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
|
||||
bytes += bv.bv_len;
|
||||
}
|
||||
|
||||
if (sg != sgl) {
|
||||
sg_mark_end(sg - 1);
|
||||
return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
sgl_len += sg.length;
|
||||
}
|
||||
|
||||
sg_mark_end(&darray_last(sgl));
|
||||
ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
|
||||
err:
|
||||
darray_exit(&sgl);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -337,6 +337,7 @@ restart_drop_extra_replicas:
|
||||
printbuf_exit(&buf);
|
||||
|
||||
bch2_fatal_error(c);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -570,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
|
||||
while (data_opts.kill_ptrs) {
|
||||
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
|
||||
|
||||
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
|
||||
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
|
||||
data_opts.kill_ptrs ^= 1U << drop;
|
||||
}
|
||||
|
||||
|
456
libbcachefs/ec.c
@ -18,6 +18,7 @@
|
||||
#include "ec.h"
|
||||
#include "error.h"
|
||||
#include "io_read.h"
|
||||
#include "io_write.h"
|
||||
#include "keylist.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
bch2_prt_csum_type(out, s.csum_type);
|
||||
prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
|
||||
|
||||
if (s.disk_label) {
|
||||
prt_str(out, " label");
|
||||
bch2_disk_path_to_text(out, c, s.disk_label - 1);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < s.nr_blocks; i++) {
|
||||
const struct bch_extent_ptr *ptr = sp->ptrs + i;
|
||||
|
||||
if ((void *) ptr >= bkey_val_end(k))
|
||||
break;
|
||||
|
||||
prt_char(out, ' ');
|
||||
bch2_extent_ptr_to_text(out, c, ptr);
|
||||
|
||||
if (s.csum_type < BCH_CSUM_NR &&
|
||||
@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
|
||||
a->dirty_sectors,
|
||||
a->stripe, s.k->p.offset,
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
|
||||
a->dirty_sectors,
|
||||
a->cached_sectors,
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err;
|
||||
}
|
||||
} else {
|
||||
@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
|
||||
bucket.inode, bucket.offset, a->gen,
|
||||
a->stripe,
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
|
||||
bch2_data_type_str(a->data_type),
|
||||
bch2_data_type_str(data_type),
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
|
||||
a->dirty_sectors,
|
||||
a->cached_sectors,
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
|
||||
|
||||
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
|
||||
if (unlikely(!ca)) {
|
||||
if (!(flags & BTREE_TRIGGER_overwrite))
|
||||
ret = -EIO;
|
||||
if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
|
||||
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
|
||||
ptr->dev,
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_mark_stripe;
|
||||
goto err_unlock;
|
||||
}
|
||||
|
||||
@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
|
||||
{
|
||||
m->sectors = le16_to_cpu(s->sectors);
|
||||
m->algorithm = s->algorithm;
|
||||
m->nr_blocks = s->nr_blocks;
|
||||
m->nr_redundant = s->nr_redundant;
|
||||
m->disk_label = s->disk_label;
|
||||
m->blocks_nonempty = 0;
|
||||
|
||||
for (unsigned i = 0; i < s->nr_blocks; i++)
|
||||
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
|
||||
}
|
||||
|
||||
int bch2_trigger_stripe(struct btree_trans *trans,
|
||||
enum btree_id btree, unsigned level,
|
||||
struct bkey_s_c old, struct bkey_s _new,
|
||||
@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
|
||||
|
||||
memset(m, 0, sizeof(*m));
|
||||
} else {
|
||||
m->sectors = le16_to_cpu(new_s->sectors);
|
||||
m->algorithm = new_s->algorithm;
|
||||
m->nr_blocks = new_s->nr_blocks;
|
||||
m->nr_redundant = new_s->nr_redundant;
|
||||
m->blocks_nonempty = 0;
|
||||
|
||||
for (unsigned i = 0; i < new_s->nr_blocks; i++)
|
||||
m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
|
||||
stripe_to_mem(m, new_s);
|
||||
|
||||
if (!old_s)
|
||||
bch2_stripes_heap_insert(c, m, idx);
|
||||
@ -816,13 +829,15 @@ err:
|
||||
}
|
||||
|
||||
/* recovery read path: */
|
||||
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
|
||||
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
|
||||
struct bkey_s_c orig_k)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct ec_stripe_buf *buf;
|
||||
struct ec_stripe_buf *buf = NULL;
|
||||
struct closure cl;
|
||||
struct bch_stripe *v;
|
||||
unsigned i, offset;
|
||||
const char *msg = NULL;
|
||||
int ret = 0;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
@ -835,32 +850,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
|
||||
|
||||
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
|
||||
if (ret) {
|
||||
bch_err_ratelimited(c,
|
||||
"error doing reconstruct read: error %i looking up stripe", ret);
|
||||
kfree(buf);
|
||||
return -EIO;
|
||||
msg = "stripe not found";
|
||||
goto err;
|
||||
}
|
||||
|
||||
v = &bkey_i_to_stripe(&buf->key)->v;
|
||||
|
||||
if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
|
||||
bch_err_ratelimited(c,
|
||||
"error doing reconstruct read: pointer doesn't match stripe");
|
||||
ret = -EIO;
|
||||
msg = "pointer doesn't match stripe";
|
||||
goto err;
|
||||
}
|
||||
|
||||
offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
|
||||
if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
|
||||
bch_err_ratelimited(c,
|
||||
"error doing reconstruct read: read is bigger than stripe");
|
||||
ret = -EIO;
|
||||
msg = "read is bigger than stripe";
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
|
||||
if (ret)
|
||||
if (ret) {
|
||||
msg = "-ENOMEM";
|
||||
goto err;
|
||||
}
|
||||
|
||||
for (i = 0; i < v->nr_blocks; i++)
|
||||
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
|
||||
@ -868,9 +879,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
|
||||
closure_sync(&cl);
|
||||
|
||||
if (ec_nr_failed(buf) > v->nr_redundant) {
|
||||
bch_err_ratelimited(c,
|
||||
"error doing reconstruct read: unable to read enough blocks");
|
||||
ret = -EIO;
|
||||
msg = "unable to read enough blocks";
|
||||
goto err;
|
||||
}
|
||||
|
||||
@ -882,20 +891,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
|
||||
|
||||
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
|
||||
buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
|
||||
err:
out:
	ec_stripe_buf_exit(buf);
	kfree(buf);
	return ret;
err:
	struct printbuf msgbuf = PRINTBUF;
	bch2_bkey_val_to_text(&msgbuf, c, orig_k);
	bch_err_ratelimited(c,
			    "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
	printbuf_exit(&msgbuf);
	ret = -BCH_ERR_stripe_reconstruct;
	goto out;
}
|
||||
|
||||
/* stripe bucket accounting: */
|
||||
|
||||
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
|
||||
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx)
|
||||
{
|
||||
ec_stripes_heap n, *h = &c->ec_stripes_heap;
|
||||
|
||||
if (idx >= h->size) {
|
||||
if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
|
||||
if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), GFP_KERNEL))
|
||||
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
|
||||
|
||||
mutex_lock(&c->ec_stripes_heap_lock);
|
||||
@ -909,11 +926,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
|
||||
free_heap(&n);
|
||||
}
|
||||
|
||||
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
|
||||
if (!genradix_ptr_alloc(&c->stripes, idx, GFP_KERNEL))
|
||||
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
|
||||
|
||||
if (c->gc_pos.phase != GC_PHASE_not_running &&
|
||||
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
|
||||
!genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL))
|
||||
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
|
||||
|
||||
return 0;
|
||||
@ -923,7 +940,7 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
|
||||
struct btree_iter *iter)
|
||||
{
|
||||
return allocate_dropping_locks_errcode(trans,
|
||||
__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
|
||||
__ec_stripe_mem_alloc(trans->c, iter->pos.offset));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
|
||||
|
||||
bkey_reassemble(n, k);
|
||||
|
||||
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
|
||||
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
|
||||
ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
|
||||
BUG_ON(!ec_ptr);
|
||||
|
||||
@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c)
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
|
||||
}
|
||||
|
||||
static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
|
||||
static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
|
||||
{
|
||||
struct ec_stripe_new *s = h->s;
|
||||
|
||||
lockdep_assert_held(&h->lock);
|
||||
|
||||
BUG_ON(!s->allocated && !s->err);
|
||||
|
||||
h->s = NULL;
|
||||
@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
|
||||
ec_stripe_new_put(c, s, STRIPE_REF_io);
|
||||
}
|
||||
|
||||
static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
|
||||
{
|
||||
h->s->err = err;
|
||||
ec_stripe_new_set_pending(c, h);
|
||||
}
|
||||
|
||||
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
|
||||
{
|
||||
struct ec_stripe_new *s = ob->ec;
|
||||
@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
|
||||
struct bkey_i *k,
|
||||
unsigned nr_data,
|
||||
unsigned nr_parity,
|
||||
unsigned stripe_size)
|
||||
unsigned stripe_size,
|
||||
unsigned disk_label)
|
||||
{
|
||||
struct bkey_i_stripe *s = bkey_stripe_init(k);
|
||||
unsigned u64s;
|
||||
@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
|
||||
s->v.nr_redundant = nr_parity;
|
||||
s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
|
||||
s->v.csum_type = BCH_CSUM_crc32c;
|
||||
s->v.pad = 0;
|
||||
s->v.disk_label = disk_label;
|
||||
|
||||
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
|
||||
BUG_ON(1 << s->v.csum_granularity_bits >=
|
||||
@ -1685,14 +1711,65 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
|
||||
s->nr_parity = h->redundancy;
|
||||
|
||||
ec_stripe_key_init(c, &s->new_stripe.key,
|
||||
s->nr_data, s->nr_parity, h->blocksize);
|
||||
s->nr_data, s->nr_parity,
|
||||
h->blocksize, h->disk_label);
|
||||
|
||||
h->s = s;
|
||||
h->nr_created++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
|
||||
{
|
||||
rcu_read_lock();
|
||||
h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
|
||||
? group_to_target(h->disk_label - 1)
|
||||
: 0);
|
||||
unsigned nr_devs = dev_mask_nr(&h->devs);
|
||||
|
||||
for_each_member_device_rcu(c, ca, &h->devs)
|
||||
if (!ca->mi.durability)
|
||||
__clear_bit(ca->dev_idx, h->devs.d);
|
||||
unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
|
||||
|
||||
h->blocksize = pick_blocksize(c, &h->devs);
|
||||
|
||||
h->nr_active_devs = 0;
|
||||
for_each_member_device_rcu(c, ca, &h->devs)
|
||||
if (ca->mi.bucket_size == h->blocksize)
|
||||
h->nr_active_devs++;
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
* If we only have redundancy + 1 devices, we're better off with just
|
||||
* replication:
|
||||
*/
|
||||
h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
|
||||
|
||||
if (h->insufficient_devs) {
|
||||
const char *err;
|
||||
|
||||
if (nr_devs < h->redundancy + 2)
|
||||
err = NULL;
|
||||
else if (nr_devs_with_durability < h->redundancy + 2)
|
||||
err = "cannot use durability=0 devices";
|
||||
else
|
||||
err = "mismatched bucket sizes";
|
||||
|
||||
if (err)
|
||||
bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
|
||||
h->nr_active_devs, h->redundancy + 2, err);
|
||||
}
|
||||
|
||||
if (h->s && !h->s->allocated)
|
||||
ec_stripe_new_cancel(c, h, -EINTR);
|
||||
|
||||
h->rw_devs_change_count = c->rw_devs_change_count;
|
||||
}
|
||||
|
||||
static struct ec_stripe_head *
|
||||
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
|
||||
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
|
||||
unsigned algo, unsigned redundancy,
|
||||
enum bch_watermark watermark)
|
||||
{
|
||||
@ -1705,34 +1782,11 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
|
||||
mutex_init(&h->lock);
|
||||
BUG_ON(!mutex_trylock(&h->lock));
|
||||
|
||||
h->target = target;
|
||||
h->disk_label = disk_label;
|
||||
h->algo = algo;
|
||||
h->redundancy = redundancy;
|
||||
h->watermark = watermark;
|
||||
|
||||
rcu_read_lock();
|
||||
h->devs = target_rw_devs(c, BCH_DATA_user, target);
|
||||
|
||||
for_each_member_device_rcu(c, ca, &h->devs)
|
||||
if (!ca->mi.durability)
|
||||
__clear_bit(ca->dev_idx, h->devs.d);
|
||||
|
||||
h->blocksize = pick_blocksize(c, &h->devs);
|
||||
|
||||
for_each_member_device_rcu(c, ca, &h->devs)
|
||||
if (ca->mi.bucket_size == h->blocksize)
|
||||
h->nr_active_devs++;
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
* If we only have redundancy + 1 devices, we're better off with just
|
||||
* replication:
|
||||
*/
|
||||
if (h->nr_active_devs < h->redundancy + 2)
|
||||
bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
|
||||
h->nr_active_devs, h->redundancy + 2);
|
||||
|
||||
list_add(&h->list, &c->ec_stripe_head_list);
|
||||
return h;
|
||||
}
|
||||
@ -1743,14 +1797,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
|
||||
h->s->allocated &&
|
||||
bitmap_weight(h->s->blocks_allocated,
|
||||
h->s->nr_data) == h->s->nr_data)
|
||||
ec_stripe_set_pending(c, h);
|
||||
ec_stripe_new_set_pending(c, h);
|
||||
|
||||
mutex_unlock(&h->lock);
|
||||
}
|
||||
|
||||
static struct ec_stripe_head *
|
||||
__bch2_ec_stripe_head_get(struct btree_trans *trans,
|
||||
unsigned target,
|
||||
unsigned disk_label,
|
||||
unsigned algo,
|
||||
unsigned redundancy,
|
||||
enum bch_watermark watermark)
|
||||
@ -1768,27 +1822,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
|
||||
|
||||
if (test_bit(BCH_FS_going_ro, &c->flags)) {
|
||||
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
|
||||
goto found;
|
||||
goto err;
|
||||
}
|
||||
|
||||
list_for_each_entry(h, &c->ec_stripe_head_list, list)
|
||||
if (h->target == target &&
|
||||
if (h->disk_label == disk_label &&
|
||||
h->algo == algo &&
|
||||
h->redundancy == redundancy &&
|
||||
h->watermark == watermark) {
|
||||
ret = bch2_trans_mutex_lock(trans, &h->lock);
|
||||
if (ret)
|
||||
if (ret) {
|
||||
h = ERR_PTR(ret);
|
||||
goto err;
|
||||
}
|
||||
goto found;
|
||||
}
|
||||
|
||||
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
|
||||
h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
|
||||
found:
|
||||
if (!IS_ERR_OR_NULL(h) &&
|
||||
h->nr_active_devs < h->redundancy + 2) {
|
||||
if (h->rw_devs_change_count != c->rw_devs_change_count)
|
||||
ec_stripe_head_devs_update(c, h);
|
||||
|
||||
if (h->insufficient_devs) {
|
||||
mutex_unlock(&h->lock);
|
||||
h = NULL;
|
||||
}
|
||||
err:
|
||||
mutex_unlock(&c->ec_stripe_head_lock);
|
||||
return h;
|
||||
}
|
||||
@ -1796,38 +1855,39 @@ found:
|
||||
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
|
||||
enum bch_watermark watermark, struct closure *cl)
|
||||
{
|
||||
struct ec_stripe_new *s = h->s;
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bch_devs_mask devs = h->devs;
|
||||
struct open_bucket *ob;
|
||||
struct open_buckets buckets;
|
||||
struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
|
||||
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
|
||||
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
|
||||
bool have_cache = true;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
|
||||
BUG_ON(v->nr_redundant != h->s->nr_parity);
|
||||
BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
|
||||
BUG_ON(v->nr_redundant != s->nr_parity);
|
||||
|
||||
/* * We bypass the sector allocator which normally does this: */
|
||||
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
|
||||
|
||||
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
|
||||
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
|
||||
__clear_bit(v->ptrs[i].dev, devs.d);
|
||||
if (i < h->s->nr_data)
|
||||
if (i < s->nr_data)
|
||||
nr_have_data++;
|
||||
else
|
||||
nr_have_parity++;
|
||||
}
|
||||
|
||||
BUG_ON(nr_have_data > h->s->nr_data);
|
||||
BUG_ON(nr_have_parity > h->s->nr_parity);
|
||||
BUG_ON(nr_have_data > s->nr_data);
|
||||
BUG_ON(nr_have_parity > s->nr_parity);
|
||||
|
||||
buckets.nr = 0;
|
||||
if (nr_have_parity < h->s->nr_parity) {
|
||||
if (nr_have_parity < s->nr_parity) {
|
||||
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
|
||||
&h->parity_stripe,
|
||||
&devs,
|
||||
h->s->nr_parity,
|
||||
s->nr_parity,
|
||||
&nr_have_parity,
|
||||
&have_cache, 0,
|
||||
BCH_DATA_parity,
|
||||
@ -1835,14 +1895,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
|
||||
cl);
|
||||
|
||||
open_bucket_for_each(c, &buckets, ob, i) {
|
||||
j = find_next_zero_bit(h->s->blocks_gotten,
|
||||
h->s->nr_data + h->s->nr_parity,
|
||||
h->s->nr_data);
|
||||
BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
|
||||
j = find_next_zero_bit(s->blocks_gotten,
|
||||
s->nr_data + s->nr_parity,
|
||||
s->nr_data);
|
||||
BUG_ON(j >= s->nr_data + s->nr_parity);
|
||||
|
||||
h->s->blocks[j] = buckets.v[i];
|
||||
s->blocks[j] = buckets.v[i];
|
||||
v->ptrs[j] = bch2_ob_ptr(c, ob);
|
||||
__set_bit(j, h->s->blocks_gotten);
|
||||
__set_bit(j, s->blocks_gotten);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
@ -1850,11 +1910,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
|
||||
}
|
||||
|
||||
buckets.nr = 0;
|
||||
if (nr_have_data < h->s->nr_data) {
|
||||
if (nr_have_data < s->nr_data) {
|
||||
ret = bch2_bucket_alloc_set_trans(trans, &buckets,
|
||||
&h->block_stripe,
|
||||
&devs,
|
||||
h->s->nr_data,
|
||||
s->nr_data,
|
||||
&nr_have_data,
|
||||
&have_cache, 0,
|
||||
BCH_DATA_user,
|
||||
@ -1862,13 +1922,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
|
||||
cl);
|
||||
|
||||
open_bucket_for_each(c, &buckets, ob, i) {
|
||||
j = find_next_zero_bit(h->s->blocks_gotten,
|
||||
h->s->nr_data, 0);
|
||||
BUG_ON(j >= h->s->nr_data);
|
||||
j = find_next_zero_bit(s->blocks_gotten,
|
||||
s->nr_data, 0);
|
||||
BUG_ON(j >= s->nr_data);
|
||||
|
||||
h->s->blocks[j] = buckets.v[i];
|
||||
s->blocks[j] = buckets.v[i];
|
||||
v->ptrs[j] = bch2_ob_ptr(c, ob);
|
||||
__set_bit(j, h->s->blocks_gotten);
|
||||
__set_bit(j, s->blocks_gotten);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
@ -1878,7 +1938,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* XXX: doesn't obey target: */
|
||||
static s64 get_existing_stripe(struct bch_fs *c,
|
||||
struct ec_stripe_head *head)
|
||||
{
|
||||
@ -1901,7 +1960,8 @@ static s64 get_existing_stripe(struct bch_fs *c,
|
||||
|
||||
m = genradix_ptr(&c->stripes, stripe_idx);
|
||||
|
||||
if (m->algorithm == head->algo &&
|
||||
if (m->disk_label == head->disk_label &&
|
||||
m->algorithm == head->algo &&
|
||||
m->nr_redundant == head->redundancy &&
|
||||
m->sectors == head->blocksize &&
|
||||
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
|
||||
@ -1914,12 +1974,53 @@ static s64 get_existing_stripe(struct bch_fs *c,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
|
||||
{
|
||||
struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
|
||||
struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
|
||||
unsigned i;
|
||||
|
||||
BUG_ON(existing_v->nr_redundant != s->nr_parity);
|
||||
s->nr_data = existing_v->nr_blocks -
|
||||
existing_v->nr_redundant;
|
||||
|
||||
int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
|
||||
if (ret) {
|
||||
bch2_stripe_close(c, s);
|
||||
return ret;
|
||||
}
|
||||
|
||||
BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
|
||||
|
||||
/*
|
||||
* Free buckets we initially allocated - they might conflict with
|
||||
* blocks from the stripe we're reusing:
|
||||
*/
|
||||
for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
|
||||
bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
|
||||
s->blocks[i] = 0;
|
||||
}
|
||||
memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
|
||||
memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
|
||||
|
||||
for (i = 0; i < existing_v->nr_blocks; i++) {
|
||||
if (stripe_blockcount_get(existing_v, i)) {
|
||||
__set_bit(i, s->blocks_gotten);
|
||||
__set_bit(i, s->blocks_allocated);
|
||||
}
|
||||
|
||||
ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
|
||||
}
|
||||
|
||||
bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
|
||||
s->have_existing_stripe = true;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
|
||||
struct bch_stripe *existing_v;
|
||||
unsigned i;
|
||||
s64 idx;
|
||||
int ret;
|
||||
|
||||
@ -1939,45 +2040,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
|
||||
return ret;
|
||||
}
|
||||
|
||||
existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
|
||||
|
||||
BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
|
||||
h->s->nr_data = existing_v->nr_blocks -
|
||||
existing_v->nr_redundant;
|
||||
|
||||
ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
|
||||
if (ret) {
|
||||
bch2_stripe_close(c, h->s);
|
||||
return ret;
|
||||
}
|
||||
|
||||
BUG_ON(h->s->existing_stripe.size != h->blocksize);
|
||||
BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
|
||||
|
||||
/*
|
||||
* Free buckets we initially allocated - they might conflict with
|
||||
* blocks from the stripe we're reusing:
|
||||
*/
|
||||
for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
|
||||
bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
|
||||
h->s->blocks[i] = 0;
|
||||
}
|
||||
memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
|
||||
memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
|
||||
|
||||
for (i = 0; i < existing_v->nr_blocks; i++) {
|
||||
if (stripe_blockcount_get(existing_v, i)) {
|
||||
__set_bit(i, h->s->blocks_gotten);
|
||||
__set_bit(i, h->s->blocks_allocated);
|
||||
}
|
||||
|
||||
ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
|
||||
}
|
||||
|
||||
bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
|
||||
h->s->have_existing_stripe = true;
|
||||
|
||||
return 0;
|
||||
return init_new_stripe_from_existing(c, h->s);
|
||||
}
|
||||
|
||||
static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
|
||||
@ -2046,9 +2109,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
||||
struct bch_fs *c = trans->c;
|
||||
struct ec_stripe_head *h;
|
||||
bool waiting = false;
|
||||
unsigned disk_label = 0;
|
||||
struct target t = target_decode(target);
|
||||
int ret;
|
||||
|
||||
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
|
||||
if (t.type == TARGET_GROUP) {
|
||||
if (t.group > U8_MAX) {
|
||||
bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
|
||||
return NULL;
|
||||
}
|
||||
disk_label = t.group + 1; /* 0 == no label */
|
||||
}
|
||||
|
||||
h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
|
||||
if (IS_ERR_OR_NULL(h))
|
||||
return h;
|
||||
|
||||
@ -2126,6 +2199,79 @@ err:
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
/* device removal */
|
||||
|
||||
static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
|
||||
{
|
||||
struct bch_alloc_v4 a_convert;
|
||||
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
|
||||
|
||||
if (!a->stripe)
|
||||
return 0;
|
||||
|
||||
if (a->stripe_sectors) {
|
||||
bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
|
||||
return -BCH_ERR_invalidate_stripe_to_dev;
|
||||
}
|
||||
|
||||
struct btree_iter iter;
|
||||
struct bkey_i_stripe *s =
|
||||
bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
|
||||
BTREE_ITER_slots, stripe);
|
||||
int ret = PTR_ERR_OR_ZERO(s);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
struct disk_accounting_pos acc = {
|
||||
.type = BCH_DISK_ACCOUNTING_replicas,
|
||||
};
|
||||
|
||||
s64 sectors = 0;
|
||||
for (unsigned i = 0; i < s->v.nr_blocks; i++)
|
||||
sectors -= stripe_blockcount_get(&s->v, i);
|
||||
|
||||
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
|
||||
acc.replicas.data_type = BCH_DATA_user;
|
||||
ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
|
||||
bkey_for_each_ptr(ptrs, ptr)
|
||||
if (ptr->dev == k_a.k->p.inode) {
|
||||
if (stripe_blockcount_get(&s->v, ptr - &ptrs.start->ptr)) {
|
||||
bch_err(trans->c, "trying to invalidate device in stripe when stripe block not empty");
|
||||
ret = -BCH_ERR_invalidate_stripe_to_dev;
|
||||
goto err;
|
||||
}
|
||||
ptr->dev = BCH_SB_MEMBER_INVALID;
|
||||
}
|
||||
|
||||
sectors = -sectors;
|
||||
|
||||
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
|
||||
acc.replicas.data_type = BCH_DATA_user;
|
||||
ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false);
|
||||
if (ret)
|
||||
goto err;
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_dev_remove_stripes(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return bch2_trans_run(c,
|
||||
for_each_btree_key_upto_commit(trans, iter,
|
||||
BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX),
|
||||
BTREE_ITER_intent, k,
|
||||
NULL, NULL, 0, ({
|
||||
bch2_invalidate_stripe_to_dev(trans, k);
|
||||
})));
|
||||
}
|
||||
|
||||
/* startup/shutdown */
|
||||
|
||||
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct ec_stripe_head *h;
|
||||
@ -2151,8 +2297,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
|
||||
}
|
||||
goto unlock;
|
||||
found:
|
||||
h->s->err = -BCH_ERR_erofs_no_writes;
|
||||
ec_stripe_set_pending(c, h);
|
||||
ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
|
||||
unlock:
|
||||
mutex_unlock(&h->lock);
|
||||
}
|
||||
@ -2193,21 +2338,13 @@ int bch2_stripes_read(struct bch_fs *c)
|
||||
if (k.k->type != KEY_TYPE_stripe)
|
||||
continue;
|
||||
|
||||
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
|
||||
ret = __ec_stripe_mem_alloc(c, k.k->p.offset);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
|
||||
|
||||
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
|
||||
m->sectors = le16_to_cpu(s->sectors);
|
||||
m->algorithm = s->algorithm;
|
||||
m->nr_blocks = s->nr_blocks;
|
||||
m->nr_redundant = s->nr_redundant;
|
||||
m->blocks_nonempty = 0;
|
||||
|
||||
for (unsigned i = 0; i < s->nr_blocks; i++)
|
||||
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
|
||||
stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
|
||||
|
||||
bch2_stripes_heap_insert(c, m, k.k->p.offset);
|
||||
0;
|
||||
@ -2252,6 +2389,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
|
||||
prt_printf(out, " %u", s->blocks[i]);
|
||||
prt_newline(out);
|
||||
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
|
||||
prt_newline(out);
|
||||
}
|
||||
|
||||
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
|
||||
@ -2261,9 +2400,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
|
||||
|
||||
mutex_lock(&c->ec_stripe_head_lock);
|
||||
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
|
||||
prt_printf(out, "target %u algo %u redundancy %u %s:\n",
|
||||
h->target, h->algo, h->redundancy,
|
||||
bch2_watermarks[h->watermark]);
|
||||
prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
|
||||
h->disk_label, h->algo, h->redundancy,
|
||||
bch2_watermarks[h->watermark],
|
||||
h->nr_created);
|
||||
|
||||
if (h->s)
|
||||
bch2_new_stripe_to_text(out, c, h->s);
|
||||
|
@ -97,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe
|
||||
const struct bch_extent_ptr *data_ptr,
|
||||
unsigned sectors)
|
||||
{
|
||||
return data_ptr->dev == stripe_ptr->dev &&
|
||||
return (data_ptr->dev == stripe_ptr->dev ||
|
||||
data_ptr->dev == BCH_SB_MEMBER_INVALID ||
|
||||
stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
|
||||
data_ptr->gen == stripe_ptr->gen &&
|
||||
data_ptr->offset >= stripe_ptr->offset &&
|
||||
data_ptr->offset < stripe_ptr->offset + sectors;
|
||||
@ -186,10 +188,15 @@ struct ec_stripe_head {
|
||||
struct list_head list;
|
||||
struct mutex lock;
|
||||
|
||||
unsigned target;
|
||||
unsigned disk_label;
|
||||
unsigned algo;
|
||||
unsigned redundancy;
|
||||
enum bch_watermark watermark;
|
||||
bool insufficient_devs;
|
||||
|
||||
unsigned long rw_devs_change_count;
|
||||
|
||||
u64 nr_created;
|
||||
|
||||
struct bch_devs_mask devs;
|
||||
unsigned nr_active_devs;
|
||||
@ -202,7 +209,7 @@ struct ec_stripe_head {
|
||||
struct ec_stripe_new *s;
|
||||
};
|
||||
|
||||
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
|
||||
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
|
||||
|
||||
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
|
||||
|
||||
@ -247,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_dev_remove_stripes(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_fs_ec_stop(struct bch_fs *);
|
||||
void bch2_fs_ec_flush(struct bch_fs *);
|
||||
|
@ -11,7 +11,14 @@ struct bch_stripe {

	__u8			csum_granularity_bits;
	__u8			csum_type;
	__u8			pad;

	/*
	 * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
	 *
	 * we can manage with this because this only needs to point to a
	 * disk label, not a target:
	 */
	__u8			disk_label;

	struct bch_extent_ptr	ptrs[];
} __packed __aligned(8);
|
||||
|
@ -16,6 +16,7 @@ struct stripe {
	u8			nr_blocks;
	u8			nr_redundant;
	u8			blocks_nonempty;
	u8			disk_label;
};

struct gc_stripe {
|
||||
|
@ -119,8 +119,8 @@
	x(EEXIST, EEXIST_str_hash_set) \
	x(EEXIST, EEXIST_discard_in_flight_add) \
	x(EEXIST, EEXIST_subvolume_create) \
	x(0, open_buckets_empty) \
	x(0, freelist_empty) \
	x(ENOSPC, open_buckets_empty) \
	x(ENOSPC, freelist_empty) \
	x(BCH_ERR_freelist_empty, no_buckets_found) \
	x(0, transaction_restart) \
	x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
@ -244,6 +244,16 @@
	x(EIO, btree_node_read_error) \
	x(EIO, btree_node_read_validate_error) \
	x(EIO, btree_need_topology_repair) \
	x(EIO, bucket_ref_update) \
	x(EIO, trigger_pointer) \
	x(EIO, trigger_stripe_pointer) \
	x(EIO, metadata_bucket_inconsistency) \
	x(EIO, mark_stripe) \
	x(EIO, stripe_reconstruct) \
	x(EIO, key_type_error) \
	x(EIO, no_device_to_read_from) \
	x(EIO, missing_indirect_extent) \
	x(EIO, invalidate_stripe_to_dev) \
	x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
	x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
	x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@ -257,7 +267,6 @@
	x(BCH_ERR_nopromote, nopromote_in_flight) \
	x(BCH_ERR_nopromote, nopromote_no_writes) \
	x(BCH_ERR_nopromote, nopromote_enomem) \
	x(0, need_inode_lock) \
	x(0, invalid_snapshot_node) \
	x(0, option_needs_open_fs)
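In this table each x(class, name) line pairs a private error code with the class it resolves to once it leaves the filesystem (callers see the class, for example via bch2_err_class() as used later in this diff), and a class can itself be another private code, as with BCH_ERR_freelist_empty above; note the hunk also moves open_buckets_empty and freelist_empty from class 0 to ENOSPC. Below is a small standalone model of the x-macro pattern, with invented names and numbers rather than the real bcachefs definitions, flattened to a single level of classes:

#include <errno.h>
#include <stdio.h>

/* Toy list in the shape of the table above: x(parent class, private name). */
#define TOY_ERRCODES()				\
	x(EIO,		trigger_pointer)	\
	x(EIO,		mark_stripe)		\
	x(ENOSPC,	open_buckets_empty)

enum toy_errcode {
	TOY_ERR_START = 2048,
#define x(class, name)	TOY_ERR_##name,
	TOY_ERRCODES()
#undef x
	TOY_ERR_MAX
};

static const int toy_err_class[] = {
#define x(class, name)	[TOY_ERR_##name - TOY_ERR_START - 1] = class,
	TOY_ERRCODES()
#undef x
};

/* Collapse a private code to the errno-style class a caller expects. */
static int toy_err_class_of(int err)
{
	return err > TOY_ERR_START && err < TOY_ERR_MAX
		? toy_err_class[err - TOY_ERR_START - 1]
		: err;
}

int main(void)
{
	printf("trigger_pointer    -> %d (EIO=%d)\n",
	       toy_err_class_of(TOY_ERR_trigger_pointer), EIO);
	printf("open_buckets_empty -> %d (ENOSPC=%d)\n",
	       toy_err_class_of(TOY_ERR_open_buckets_empty), ENOSPC);
	return 0;
}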
|
||||
|
||||
|
@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
|
||||
int ret = 0;
|
||||
|
||||
if (k.k->type == KEY_TYPE_error)
|
||||
return -EIO;
|
||||
return -BCH_ERR_key_type_error;
|
||||
|
||||
rcu_read_lock();
|
||||
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
||||
@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
|
||||
* read:
|
||||
*/
|
||||
if (!ret && !p.ptr.cached)
|
||||
ret = -EIO;
|
||||
ret = -BCH_ERR_no_device_to_read_from;
|
||||
|
||||
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
|
||||
|
||||
@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
|
||||
? f->idx
|
||||
: f->idx + 1;
|
||||
|
||||
if (!p.idx && !ca)
|
||||
if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
|
||||
p.idx++;
|
||||
|
||||
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
|
||||
p.idx++;
|
||||
|
||||
if (!p.idx && !bch2_dev_is_readable(ca))
|
||||
p.idx++;
|
||||
|
||||
if (p.idx >= (unsigned) p.has_ec + 1)
|
||||
if (p.idx > (unsigned) p.has_ec)
|
||||
continue;
|
||||
|
||||
if (ret > 0 && !ptr_better(c, p, *pick))
|
||||
@ -781,14 +778,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
|
||||
/*
|
||||
* Returns pointer to the next entry after the one being dropped:
|
||||
*/
|
||||
union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
|
||||
struct bch_extent_ptr *ptr)
|
||||
void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
|
||||
{
|
||||
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
|
||||
union bch_extent_entry *entry = to_entry(ptr), *next;
|
||||
union bch_extent_entry *ret = entry;
|
||||
bool drop_crc = true;
|
||||
|
||||
if (k.k->type == KEY_TYPE_stripe) {
|
||||
ptr->dev = BCH_SB_MEMBER_INVALID;
|
||||
return;
|
||||
}
|
||||
|
||||
EBUG_ON(ptr < &ptrs.start->ptr ||
|
||||
ptr >= &ptrs.end->ptr);
|
||||
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
|
||||
@ -811,20 +811,27 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
|
||||
break;
|
||||
|
||||
if ((extent_entry_is_crc(entry) && drop_crc) ||
|
||||
extent_entry_is_stripe_ptr(entry)) {
|
||||
ret = (void *) ret - extent_entry_bytes(entry);
|
||||
extent_entry_is_stripe_ptr(entry))
|
||||
extent_entry_drop(k, entry);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
|
||||
{
|
||||
if (k.k->type != KEY_TYPE_stripe) {
|
||||
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
|
||||
const union bch_extent_entry *entry;
|
||||
struct extent_ptr_decoded p;
|
||||
|
||||
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
||||
if (p.ptr.dev == ptr->dev && p.has_ec) {
|
||||
ptr->dev = BCH_SB_MEMBER_INVALID;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
|
||||
struct bch_extent_ptr *ptr)
|
||||
{
|
||||
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
|
||||
union bch_extent_entry *ret =
|
||||
|
||||
bch2_bkey_drop_ptr_noerror(k, ptr);
|
||||
|
||||
/*
|
||||
@ -837,14 +844,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
|
||||
!bch2_bkey_dirty_devs(k.s_c).nr) {
|
||||
k.k->type = KEY_TYPE_error;
|
||||
set_bkey_val_u64s(k.k, 0);
|
||||
ret = NULL;
|
||||
} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
|
||||
k.k->type = KEY_TYPE_deleted;
|
||||
set_bkey_val_u64s(k.k, 0);
|
||||
ret = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
|
||||
@ -854,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
|
||||
|
||||
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
|
||||
{
|
||||
struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
|
||||
|
||||
if (ptr)
|
||||
bch2_bkey_drop_ptr_noerror(k, ptr);
|
||||
bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
|
||||
}
|
||||
|
||||
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
|
||||
@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
		bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
			if (p1.ptr.dev == p2.ptr.dev &&
			    p1.ptr.gen == p2.ptr.gen &&

			    /*
			     * This checks that the two pointers point
			     * to the same region on disk - adjusting
			     * for the difference in where the extents
			     * start, since one may have been trimmed:
			     */
			    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
			    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
			    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&

			    /*
			     * This additionally checks that the
			     * extents overlap on disk, since the
			     * previous check may trigger spuriously
			     * when one extent is immediately partially
			     * overwritten with another extent (so that
			     * on disk they are adjacent) and
			     * compression is in use:
			     */
			    ((p1.ptr.offset >= p2.ptr.offset &&
			      p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
			     (p2.ptr.offset >= p1.ptr.offset &&
			      p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
				return true;

	return false;
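The two comments above describe a two-part test: the start-adjusted offsets must match, and the extents must actually overlap on disk. A small standalone sketch follows, using a made-up struct and numbers rather than real bcachefs types, showing the adjacent-compressed-extents case where the first condition holds but the new overlap condition correctly rejects the match:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for the few decoded fields the check above reads. */
struct toy_extent {
	int64_t	key_start;	/* bkey_start_offset(k.k)  */
	int64_t	ptr_offset;	/* p.ptr.offset            */
	int64_t	crc_offset;	/* p.crc.offset            */
	int64_t	csize;		/* p.crc.compressed_size   */
};

static bool same_adjusted_start(struct toy_extent a, struct toy_extent b)
{
	return a.ptr_offset + a.crc_offset - a.key_start ==
	       b.ptr_offset + b.crc_offset - b.key_start;
}

static bool overlap_on_disk(struct toy_extent a, struct toy_extent b)
{
	return (a.ptr_offset >= b.ptr_offset && a.ptr_offset < b.ptr_offset + b.csize) ||
	       (b.ptr_offset >= a.ptr_offset && b.ptr_offset < a.ptr_offset + a.csize);
}

int main(void)
{
	/*
	 * Extent a: logical sectors [0,16) compressed to 8 sectors at disk
	 * offset 100.  Extent b partially overwrites it, covering [8,16),
	 * and happens to be allocated right after a on disk, at offset 108.
	 * The adjusted starts agree (100 + 0 - 0 == 108 + 0 - 8), but the
	 * two do not overlap on disk, so they are not the same data.
	 */
	struct toy_extent a = { .key_start = 0, .ptr_offset = 100, .crc_offset = 0, .csize = 8 };
	struct toy_extent b = { .key_start = 8, .ptr_offset = 108, .crc_offset = 0, .csize = 4 };

	printf("adjusted starts match: %d\n", same_adjusted_start(a, b));
	printf("overlap on disk:       %d\n", overlap_on_disk(a, b));
	return 0;
}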
|
||||
|
@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d
|
||||
unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
|
||||
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
|
||||
|
||||
void bch2_bkey_drop_device(struct bkey_s, unsigned);
|
||||
void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
|
||||
|
||||
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
|
||||
|
||||
static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
|
||||
@ -649,25 +646,37 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr

void bch2_extent_ptr_decoded_append(struct bkey_i *,
				    struct extent_ptr_decoded *);
union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
						   struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
					   struct bch_extent_ptr *);
void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);

void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
void bch2_bkey_drop_device(struct bkey_s, unsigned);

#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond)			\
do {									\
	__label__ _again;						\
	struct bkey_ptrs _ptrs;						\
_again:									\
	_ptrs = bch2_bkey_ptrs(_k);					\
									\
	bkey_for_each_ptr(_ptrs, _ptr)					\
		if (_cond) {						\
			bch2_bkey_drop_ptr_noerror(_k, _ptr);		\
			goto _again;					\
		}							\
} while (0)

#define bch2_bkey_drop_ptrs(_k, _ptr, _cond)				\
do {									\
	struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k);			\
									\
	struct bch_extent_ptr *_ptr = &_ptrs.start->ptr;		\
									\
	while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) {			\
		if (_cond) {						\
			_ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr);	\
	__label__ _again;						\
	struct bkey_ptrs _ptrs;						\
_again:									\
	_ptrs = bch2_bkey_ptrs(_k);					\
			continue;					\
		}							\
									\
		(_ptr)++;						\
	bkey_for_each_ptr(_ptrs, _ptr)					\
		if (_cond) {						\
			bch2_bkey_drop_ptr(_k, _ptr);			\
			goto _again;					\
		}							\
} while (0)
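Both macros rescan the key from the top after each drop, since removing an entry shifts the pointers that follow it; the plain variant may also downgrade the key to KEY_TYPE_error or KEY_TYPE_deleted when the last pointer goes away (see the extents.c hunk above), while the noerror variant never does. A minimal usage sketch in the style of bch2_bkey_drop_device_noerror() above, with an invented helper name and assuming the surrounding bcachefs headers:

#include "extents.h"

/*
 * Sketch only: drop every cached pointer from a key.  The second macro
 * argument names the iteration variable used inside the condition, and
 * the condition is re-evaluated for each remaining pointer.
 */
static inline void drop_cached_ptrs_example(struct bkey_s k)
{
	bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->cached);
}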
|
||||
|
||||
|
@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans,
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
|
||||
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
|
||||
BTREE_ITER_intent|BTREE_ITER_with_updates);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans,
|
||||
name,
|
||||
dir_target,
|
||||
&dir_offset,
|
||||
STR_HASH_must_create);
|
||||
STR_HASH_must_create|BTREE_ITER_with_updates);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
|
@ -791,8 +791,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
|
||||
static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
struct address_space *mapping,
|
||||
struct iov_iter *iter,
|
||||
loff_t pos, unsigned len,
|
||||
bool inode_locked)
|
||||
loff_t pos, unsigned len)
|
||||
{
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct bch2_folio_reservation res;
|
||||
@ -816,15 +815,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
|
||||
BUG_ON(!fs.nr);
|
||||
|
||||
/*
|
||||
* If we're not using the inode lock, we need to lock all the folios for
|
||||
* atomiticity of writes vs. other writes:
|
||||
*/
|
||||
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
|
||||
ret = -BCH_ERR_need_inode_lock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
f = darray_first(fs);
|
||||
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
|
||||
ret = bch2_read_single_folio(f, mapping);
|
||||
@ -921,10 +911,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
end = pos + copied;
|
||||
|
||||
spin_lock(&inode->v.i_lock);
|
||||
if (end > inode->v.i_size) {
|
||||
BUG_ON(!inode_locked);
|
||||
if (end > inode->v.i_size)
|
||||
i_size_write(&inode->v, end);
|
||||
}
|
||||
spin_unlock(&inode->v.i_lock);
|
||||
|
||||
f_pos = pos;
|
||||
@ -968,68 +956,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
loff_t pos;
|
||||
bool inode_locked = false;
|
||||
ssize_t written = 0, written2 = 0, ret = 0;
|
||||
|
||||
/*
|
||||
* We don't take the inode lock unless i_size will be changing. Folio
|
||||
* locks provide exclusion with other writes, and the pagecache add lock
|
||||
* provides exclusion with truncate and hole punching.
|
||||
*
|
||||
* There is one nasty corner case where atomicity would be broken
|
||||
* without great care: when copying data from userspace to the page
|
||||
* cache, we do that with faults disable - a page fault would recurse
|
||||
* back into the filesystem, taking filesystem locks again, and
|
||||
* deadlock; so it's done with faults disabled, and we fault in the user
|
||||
* buffer when we aren't holding locks.
|
||||
*
|
||||
* If we do part of the write, but we then race and in the userspace
|
||||
* buffer have been evicted and are no longer resident, then we have to
|
||||
* drop our folio locks to re-fault them in, breaking write atomicity.
*
* To fix this, we restart the write from the start, if we weren't
* holding the inode lock.
*
* There is another wrinkle after that; if we restart the write from the
* start, and then get an unrecoverable error, we _cannot_ claim to
* userspace that we did not write data we actually did - so we must
* track (written2) the most we ever wrote.
*/

if ((iocb->ki_flags & IOCB_APPEND) ||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
inode_lock(&inode->v);
inode_locked = true;
}

ret = generic_write_checks(iocb, iter);
if (ret <= 0)
goto unlock;

ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
if (ret) {
if (!inode_locked) {
inode_lock(&inode->v);
inode_locked = true;
ret = file_remove_privs_flags(file, 0);
}
if (ret)
goto unlock;
}

ret = file_update_time(file);
if (ret)
goto unlock;

pos = iocb->ki_pos;
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
int ret = 0;

bch2_pagecache_add_get(inode);

if (!inode_locked &&
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
goto get_inode_lock;

do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@ -1054,17 +986,12 @@ again:
}
}

if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
goto get_inode_lock;

if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}

ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
if (ret == -BCH_ERR_need_inode_lock)
goto get_inode_lock;
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
if (unlikely(ret < 0))
break;

@ -1085,46 +1012,50 @@ again:
}
pos += ret;
written += ret;
written2 = max(written, written2);

if (ret != bytes && !inode_locked)
goto get_inode_lock;
ret = 0;

balance_dirty_pages_ratelimited(mapping);

if (0) {
get_inode_lock:
bch2_pagecache_add_put(inode);
inode_lock(&inode->v);
inode_locked = true;
bch2_pagecache_add_get(inode);

iov_iter_revert(iter, written);
pos -= written;
written = 0;
ret = 0;
}
} while (iov_iter_count(iter));

bch2_pagecache_add_put(inode);

return written ? written : ret;
}

ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
ssize_t ret;

if (iocb->ki_flags & IOCB_DIRECT) {
ret = bch2_direct_write(iocb, from);
goto out;
}

inode_lock(&inode->v);

ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto unlock;

ret = file_remove_privs(file);
if (ret)
goto unlock;

ret = file_update_time(file);
if (ret)
goto unlock;

ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
unlock:
if (inode_locked)
inode_unlock(&inode->v);

iocb->ki_pos += written;

ret = max(written, written2) ?: ret;
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
}

ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, iter)
: bch2_buffered_write(iocb, iter);

out:
return bch2_err_class(ret);
}

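Not part of the diff above — the restart logic the comment describes is easier to see in isolation. A minimal, self-contained userspace sketch (made-up names, plain memcpy standing in for the real folio copy) of the same pattern: copy without the lock, rewind and retry under the lock on a short copy, and keep written2 as the most ever written so an eventual error never under-reports.

/*
 * Sketch only: not bcachefs code, just the written/written2 bookkeeping.
 */
#include <string.h>
#include <sys/types.h>

struct fake_lock { int held; };

/* pretend page-cache copy that comes up short when the lock isn't held */
static size_t copy_chunk(char *dst, const char *src, size_t len, int locked)
{
	size_t n = locked ? len : len / 2;

	memcpy(dst, src, n);
	return n;
}

static ssize_t buffered_write_sketch(struct fake_lock *lock, char *dst,
				     const char *src, size_t len)
{
	size_t pos = 0, written = 0, written2 = 0;
	int locked = 0;

	while (pos < len) {
		size_t want = len - pos;
		size_t n = copy_chunk(dst + pos, src + pos, want, locked);

		pos += n;
		written += n;
		if (written > written2)
			written2 = written;

		if (n < want && !locked) {
			/* short copy without the lock: rewind, retry locked */
			pos = 0;
			written = 0;
			lock->held = 1;
			locked = 1;
		}
	}
	if (locked)
		lock->held = 0;

	/* never report less than the most we ever wrote */
	return written > written2 ? written : written2;
}

int main(void)
{
	struct fake_lock l = { 0 };
	char src[64] = "hello", dst[64] = "";

	return buffered_write_sketch(&l, dst, src, sizeof(src)) == sizeof(src) ? 0 : 1;
}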
122
libbcachefs/fs.c
@ -273,14 +273,6 @@ retry:
}
}

#define memalloc_flags_do(_flags, _do) \
({ \
unsigned _saved_flags = memalloc_flags_save(_flags); \
typeof(_do) _ret = _do; \
memalloc_noreclaim_restore(_saved_flags); \
_ret; \
})

static struct inode *bch2_alloc_inode(struct super_block *sb)
{
BUG();
@ -380,6 +372,8 @@ __bch2_create(struct mnt_idmap *idmap,
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
kuid_t kuid;
kgid_t kgid;
int ret;

/*
@ -406,13 +400,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);

kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
from_kuid(i_user_ns(&dir->v), current_fsuid()),
from_kgid(i_user_ns(&dir->v), current_fsgid()),
from_kuid(i_user_ns(&dir->v), kuid),
from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@ -727,15 +723,16 @@ static int bch2_rename2(struct mnt_idmap *idmap,
struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
struct bch_inode_unpacked src_inode_u, dst_inode_u;
struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;

if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;

if (mode == BCH_RENAME_OVERWRITE) {
@ -776,18 +773,48 @@ static int bch2_rename2(struct mnt_idmap *idmap,
if (ret)
goto err;
}
retry:
bch2_trans_begin(trans);

ret = commit_do(trans, NULL, NULL, 0,
bch2_rename_trans(trans,
ret = bch2_rename_trans(trans,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
&dst_dentry->d_name,
mode));
mode);
if (unlikely(ret))
goto err_tx_restart;

if (whiteout) {
whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
if (unlikely(ret))
goto err_tx_restart;
bch2_inode_init_early(c, whiteout_inode_u);

ret = bch2_create_trans(trans,
inode_inum(src_dir), &src_dir_u,
whiteout_inode_u,
&src_dentry->d_name,
from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
S_IFCHR|WHITEOUT_MODE, 0,
NULL, NULL, (subvol_inum) { 0 }, 0) ?:
bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
goto err_tx_restart;
}

ret = bch2_trans_commit(trans, NULL, NULL, 0);
if (unlikely(ret)) {
err_tx_restart:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
goto err;
}

BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
BUG_ON(dst_inode &&
@ -835,11 +862,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
kuid_t kuid;
kgid_t kgid;

if (ia_valid & ATTR_UID)
bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
if (ia_valid & ATTR_GID)
bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
if (ia_valid & ATTR_UID) {
kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
}
if (ia_valid & ATTR_GID) {
kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
}

if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@ -854,11 +887,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
? attr->ia_gid
? kgid
: inode->v.i_gid;

if (!in_group_p(gid) &&
!capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
if (!in_group_or_capable(idmap, &inode->v,
make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
@ -874,17 +907,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
kuid_t kuid;
kgid_t kgid;
int ret;

mutex_lock(&inode->ei_update_lock);

qid = inode->ei_qid;

if (attr->ia_valid & ATTR_UID)
qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
if (attr->ia_valid & ATTR_UID) {
kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
}

if (attr->ia_valid & ATTR_GID)
qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
if (attr->ia_valid & ATTR_GID) {
kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
}

ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@ -940,13 +979,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);

stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
stat->uid = inode->v.i_uid;
stat->gid = inode->v.i_gid;
stat->uid = vfsuid_into_kuid(vfsuid);
stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@ -1865,30 +1906,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
enum bch_opt_id i;
struct printbuf buf = PRINTBUF;
int ret = 0;

for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);

if ((opt->flags & OPT_HIDDEN) ||
!(opt->flags & OPT_MOUNT))
continue;

if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;

printbuf_reset(&buf);
bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
OPT_SHOW_MOUNT_STYLE);
seq_putc(seq, ',');
bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
seq_puts(seq, buf.buf);
}

if (buf.allocation_failure)
ret = -ENOMEM;
int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
return ret;
}
@ -2209,7 +2233,7 @@ static struct file_system_type bcache_fs_type = {
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};

MODULE_ALIAS_FS("bcachefs");
@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
orig_k->k->k.size,
reflink_offset);
bch2_inconsistent_error(trans->c);
ret = -EIO;
ret = -BCH_ERR_missing_indirect_extent;
goto err;
}

@ -869,9 +869,15 @@ retry_pick:
goto hole;

if (pick_ret < 0) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);

bch_err_inum_offset_ratelimited(c,
read_pos.inode, read_pos.offset << 9,
"no device to read from");
"no device to read from: %s\n %s",
bch2_err_str(pick_ret),
buf.buf);
printbuf_exit(&buf);
goto err;
}

@ -1086,7 +1092,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio)) {
if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@ -1447,9 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
BCH_WRITE_ONLY_SPECIFIED_DEVS))
? NULL : &op->cl, &wp));
&op->cl, &wp));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));

if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
op->flags |= BCH_WRITE_ALLOC_NOWAIT;

op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_devs = 0,
.e.nr_required = 1,
};

@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
goto err;

darray_for_each(i->ptrs, ptr)
replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
replicas_entry_add_dev(&replicas.e, ptr->dev);

bch2_replicas_entry_sort(&replicas.e);

@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;

if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;

min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));

@ -432,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out,
else
prt_str(out, opt->choices[v]);
break;
case BCH_OPT_BITFIELD:
prt_bitflags(out, opt->choices, v);
break;
case BCH_OPT_FN:
opt->fn.to_text(out, c, sb, v);
break;
@ -440,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out,
}
}

void bch2_opts_to_text(struct printbuf *out,
struct bch_opts opts,
struct bch_fs *c, struct bch_sb *sb,
unsigned show_mask, unsigned hide_mask,
unsigned flags)
{
bool first = true;

for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];

if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
continue;

u64 v = bch2_opt_get_by_id(&opts, i);
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;

if (!first)
prt_char(out, ',');
first = false;

bch2_opt_to_text(out, c, sb, opt, v, flags);
}
}

int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
{
int ret = 0;
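Not part of the diff above — the new bch2_opts_to_text() centralizes "print only non-default options, filtered by show/hide masks, comma-separated", which bch2_show_options() now calls. A standalone sketch of that behaviour with a made-up option table (toy names and values, not the bcachefs structures):

#include <stdio.h>
#include <stdint.h>

/* toy option table modelling the show/hide mask filtering */
#define OPT_MOUNT  (1 << 0)
#define OPT_HIDDEN (1 << 1)

struct opt { const char *name; unsigned flags; uint64_t val, def; };

static void opts_to_text(const struct opt *opts, size_t nr,
			 unsigned show_mask, unsigned hide_mask)
{
	int first = 1;

	for (size_t i = 0; i < nr; i++) {
		const struct opt *o = &opts[i];

		if ((o->flags & hide_mask) || !(o->flags & show_mask))
			continue;
		if (o->val == o->def)	/* only print non-default options */
			continue;

		printf("%s%s=%llu", first ? "" : ",", o->name,
		       (unsigned long long) o->val);
		first = 0;
	}
	putchar('\n');
}

int main(void)
{
	struct opt opts[] = {
		{ "metadata_replicas", OPT_MOUNT,  2, 1 },
		{ "internal_knob",     OPT_HIDDEN, 7, 0 },
		{ "compression",       OPT_MOUNT,  0, 0 },
	};

	opts_to_text(opts, 3, OPT_MOUNT, OPT_HIDDEN);	/* -> metadata_replicas=2 */
	return 0;
}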
@ -373,6 +373,16 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
x(recovery_passes, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
BCH2_NO_SB_OPT, 0, \
NULL, "Recovery passes to run explicitly") \
x(recovery_passes_exclude, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
BCH2_NO_SB_OPT, 0, \
NULL, "Recovery passes to exclude") \
x(recovery_pass_last, u8, \
OPT_FS|OPT_MOUNT, \
OPT_STR_NOLIMIT(bch2_recovery_passes), \
@ -595,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *,

void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
void bch2_opts_to_text(struct printbuf *,
struct bch_opts,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);

int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
@ -219,9 +219,9 @@ static noinline void __process_finished_items(struct rcu_pending *pending,
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);

void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
kvfree(ptr);

bool free_head = ((unsigned long) obj->func) & 1UL;

kvfree(ptr);
if (free_head)
kfree(obj);
}
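Not part of the diff above — the hunk reads a flag out of the low bit of the stored function pointer and only then frees, so the containing head can be released separately when that bit is set. A hedged standalone illustration of the low-bit pointer-tagging trick (toy helpers, not the rcu_pending API):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* stash a one-bit flag in the low bit of an aligned pointer */
static void *tag_ptr(void *p, int flag)
{
	assert(((uintptr_t) p & 1) == 0);	/* needs >= 2-byte alignment */
	return (void *) ((uintptr_t) p | (flag & 1));
}

static void *untag_ptr(void *p) { return (void *) ((uintptr_t) p & ~(uintptr_t) 1); }
static int ptr_flag(void *p)    { return (uintptr_t) p & 1; }

int main(void)
{
	int *obj = malloc(sizeof(*obj));
	void *tagged = tag_ptr(obj, 1);

	/* read the flag before freeing the real pointer, never after */
	int flag = ptr_flag(tagged);

	free(untag_ptr(tagged));
	return flag == 1 ? 0 : 1;
}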
@ -13,6 +13,7 @@
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;

if (!data_opts->rewrite_ptrs) {
/*
@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,

data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}

@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));


bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s l=%u: %s",
bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
if (btree_id_is_alloc(i)) {
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
r->error = 0;
} else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
} else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
bch_info(c, "will run btree node scan");
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
}

ret = 0;
@ -706,14 +706,14 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;

c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);

if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)

set_bit(BCH_FS_may_go_rw, &c->flags);

if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
return bch2_fs_read_write_early(c);
return 0;
}
@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v)
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass)
{
if (c->recovery_passes_explicit & BIT_ULL(pass))
if (c->opts.recovery_passes & BIT_ULL(pass))
return 0;

bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
bch2_recovery_passes[pass], pass,
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);

c->recovery_passes_explicit |= BIT_ULL(pass);
c->opts.recovery_passes |= BIT_ULL(pass);

if (c->curr_recovery_pass >= pass) {
c->curr_recovery_pass = pass;
@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
{
struct recovery_pass_fn *p = recovery_pass_fns + pass;

if (c->recovery_passes_explicit & BIT_ULL(pass))
if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
return false;
if (c->opts.recovery_passes & BIT_ULL(pass))
return true;
if ((p->when & PASS_FSCK) && c->opts.fsck)
return true;
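Not part of the diff above — with recovery_passes and recovery_passes_exclude both plain bitmask options, the decision in should_run_recovery_pass() reduces to "exclude wins, then explicit include, then the pass's own defaults". A toy model of that ordering (made-up pass names, the fsck flag standing in for the PASS_FSCK case):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum pass { PASS_SCAN = 0, PASS_TOPOLOGY = 1, PASS_ALLOC = 2 };

static bool should_run(uint64_t include, uint64_t exclude, enum pass p, bool fsck)
{
	if (exclude & (1ULL << p))	/* explicit exclude always wins */
		return false;
	if (include & (1ULL << p))	/* explicitly requested pass */
		return true;
	return fsck;			/* fall back to the pass's own rules */
}

int main(void)
{
	uint64_t include = 1ULL << PASS_TOPOLOGY;
	uint64_t exclude = 1ULL << PASS_ALLOC;

	printf("%d %d %d\n",
	       should_run(include, exclude, PASS_TOPOLOGY, false),	/* 1 */
	       should_run(include, exclude, PASS_ALLOC, true),		/* 0 */
	       should_run(include, exclude, PASS_SCAN, true));		/* 1 */
	return 0;
}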
@ -82,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}

for (unsigned i = 0; i < r->nr_devs; i++)
if (!bch2_member_exists(sb, r->devs[i])) {
if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
!bch2_member_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@ -122,7 +123,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;

if (!p.has_ec)
r->devs[r->nr_devs++] = p.ptr.dev;
replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@ -139,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
r->devs[r->nr_devs++] = ptr->dev;
replicas_entry_add_dev(r, ptr->dev);
}

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@ -180,7 +181,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;

darray_for_each(devs, i)
e->devs[e->nr_devs++] = *i;
replicas_entry_add_dev(e, *i);

bch2_replicas_entry_sort(e);
}
@ -795,11 +796,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
nr_online += test_bit(e->devs[i], devs.d);

struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
rcu_read_unlock();

if (nr_failed == e->nr_devs)
if (nr_online + nr_failed == e->nr_devs)
continue;

if (nr_online < e->nr_required)
@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
__u8 devs[];
__u8 devs[] __counted_by(nr_devs);
} __packed;

struct bch_sb_field_replicas_v0 {
@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
__u8 devs[];
__u8 devs[] __counted_by(nr_devs);
} __packed;

struct bch_sb_field_replicas {
@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)

#define replicas_entry_add_dev(e, d) ({ \
(e)->nr_devs++; \
(e)->devs[(e)->nr_devs - 1] = (d); \
})

#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
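Not part of the diff above — replicas_entry_add_dev() bumps nr_devs before storing at nr_devs - 1, which keeps the store in bounds of the __counted_by(nr_devs) flexible array under bounds-checked builds. A standalone sketch of the same idiom with a toy struct (do/while in place of the kernel's statement-expression):

#include <stdio.h>
#include <stdlib.h>

/* toy stand-in for a replicas entry: count first, flexible array after */
struct entry {
	unsigned char nr;
	unsigned char devs[];	/* upstream annotates this __counted_by(nr) */
};

/* bump the count, then store at nr - 1, so a fortified build sees the
 * array as already large enough for the write */
#define entry_add_dev(e, d) do {		\
	(e)->nr++;				\
	(e)->devs[(e)->nr - 1] = (d);		\
} while (0)

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e) + 4);

	entry_add_dev(e, 3);
	entry_add_dev(e, 1);
	printf("%u devs: %u %u\n", e->nr, e->devs[0], e->devs[1]);
	free(e);
	return 0;
}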
@ -288,10 +288,10 @@ enum bch_fsck_flags {
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
x(accounting_key_junk_at_end, 277, 0) \
x(accounting_key_replicas_nr_devs_0, 278, 0) \
x(accounting_key_replicas_nr_required_bad, 279, 0) \
x(accounting_key_replicas_devs_unsorted, 280, 0) \
x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \

enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
@ -11,6 +11,7 @@

void bch2_dev_missing(struct bch_fs *c, unsigned dev)
{
if (dev != BCH_SB_MEMBER_INVALID)
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
}

@ -473,3 +474,51 @@ unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
nr += bch2_member_exists((struct bch_sb *) sb, i);
return nr;
}

int bch2_sb_member_alloc(struct bch_fs *c)
{
unsigned dev_idx = c->sb.nr_devices;
struct bch_sb_field_members_v2 *mi;
unsigned nr_devices;
unsigned u64s;
int best = -1;
u64 best_last_mount = 0;

if (dev_idx < BCH_SB_MEMBERS_MAX)
goto have_slot;

for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
/* eventually BCH_SB_MEMBERS_MAX will be raised */
if (dev_idx == BCH_SB_MEMBER_INVALID)
continue;

struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
if (bch2_member_alive(&m))
continue;

u64 last_mount = le64_to_cpu(m.last_mount);
if (best < 0 || last_mount < best_last_mount) {
best = dev_idx;
best_last_mount = last_mount;
}
}
if (best >= 0) {
dev_idx = best;
goto have_slot;
}

return -BCH_ERR_ENOSPC_sb_members;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);

mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));

mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi)
return -BCH_ERR_ENOSPC_sb_members;

c->disk_sb.sb->nr_devices = nr_devices;
return dev_idx;
}
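Not part of the diff above — bch2_sb_member_alloc() grows the member table when there is room and otherwise reuses the dead slot with the smallest last_mount. The selection policy in isolation, with a made-up member array:

#include <stdint.h>
#include <stdio.h>

#define MEMBERS_MAX 8

struct member { int alive; uint64_t last_mount; };

/* grow if possible, otherwise reuse the dead slot mounted least recently */
static int member_alloc(const struct member *m, unsigned nr_devices)
{
	if (nr_devices < MEMBERS_MAX)
		return nr_devices;

	int best = -1;
	uint64_t best_last_mount = 0;

	for (unsigned i = 0; i < MEMBERS_MAX; i++) {
		if (m[i].alive)
			continue;
		if (best < 0 || m[i].last_mount < best_last_mount) {
			best = i;
			best_last_mount = m[i].last_mount;
		}
	}
	return best;	/* -1 means no free slot */
}

int main(void)
{
	struct member m[MEMBERS_MAX];

	for (unsigned i = 0; i < MEMBERS_MAX; i++)
		m[i] = (struct member) { .alive = 1, .last_mount = 0 };
	m[3] = (struct member) { .alive = 0, .last_mount = 100 };
	m[5] = (struct member) { .alive = 0, .last_mount = 42 };

	printf("reuse slot %d\n", member_alloc(m, MEMBERS_MAX));	/* 5 */
	return 0;
}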
@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
lockdep_is_held(&c->state_lock));
}

static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
{
return c && dev < c->sb.nr_devices
? rcu_dereference(c->devs[dev])
: NULL;
}

void bch2_dev_missing(struct bch_fs *, unsigned);

static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}

static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
rcu_read_unlock();
return ca;
}

void bch2_dev_missing(struct bch_fs *, unsigned);

static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}
@ -354,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);

int bch2_sb_member_alloc(struct bch_fs *);

#endif /* _BCACHEFS_SB_MEMBERS_H */
@ -8,6 +8,11 @@
*/
#define BCH_SB_MEMBERS_MAX 64

/*
* Sentinal value - indicates a device that does not exist
*/
#define BCH_SB_MEMBER_INVALID 255

#define BCH_MIN_NR_NBUCKETS (1 << 6)

#define BCH_IOPS_MEASUREMENTS() \
@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@ -524,7 +524,7 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;

/* XXX this is wrong, we need a 96 or 128 bit integer type */
c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
c->sb.time_base_lo = div64_u64(le64_to_cpu(src->time_base_lo),
c->sb.nsec_per_time_unit);
c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);

@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
@ -1592,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,

/* Device add/removal: */

static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
struct bpos start = POS(ca->dev_idx, 0);
struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;

/*
* We clear the LRU and need_discard btrees first so that we don't race
* with bch2_do_invalidates() and bch2_do_discards()
*/
ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_dev_usage_remove(c, ca->dev_idx);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}

int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
@ -1730,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
struct bch_sb_field_members_v2 *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
@ -1742,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;

dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
@ -1780,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;

if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;

if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
dev_idx = c->sb.nr_devices;
goto have_slot;
}

int best = -1;
u64 best_last_mount = 0;
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
if (bch2_member_alive(&m))
continue;

u64 last_mount = le64_to_cpu(m.last_mount);
if (best < 0 || last_mount < best_last_mount) {
best = dev_idx;
best_last_mount = last_mount;
}
}
if (best >= 0) {
dev_idx = best;
goto have_slot;
}
no_slot:
ret = -BCH_ERR_ENOSPC_sb_members;
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;

have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);

mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));

mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi) {
ret = -BCH_ERR_ENOSPC_sb_members;
ret = bch2_sb_member_alloc(c);
if (ret < 0) {
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
unsigned dev_idx = ret;

/* success: */

*m = dev_mi;
m->last_mount = cpu_to_le64(ktime_get_real_seconds());
c->disk_sb.sb->nr_devices = nr_devices;
dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;

ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {

static size_t bch2_btree_cache_size(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;

mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
mutex_lock(&bc->lock);
list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);

mutex_unlock(&c->btree_cache.lock);
list_for_each_entry(b, &bc->live[1].list, list)
ret += btree_buf_bytes(b);
list_for_each_entry(b, &bc->freeable, list)
ret += btree_buf_bytes(b);
mutex_unlock(&bc->lock);
return ret;
}

@ -287,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_tab_rjust(out);

prt_human_readable_u64(out, nr_extents
? div_u64(sectors_uncompressed << 9, nr_extents)
? div64_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
@ -444,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;

if (attr == &sysfs_trigger_btree_cache_shrink) {
struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;

sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}

if (attr == &sysfs_trigger_btree_key_cache_shrink) {
@ -456,7 +461,7 @@ STORE(bch2_fs)

sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}

if (attr == &sysfs_trigger_gc)
@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res)
*res = 1;

while (p--) {
if (*res > div_u64(U64_MAX, n))
if (*res > div64_u64(U64_MAX, n))
return -ERANGE;
*res *= n;
}
@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)

parse_or_ret(cp, parse_unit_suffix(cp, &b));

if (v > div_u64(U64_MAX, b))
if (v > div64_u64(U64_MAX, b))
return -ERANGE;
v *= b;

if (f_n > div_u64(U64_MAX, b))
if (f_n > div64_u64(U64_MAX, b))
return -ERANGE;

f_n = div_u64(f_n * b, f_d);
f_n = div64_u64(f_n * b, f_d);
if (v + f_n < v)
return -ERANGE;
v += f_n;
@ -214,7 +214,7 @@ u64 bch2_read_flag_list(const char *opt, const char * const list[])

s = strim(d);

while ((p = strsep(&s, ","))) {
while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);

if (flag < 0) {
@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);

prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}

static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;

u64 q = max(quantiles->entries[i].m, last_q);
prt_printf(out, "%llu ", div_u64(q, u->nsecs));
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
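Not part of the diff above — the div_u64() to div64_u64() conversions matter because the kernel's div_u64() takes a 32-bit divisor, so a larger divisor is silently truncated, while div64_u64() takes a full 64-bit divisor. A userspace stand-in (toy helper names, plain / in place of the kernel helpers) showing the difference:

#include <stdint.h>
#include <stdio.h>

static uint64_t div_u64_like(uint64_t n, uint32_t d)   { return n / d; }
static uint64_t div64_u64_like(uint64_t n, uint64_t d) { return n / d; }

int main(void)
{
	uint64_t ns = 1ULL << 40;		/* ~18 minutes in nanoseconds */
	uint64_t unit = 3600ULL * 1000000000;	/* 1 hour in ns: needs > 32 bits */

	/* truncating the divisor to 32 bits gives a wildly wrong quotient */
	printf("truncated: %llu\n",
	       (unsigned long long) div_u64_like(ns, (uint32_t) unit));
	printf("correct:   %llu\n",
	       (unsigned long long) div64_u64_like(ns, unit));
	return 0;
}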
@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
__u8 x_name[];
__u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);

#endif /* _BCACHEFS_XATTR_FORMAT_H */