Update bcachefs sources to 9736cbbc5cc3 bcachefs: bs > ps support

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-02-20 15:42:51 -05:00
parent 3e15e96cb9
commit dd1a882d17
77 changed files with 1800 additions and 838 deletions

View File

@ -1 +1 @@
63bbe0ca416791095c994aba7bea388e947dd60a
9736cbbc5cc39f6c666befdd787788b6ce6497f6

2
Cargo.lock generated
View File

@ -68,7 +68,7 @@ checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6"
[[package]]
name = "bcachefs-tools"
version = "1.12.0"
version = "1.20.0"
dependencies = [
"anyhow",
"bch_bindgen",

0
include/linux/unicode.h Normal file
View File

View File

@ -871,6 +871,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (data_type_is_empty(new_a->data_type) &&
BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
if (new_a->oldest_gen == new_a->gen &&
!bch2_bucket_sectors_total(*new_a))
new_a->oldest_gen++;
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
alloc_data_type_set(new_a, new_a->data_type);
@ -889,26 +892,20 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!new_a->io_time[READ])
new_a->io_time[READ] = bch2_current_io_time(c, READ);
u64 old_lru = alloc_lru_idx_read(*old_a);
u64 new_lru = alloc_lru_idx_read(*new_a);
if (old_lru != new_lru) {
ret = bch2_lru_change(trans, new.k->p.inode,
bucket_to_u64(new.k->p),
old_lru, new_lru);
if (ret)
goto err;
}
ret = bch2_lru_change(trans, new.k->p.inode,
bucket_to_u64(new.k->p),
alloc_lru_idx_read(*old_a),
alloc_lru_idx_read(*new_a));
if (ret)
goto err;
old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
if (old_lru != new_lru) {
ret = bch2_lru_change(trans,
BCH_LRU_FRAGMENTATION_START,
bucket_to_u64(new.k->p),
old_lru, new_lru);
if (ret)
goto err;
}
ret = bch2_lru_change(trans,
BCH_LRU_BUCKET_FRAGMENTATION,
bucket_to_u64(new.k->p),
alloc_lru_idx_fragmentation(*old_a, ca),
alloc_lru_idx_fragmentation(*new_a, ca));
if (ret)
goto err;
if (old_a->gen != new_a->gen) {
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
@ -1705,7 +1702,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
if (lru_idx) {
ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
bucket_to_u64(alloc_k.k->p),
lru_idx, alloc_k, last_flushed);
if (ret)
goto err;
@ -1735,7 +1733,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = &a_mut->v;
}
ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
a->io_time[READ],
alloc_k, last_flushed);
if (ret)
goto err;
@ -1757,7 +1757,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
bch2_check_stripe_to_lru_refs(c);
bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
@ -2058,16 +2059,71 @@ put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bp(struct btree_trans *trans,
struct bch_dev *ca,
struct bkey_s_c_backpointer bp,
struct bkey_buf *last_flushed)
{
struct btree_iter extent_iter;
struct bkey_s_c extent_k =
bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
int ret = bkey_err(extent_k);
if (ret)
return ret;
struct bkey_i *n =
bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
BTREE_UPDATE_internal_snapshot_node);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
err:
bch2_trans_iter_exit(trans, &extent_iter);
return ret;
}
static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
struct bch_dev *ca,
struct bpos bucket,
u8 gen,
struct bkey_buf *last_flushed)
{
struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
bp_start, bp_end, 0, k,
NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_no_enospc, ({
if (k.k->type != KEY_TYPE_backpointer)
continue;
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
if (bp.v->bucket_gen != gen)
continue;
/* filter out bps with gens that don't match */
invalidate_one_bp(trans, ca, bp, last_flushed);
}));
}
noinline_for_stack
static int invalidate_one_bucket(struct btree_trans *trans,
struct bch_dev *ca,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
struct bkey_buf *last_flushed,
s64 *nr_to_invalidate)
{
struct bch_fs *c = trans->c;
struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
unsigned cached_sectors;
struct btree_iter alloc_iter = {};
int ret = 0;
if (*nr_to_invalidate <= 0)
@ -2084,35 +2140,37 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
ret = PTR_ERR_OR_ZERO(a);
struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
BTREE_ID_alloc, bucket,
BTREE_ITER_cached);
ret = bkey_err(alloc_k);
if (ret)
goto out;
return ret;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
/* We expect harmless races here due to the btree write buffer: */
if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
BUG_ON(a->v.dirty_sectors);
/*
* Impossible since alloc_lru_idx_read() only returns nonzero if the
* bucket is supposed to be on the cached bucket LRU (i.e.
* BCH_DATA_cached)
*
* bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
*/
BUG_ON(a->data_type != BCH_DATA_cached);
BUG_ON(a->dirty_sectors);
if (!a->v.cached_sectors)
if (!a->cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
cached_sectors = a->v.cached_sectors;
unsigned cached_sectors = a->cached_sectors;
u8 gen = a->gen;
SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
a->v.gen++;
a->v.data_type = 0;
a->v.dirty_sectors = 0;
a->v.stripe_sectors = 0;
a->v.cached_sectors = 0;
a->v.io_time[READ] = bch2_current_io_time(c, READ);
a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
ret = bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_no_enospc);
ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
if (ret)
goto out;
@ -2120,6 +2178,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
--*nr_to_invalidate;
out:
fsck_err:
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
}
@ -2146,6 +2205,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
@ -2170,7 +2233,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (!k.k)
break;
ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
restart_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@ -2183,6 +2246,7 @@ restart_err:
err:
bch2_trans_put(trans);
percpu_ref_put(&ca->io_ref);
bch2_bkey_buf_exit(&last_flushed, c);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

View File

@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
if (a.stripe)
return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
if (bch2_bucket_sectors_dirty(a))
return data_type;
return bucket_data_type(data_type);
if (a.cached_sectors)
return BCH_DATA_cached;
if (BCH_ALLOC_V4_NEED_DISCARD(&a))

View File

@ -11,6 +11,7 @@
#include "checksum.h"
#include "disk_accounting.h"
#include "error.h"
#include "progress.h"
#include <linux/mm.h>
@ -518,6 +519,22 @@ check_existing_bp:
if (!other_extent.k)
goto missing;
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
if (ca) {
struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
bkey_for_each_ptr(other_extent_ptrs, ptr)
if (ptr->dev == bp->k.p.inode &&
dev_ptr_stale_rcu(ca, ptr)) {
ret = drop_dev_and_update(trans, other_bp.v->btree_id,
other_extent, bp->k.p.inode);
if (ret)
goto err;
goto out;
}
}
rcu_read_unlock();
if (bch2_extents_match(orig_k, other_extent)) {
printbuf_reset(&buf);
prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
@ -594,9 +611,6 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (p.ptr.cached)
continue;
if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
continue;
@ -604,9 +618,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
rcu_read_unlock();
if (check || empty) {
if ((check || empty) && !stale) {
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
@ -719,71 +735,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
struct progress_indicator_state {
unsigned long next_print;
u64 nodes_seen;
u64 nodes_total;
struct btree *last_node;
};
static inline void progress_init(struct progress_indicator_state *s,
struct bch_fs *c,
u64 btree_id_mask)
{
memset(s, 0, sizeof(*s));
s->next_print = jiffies + HZ * 10;
for (unsigned i = 0; i < BTREE_ID_NR; i++) {
if (!(btree_id_mask & BIT_ULL(i)))
continue;
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_btree,
.btree.id = i,
};
u64 v;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
s->nodes_total += div64_ul(v, btree_sectors(c));
}
}
static inline bool progress_update_p(struct progress_indicator_state *s)
{
bool ret = time_after_eq(jiffies, s->next_print);
if (ret)
s->next_print = jiffies + HZ * 10;
return ret;
}
static void progress_update_iter(struct btree_trans *trans,
struct progress_indicator_state *s,
struct btree_iter *iter,
const char *msg)
{
struct bch_fs *c = trans->c;
struct btree *b = path_l(btree_iter_path(trans, iter))->b;
s->nodes_seen += b != s->last_node;
s->last_node = b;
if (progress_update_p(s)) {
struct printbuf buf = PRINTBUF;
unsigned percent = s->nodes_total
? div64_u64(s->nodes_seen * 100, s->nodes_total)
: 0;
prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
msg, percent, s->nodes_seen, s->nodes_total);
bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
}
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
@ -791,7 +742,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct progress_indicator_state progress;
int ret = 0;
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
@ -810,7 +761,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@ -905,9 +856,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
goto err;
}
/* Cached pointers don't have backpointers: */
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
sectors[ALLOC_cached] != a->cached_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
@ -916,6 +866,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
}
if (sectors[ALLOC_dirty] > a->dirty_sectors ||
sectors[ALLOC_cached] > a->cached_sectors ||
sectors[ALLOC_stripe] > a->stripe_sectors) {
ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
-BCH_ERR_transaction_restart_nested;
@ -923,7 +874,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
}
if (!sectors[ALLOC_dirty] &&
!sectors[ALLOC_stripe])
!sectors[ALLOC_stripe] &&
!sectors[ALLOC_cached])
__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
else
__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
@ -1210,11 +1162,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k, ({
progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
check_one_backpointer(trans, start, end, k, &last_flushed);
}));

View File

@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#ifndef _BCACHEFS_BACKPOINTERS_H
#define _BCACHEFS_BACKPOINTERS_H
#include "btree_cache.h"
#include "btree_iter.h"
@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
return BCH_DATA_btree;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
if (p.has_ec)
return BCH_DATA_stripe;
if (p.ptr.cached)
return BCH_DATA_cached;
else
return BCH_DATA_user;
case KEY_TYPE_stripe: {
const struct bch_extent_ptr *ptr = &entry->ptr;
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
struct bkey_i_backpointer *bp)
{
bkey_backpointer_init(&bp->k_i);
bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
bp->k.p.inode = p.ptr.dev;
if (k.k->type != KEY_TYPE_stripe)
bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
else {
/*
* Put stripe backpointers where they won't collide with the
* extent backpointers within the stripe:
*/
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
}
bp->v = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,

View File

@ -203,6 +203,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
#include <linux/unicode.h>
#include "bcachefs_format.h"
#include "btree_journal_iter_types.h"
@ -444,6 +445,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_sort) \
x(btree_node_read) \
x(btree_node_read_done) \
x(btree_node_write) \
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
@ -456,6 +458,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_low_on_space) \
x(blocked_journal_low_on_pin) \
x(blocked_journal_max_in_flight) \
x(blocked_journal_max_open) \
x(blocked_key_cache_flush) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
@ -697,6 +700,8 @@ enum bch_write_ref {
BCH_WRITE_REF_NR,
};
#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
struct bch_fs {
struct closure cl;
@ -781,6 +786,9 @@ struct bch_fs {
u64 btrees_lost_data;
} sb;
#ifdef CONFIG_UNICODE
struct unicode_map *cf_encoding;
#endif
struct bch_sb_handle disk_sb;

View File

@ -686,7 +686,11 @@ struct bch_sb_field_ext {
x(inode_depth, BCH_VERSION(1, 17)) \
x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
x(autofix_errors, BCH_VERSION(1, 19)) \
x(directory_size, BCH_VERSION(1, 20))
x(directory_size, BCH_VERSION(1, 20)) \
x(cached_backpointers, BCH_VERSION(1, 21)) \
x(stripe_backpointers, BCH_VERSION(1, 22)) \
x(stripe_lru, BCH_VERSION(1, 23)) \
x(casefolding, BCH_VERSION(1, 24))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -908,7 +912,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(journal_no_flush, 16) \
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18) \
x(incompat_version_field, 19)
x(incompat_version_field, 19) \
x(casefolding, 20)
#define BCH_SB_FEATURES_ALWAYS \
(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \

View File

@ -234,6 +234,11 @@ enum bch_data_event {
BCH_DATA_EVENT_NR = 1,
};
enum data_progress_data_type_special {
DATA_PROGRESS_DATA_TYPE_phys = 254,
DATA_PROGRESS_DATA_TYPE_done = 255,
};
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;

View File

@ -27,6 +27,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "progress.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "recovery.h"
@ -656,7 +657,9 @@ fsck_err:
return ret;
}
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
static int bch2_gc_btree(struct btree_trans *trans,
struct progress_indicator_state *progress,
enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
}));
@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id ids[BTREE_ID_NR];
struct printbuf buf = PRINTBUF;
unsigned i;
int ret = 0;
for (i = 0; i < BTREE_ID_NR; i++)
struct progress_indicator_state progress;
bch2_progress_init(&progress, c, ~0ULL);
enum btree_id ids[BTREE_ID_NR];
for (unsigned i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
ret = bch2_gc_btree(trans, btree, true);
ret = bch2_gc_btree(trans, &progress, btree, true);
}
printbuf_exit(&buf);

View File

@ -997,7 +997,7 @@ drop_this_key:
}
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
set_btree_node_need_rewrite(b);
}
fsck_err:
@ -2016,7 +2016,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new;
@ -2024,6 +2024,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
bch2_btree_complete_write(c, b, w);
if (start_time)
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
old = READ_ONCE(b->flags);
do {
new = old;
@ -2054,7 +2057,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_trans *trans = bch2_trans_get(c);
@ -2062,7 +2065,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
/* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
__btree_node_write_done(c, b);
__btree_node_write_done(c, b, start_time);
six_unlock_read(&b->c.lock);
}
@ -2072,6 +2075,7 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
u64 start_time = wbio->start_time;
int ret = 0;
btree_bounce_free(c,
@ -2104,7 +2108,7 @@ static void btree_node_write_work(struct work_struct *work)
}
out:
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
@ -2208,6 +2212,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
u64 start_time = local_clock();
int ret;
if (flags & BTREE_WRITE_ALREADY_STARTED)
@ -2416,6 +2421,7 @@ do_write:
wbio->data = data;
wbio->data_bytes = bytes;
wbio->sector_offset = b->written;
wbio->start_time = start_time;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
@ -2443,7 +2449,7 @@ err:
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
__btree_node_write_done(c, b);
__btree_node_write_done(c, b, 0);
}
/*

View File

@ -52,6 +52,7 @@ struct btree_write_bio {
void *data;
unsigned data_bytes;
unsigned sector_offset;
u64 start_time;
struct bch_write_bio wbio;
};

View File

@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
}
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
}
while (1) {
k = __bch2_btree_iter_peek_prev(iter, search_key);
if (unlikely(!k.k))
@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
}
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (ret)
return ERR_PTR(ret);
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
@ -3163,7 +3185,8 @@ out_new_mem:
if (old_bytes) {
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
return ERR_PTR(btree_trans_restart_ip(trans,
BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
out_change_top:
p = trans->mem + trans->mem_top;
@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->last_begin_ip = _RET_IP_;
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
if (trans->restarted) {
trans->restart_count_this_trans++;
} else {
trans->restart_count_this_trans = 0;
}
#endif
trans_set_locked(trans, false);
if (trans->restarted) {

View File

@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err)
return btree_trans_restart_ip(trans, err, _THIS_IP_);
}
static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
{
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
trace_and_count(trans->c, trans_restart_injected, trans, ip);
return btree_trans_restart_ip(trans,
BCH_ERR_transaction_restart_fault_inject, ip);
}
#endif
return 0;
}
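Editorial aside (not part of the commit): the mask ~(~0ULL << min(63, 10 + restart_count_this_trans)) keeps the low 10 + n bits of ktime_get_ns(), so a restart is injected only when all of those bits happen to be zero, roughly once every 2^(10 + n) calls (about 1 in 1024 for a fresh transaction). Because restart_count_this_trans counts the consecutive restarts of the current transaction, each further injection is half as likely, so fault injection alone cannot starve a transaction. A hypothetical helper expressing the same arithmetic:

	/* illustration only, not a function from this commit */
	static inline u64 injection_period(unsigned restarts_so_far)
	{
		/* approximate number of calls per injected restart */
		return 1ULL << min(63u, 10 + restarts_so_far);
	}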
bool bch2_btree_node_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
@ -739,7 +751,7 @@ transaction_restart: \
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
_ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
})
#define for_each_btree_key_max_continue(_trans, _iter, \

View File

@ -336,6 +336,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->cached != path->cached);
BUG_ON(i->level != path->level);
BUG_ON(i->btree_id != path->btree_id);
BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
@ -517,69 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
unsigned *btree_id_updates_start)
{
bool trans_trigger_run;
/*
* Running triggers will append more updates to the list of updates as
* we're walking it:
*/
do {
trans_trigger_run = false;
for (unsigned i = *btree_id_updates_start;
i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
i++) {
if (trans->updates[i].btree_id < btree_id) {
*btree_id_updates_start = i;
continue;
}
int ret = run_one_trans_trigger(trans, trans->updates + i);
if (ret < 0)
return ret;
if (ret)
trans_trigger_run = true;
}
} while (trans_trigger_run);
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
i->btree_id == btree_id &&
btree_node_type_has_trans_triggers(i->bkey_type) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
return 0;
}
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
unsigned btree_id = 0, btree_id_updates_start = 0;
int ret = 0;
unsigned sort_id_start = 0;
/*
*
* For a given btree, this algorithm runs insert triggers before
* overwrite triggers: this is so that when extents are being moved
* (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
* they are re-added.
*/
for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
if (btree_id == BTREE_ID_alloc)
continue;
while (sort_id_start < trans->nr_updates) {
unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
bool trans_trigger_run;
ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
if (ret)
return ret;
/*
* For a given btree, this algorithm runs insert triggers before
* overwrite triggers: this is so that when extents are being
* moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
* references before they are re-added.
*
* Running triggers will append more updates to the list of
* updates as we're walking it:
*/
do {
trans_trigger_run = false;
for (i = sort_id_start;
i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
i++) {
if (trans->updates[i].sort_order < sort_id) {
sort_id_start = i;
continue;
}
int ret = run_one_trans_trigger(trans, trans->updates + i);
if (ret < 0)
return ret;
if (ret)
trans_trigger_run = true;
}
} while (trans_trigger_run);
sort_id_start = i;
}
btree_id_updates_start = 0;
ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
if (ret)
return ret;
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
@ -999,6 +976,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
bch2_trans_verify_not_unlocked_or_in_restart(trans);
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret))
goto out_reset;
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
goto out_reset;

View File

@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
struct btree_insert_entry {
unsigned flags;
u8 sort_order;
u8 bkey_type;
enum btree_id btree_id:8;
u8 level:4;
@ -509,6 +510,9 @@ struct btree_trans {
bool notrace_relock_fail:1;
enum bch_errcode restarted:16;
u32 restart_count;
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
u32 restart_count_this_trans;
#endif
u64 last_begin_time;
unsigned long last_begin_ip;
@ -850,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree)
return BIT_ULL(btree) & mask;
}
static inline u8 btree_trigger_order(enum btree_id btree)
{
switch (btree) {
case BTREE_ID_alloc:
return U8_MAX;
case BTREE_ID_stripes:
return U8_MAX - 1;
default:
return btree;
}
}
struct btree_root {
struct btree *b;

View File

@ -17,7 +17,7 @@
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
return cmp_int(l->sort_order, r->sort_order) ?:
cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
n = (struct btree_insert_entry) {
.flags = flags,
.sort_order = btree_trigger_order(path->btree_id),
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,

View File

@ -681,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as)
b = as->old_nodes[i];
bch2_trans_begin(trans);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
seq = b->data ? b->data->keys.seq : 0;
six_unlock_read(&b->c.lock);
bch2_trans_unlock_long(trans);
if (seq == as->old_nodes_seq[i])
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,

View File

@ -282,12 +282,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
(void *) btree_bkey_last(b, t));
ssize_t remaining_space =
__bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
if (b->written + block_sectors(c) <= btree_sectors(c))
return bne;
} else {
if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&

View File

@ -590,11 +590,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (ret)
goto err;
if (!p.ptr.cached) {
ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
if (ret)
goto err;
}
ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
if (ret)
goto err;
}
if (flags & BTREE_TRIGGER_gc) {
@ -674,10 +672,10 @@ err:
return -BCH_ERR_ENOMEM_mark_stripe_ptr;
}
mutex_lock(&c->ec_stripes_heap_lock);
gc_stripe_lock(m);
if (!m || !m->alive) {
mutex_unlock(&c->ec_stripes_heap_lock);
gc_stripe_unlock(m);
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
@ -693,7 +691,7 @@ err:
.type = BCH_DISK_ACCOUNTING_replicas,
};
memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
mutex_unlock(&c->ec_stripes_heap_lock);
gc_stripe_unlock(m);
acc.replicas.data_type = data_type;
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);

View File

@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
/*
* Ugly hack alert:
*
* We need to cram a spinlock in a single byte, because that's what we have left
* in struct bucket, and we care about the size of these - during fsck, we need
* in memory state for every single bucket on every device.
*
* We used to do
* while (xchg(&b->lock, 1)) cpu_relax();
* but, it turns out not all architectures support xchg on a single byte.
*
* So now we use bit_spin_lock(), with fun games since we can't burn a whole
* ulong for this - we just need to make sure the lock bit always ends up in the
* first byte.
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define BUCKET_LOCK_BITNR 0
#else
#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
#endif
union ulong_byte_assert {
ulong ulong;
u8 byte;
};
static inline void bucket_unlock(struct bucket *b)
{
BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);

View File

@ -7,6 +7,33 @@
#define BUCKET_JOURNAL_SEQ_BITS 16
/*
* Ugly hack alert:
*
* We need to cram a spinlock in a single byte, because that's what we have left
* in struct bucket, and we care about the size of these - during fsck, we need
* in memory state for every single bucket on every device.
*
* We used to do
* while (xchg(&b->lock, 1)) cpu_relax();
* but, it turns out not all architectures support xchg on a single byte.
*
* So now we use bit_spin_lock(), with fun games since we can't burn a whole
* ulong for this - we just need to make sure the lock bit always ends up in the
* first byte.
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define BUCKET_LOCK_BITNR 0
#else
#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
#endif
union ulong_byte_assert {
ulong ulong;
u8 byte;
};
struct bucket {
u8 lock;
u8 gen_valid:1;

View File

@ -315,8 +315,10 @@ static int bch2_data_thread(void *arg)
ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
if (ctx->thr.ret == -BCH_ERR_device_offline)
ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
else
else {
ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
}
return 0;
}

View File

@ -271,8 +271,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc->compressed_size << 9 > c->opts.encoded_extent_max) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "error rewriting existing data: extent too big");
bch2_write_op_error(&buf, op, op->pos.offset,
"extent too big to decompress");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
return -EIO;
@ -283,8 +283,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
if (__bio_uncompress(c, bio, data.b, *crc)) {
if (!c->opts.no_data_io) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "error rewriting existing data: decompression error");
bch2_write_op_error(&buf, op, op->pos.offset,
"decompression error");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}

View File

@ -93,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
return true;
}
static noinline void trace_move_extent_finish2(struct data_update *u,
static noinline void trace_io_move_finish2(struct data_update *u,
struct bkey_i *new,
struct bkey_i *insert)
{
@ -113,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
trace_move_extent_finish(c, buf.buf);
trace_io_move_finish(c, buf.buf);
printbuf_exit(&buf);
}
static void trace_move_extent_fail2(struct data_update *m,
static void trace_io_move_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
@ -128,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m,
struct printbuf buf = PRINTBUF;
unsigned rewrites_found = 0;
if (!trace_move_extent_fail_enabled())
if (!trace_io_move_fail_enabled())
return;
prt_str(&buf, msg);
@ -168,7 +168,7 @@ static void trace_move_extent_fail2(struct data_update *m,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
}
trace_move_extent_fail(c, buf.buf);
trace_io_move_fail(c, buf.buf);
printbuf_exit(&buf);
}
@ -216,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
new = bkey_i_to_extent(bch2_keylist_front(keys));
if (!bch2_extents_match(k, old)) {
trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
NULL, "no match:");
goto nowork;
}
@ -256,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (m->data_opts.rewrite_ptrs &&
!rewrites_found &&
bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
goto nowork;
}
@ -273,7 +273,7 @@ restart_drop_conflicting_replicas:
}
if (!bkey_val_u64s(&new->k)) {
trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
goto nowork;
}
@ -342,6 +342,7 @@ restart_drop_extra_replicas:
struct printbuf buf = PRINTBUF;
prt_str(&buf, "about to insert invalid key in data update path");
prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
@ -386,9 +387,9 @@ restart_drop_extra_replicas:
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
if (trace_move_extent_finish_enabled())
trace_move_extent_finish2(m, &new->k_i, insert);
this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
if (trace_io_move_finish_enabled())
trace_io_move_finish2(m, &new->k_i, insert);
}
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -410,7 +411,7 @@ nowork:
&m->stats->sectors_raced);
}
count_event(c, move_extent_fail);
count_event(c, io_move_fail);
bch2_btree_iter_advance(&iter);
goto next;
@ -438,7 +439,7 @@ void bch2_data_update_read_done(struct data_update *m)
m->op.crc = m->rbio.pick.crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
this_cpu_add(m->op.c->counters[BCH_COUNTER_move_extent_write], m->k.k->k.size);
this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
closure_call(&m->op.cl, bch2_write, NULL, NULL);
}
@ -672,12 +673,46 @@ static bool can_allocate_without_blocking(struct bch_fs *c,
return nr_replicas >= m->op.nr_replicas;
}
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_io_opts *io_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
/* write path might have to decompress data: */
unsigned buf_bytes = 0;
bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
if (!m->bvecs)
return -ENOMEM;
bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
kfree(m->bvecs);
m->bvecs = NULL;
return -ENOMEM;
}
rbio_init(&m->rbio.bio, c, *io_opts, NULL);
m->rbio.bio.bi_iter.bi_size = buf_bytes;
m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
return 0;
}
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
struct bch_io_opts *io_opts,
struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
@ -704,7 +739,7 @@ int bch2_data_update_init(struct btree_trans *trans,
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
bch2_write_op_init(&m->op, c, *io_opts);
m->op.pos = bkey_start_pos(k.k);
m->op.version = k.k->bversion;
m->op.target = data_opts.target;
@ -715,7 +750,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_data_encoded|
BCH_WRITE_move|
m->data_opts.write_flags;
m->op.compression_opt = io_opts.background_compression;
m->op.compression_opt = io_opts->background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
unsigned durability_have = 0, durability_removing = 0;
@ -753,7 +788,7 @@ int bch2_data_update_init(struct btree_trans *trans,
ptr_bit <<= 1;
}
unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
* If current extent durability is less than io_opts.data_replicas,
@ -786,7 +821,7 @@ int bch2_data_update_init(struct btree_trans *trans,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
if (!ret)
ret = -BCH_ERR_data_update_done_no_writes_needed;
goto out_bkey_buf_exit;
@ -824,33 +859,11 @@ int bch2_data_update_init(struct btree_trans *trans,
goto out_nocow_unlock;
}
/* write path might have to decompress data: */
unsigned buf_bytes = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
if (!m->bvecs)
goto enomem;
bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL))
goto enomem;
rbio_init(&m->rbio.bio, c, io_opts, NULL);
m->rbio.bio.bi_iter.bi_size = buf_bytes;
m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
ret = bch2_data_update_bios_init(m, c, io_opts);
if (ret)
goto out_nocow_unlock;
return 0;
enomem:
ret = -ENOMEM;
kfree(m->bvecs);
m->bvecs = NULL;
out_nocow_unlock:
if (c->opts.nocow_enabled)
bkey_nocow_unlock(c, k);

View File

@ -51,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
struct bch_io_opts *,
struct data_update_opts *);
int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
struct bch_io_opts *);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *,
struct data_update *,
struct write_point_specifier,
struct bch_io_opts, struct data_update_opts,
struct bch_io_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);

View File

@ -13,6 +13,40 @@
#include <linux/dcache.h>
static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
#ifdef CONFIG_UNICODE
unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
int ret = PTR_ERR_OR_ZERO(buf);
if (ret)
return ret;
ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
if (ret <= 0)
return ret;
*out_cf = (struct qstr) QSTR_INIT(buf, ret);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
if (likely(!info->cf_encoding)) {
*out_cf = *str;
return 0;
} else {
return bch2_casefold(trans, info, str, out_cf);
}
}
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
@ -28,13 +62,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
#endif
return bkey_bytes -
offsetof(struct bch_dirent, d_name) -
(d.v->d_casefold
? offsetof(struct bch_dirent, d_cf_name_block.d_names)
: offsetof(struct bch_dirent, d_name)) -
trailing_nuls;
}
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
{
return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
if (d.v->d_casefold) {
unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
} else {
return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
}
}
static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
{
if (d.v->d_casefold) {
unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
} else {
return (struct qstr) QSTR_INIT(NULL, 0);
}
}
static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
{
return d.v->d_casefold
? bch2_dirent_get_casefold_name(d)
: bch2_dirent_get_name(d);
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@ -57,7 +116,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr name = bch2_dirent_get_name(d);
struct qstr name = bch2_dirent_get_lookup_name(d);
return bch2_dirent_hash(info, &name);
}
@ -65,7 +124,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr l_name = bch2_dirent_get_lookup_name(l);
const struct qstr *r_name = _r;
return !qstr_eq(l_name, *r_name);
@ -75,8 +134,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr r_name = bch2_dirent_get_name(r);
const struct qstr l_name = bch2_dirent_get_lookup_name(l);
const struct qstr r_name = bch2_dirent_get_lookup_name(r);
return !qstr_eq(l_name, r_name);
}
@ -104,17 +163,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
unsigned name_block_len = bch2_dirent_name_bytes(d);
struct qstr d_name = bch2_dirent_get_name(d);
struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
int ret = 0;
bkey_fsck_err_on(!d_name.len,
c, dirent_empty_name,
"empty name");
bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
c, dirent_val_too_big,
"value too big (%zu > %u)",
bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
"dirent names exceed bkey size (%d + %d > %d)",
d_name.len, d_cf_name.len, name_block_len);
/*
* Check new keys don't exceed the max length
@ -142,6 +203,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
le64_to_cpu(d.v->d_inum) == d.k->p.inode,
c, dirent_to_itself,
"dirent points to own directory");
if (d.v->d_casefold) {
bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
d_cf_name.len > BCH_NAME_MAX,
c, dirent_cf_name_too_big,
"dirent w/ cf name too big (%u > %u)",
d_cf_name.len, BCH_NAME_MAX);
bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
c, dirent_stray_data_after_cf_name,
"dirent has stray data after cf name's NUL");
}
fsck_err:
return ret;
}
@ -163,15 +236,14 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
subvol_inum dir, u8 type,
const struct qstr *name, u64 dst)
static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
subvol_inum dir,
u8 type,
int name_len, int cf_name_len,
u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
if (name->len > BCH_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
BUG_ON(u64s > U8_MAX);
@ -190,14 +262,65 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
dirent->v.d_type = type;
dirent->v.d_unused = 0;
dirent->v.d_casefold = cf_name_len ? 1 : 0;
memcpy(dirent->v.d_name, name->name, name->len);
memset(dirent->v.d_name + name->len, 0,
bkey_val_bytes(&dirent->k) -
offsetof(struct bch_dirent, d_name) -
name->len);
return dirent;
}
EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
const struct qstr *name)
{
EBUG_ON(dirent->v.d_casefold);
memcpy(&dirent->v.d_name[0], name->name, name->len);
memset(&dirent->v.d_name[name->len], 0,
bkey_val_bytes(&dirent->k) -
offsetof(struct bch_dirent, d_name) -
name->len);
}
static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
const struct qstr *name,
const struct qstr *cf_name)
{
EBUG_ON(!dirent->v.d_casefold);
EBUG_ON(!cf_name->len);
dirent->v.d_cf_name_block.d_name_len = name->len;
dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len;
memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
bkey_val_bytes(&dirent->k) -
offsetof(struct bch_dirent, d_cf_name_block.d_names) -
name->len + cf_name->len);
EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
subvol_inum dir,
u8 type,
const struct qstr *name,
const struct qstr *cf_name,
u64 dst)
{
struct bkey_i_dirent *dirent;
if (name->len > BCH_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
if (IS_ERR(dirent))
return dirent;
if (cf_name)
dirent_init_casefolded_name(dirent, name, cf_name);
else
dirent_init_regular_name(dirent, name);
EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
return dirent;
}
@ -213,7 +336,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
struct bkey_i_dirent *dirent;
int ret;
dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@ -233,16 +356,28 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
u64 *i_size,
enum btree_iter_update_trigger_flags flags)
{
struct bkey_i_dirent *dirent;
int ret;
dirent = dirent_create_key(trans, dir, type, name, dst_inum);
if (hash_info->cf_encoding) {
struct qstr cf_name;
ret = bch2_casefold(trans, hash_info, name, &cf_name);
if (ret)
return ret;
dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum);
} else {
dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum);
}
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
*i_size += bkey_bytes(&dirent->k);
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
@ -275,12 +410,13 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
}
int bch2_dirent_rename(struct btree_trans *trans,
subvol_inum src_dir, struct bch_hash_info *src_hash,
subvol_inum dst_dir, struct bch_hash_info *dst_hash,
subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
struct qstr src_name_lookup, dst_name_lookup;
struct btree_iter src_iter = { NULL };
struct btree_iter dst_iter = { NULL };
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
@ -295,8 +431,11 @@ int bch2_dirent_rename(struct btree_trans *trans,
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
if (ret)
goto out;
old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
src_hash, src_dir, src_name,
src_hash, src_dir, &src_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
@ -308,6 +447,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
goto out;
/* Lookup dst: */
ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
if (ret)
goto out;
if (mode == BCH_RENAME) {
/*
* Note that we're _not_ checking if the target already exists -
@ -315,12 +457,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
* correctness:
*/
ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name);
dst_hash, dst_dir, &dst_name_lookup);
if (ret)
goto out;
} else {
old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name,
dst_hash, dst_dir, &dst_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
@ -336,7 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
new_dst = dirent_create_key(trans, dst_dir, 0, dst_name,
dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
@ -346,7 +489,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
new_src = dirent_create_key(trans, src_dir, 0, src_name,
src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
@ -406,6 +550,14 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_src->v.d_type == DT_SUBVOL)
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
if (old_dst.k)
*dst_dir_i_size -= bkey_bytes(old_dst.k);
*src_dir_i_size -= bkey_bytes(old_src.k);
if (mode == BCH_RENAME_EXCHANGE)
*src_dir_i_size += bkey_bytes(&new_src->k);
*dst_dir_i_size += bkey_bytes(&new_dst->k);
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
@ -465,9 +617,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
struct qstr lookup_name;
int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
if (ret)
return ret;
struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir, name, flags);
int ret = bkey_err(k);
hash_info, dir, &lookup_name, flags);
ret = bkey_err(k);
if (ret)
goto err;

View File

@ -25,15 +25,13 @@ struct bch_inode_info;
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
static inline unsigned dirent_val_u64s(unsigned len)
static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
{
return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
sizeof(u64));
}
unsigned bytes = cf_len
? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
: offsetof(struct bch_dirent, d_name) + len;
static inline unsigned int dirent_occupied_size(const struct qstr *name)
{
return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64);
return DIV_ROUND_UP(bytes, sizeof(u64));
}
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
@ -52,7 +50,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
enum btree_iter_update_trigger_flags);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
const struct qstr *, u64, u64 *, u64 *,
enum btree_iter_update_trigger_flags);
static inline unsigned vfs_d_type(unsigned type)
@ -67,8 +65,8 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
subvol_inum, struct bch_hash_info *,
subvol_inum, struct bch_hash_info *,
subvol_inum, struct bch_hash_info *, u64 *,
subvol_inum, struct bch_hash_info *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);

View File

@ -29,9 +29,25 @@ struct bch_dirent {
* Copy of mode bits 12-15 from the target inode - so userspace can get
* the filetype without having to do a stat()
*/
__u8 d_type;
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 d_type:5,
d_unused:2,
d_casefold:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
__u8 d_casefold:1,
d_unused:2,
d_type:5;
#endif
__u8 d_name[];
union {
struct {
__u8 d_pad;
__le16 d_name_len;
__le16 d_cf_name_len;
__u8 d_names[0];
} d_cf_name_block __packed;
__u8 d_name[0];
} __packed;
} __packed __aligned(8);
#define DT_SUBVOL 16

View File

@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *
static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
u64 *v, unsigned nr)
{
percpu_down_read(&c->mark_lock);
struct bch_accounting_mem *acc = &c->accounting;
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &p);
bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
percpu_up_read(&c->mark_lock);
}
static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)

View File

@ -20,6 +20,7 @@
#include "io_read.h"
#include "io_write.h"
#include "keylist.h"
#include "lru.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@ -298,10 +299,22 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
if (flags & BTREE_TRIGGER_transactional) {
struct extent_ptr_decoded p = {
.ptr = *ptr,
.crc = bch2_extent_crc_unpack(s.k, NULL),
};
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
(const union bch_extent_entry *) ptr, &bp);
struct bkey_i_alloc_v4 *a =
bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
ret = PTR_ERR_OR_ZERO(a) ?:
__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
!(flags & BTREE_TRIGGER_overwrite));
if (ret)
goto err;
}
if (flags & BTREE_TRIGGER_gc) {
@ -399,6 +412,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
(new_s->nr_blocks != old_s->nr_blocks ||
new_s->nr_redundant != old_s->nr_redundant));
if (flags & BTREE_TRIGGER_transactional) {
int ret = bch2_lru_change(trans,
BCH_LRU_STRIPE_FRAGMENTATION,
idx,
stripe_lru_pos(old_s),
stripe_lru_pos(new_s));
if (ret)
return ret;
}
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
/*
@ -1163,6 +1185,10 @@ err:
return ret;
}
/*
* XXX
* can we kill this and delete stripes from the trigger?
*/
static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
@ -1380,8 +1406,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
if (bp_k.k->type != KEY_TYPE_backpointer)
continue;
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
if (bp.v->btree_id == BTREE_ID_stripes)
continue;
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
bkey_s_c_to_backpointer(bp_k), &last_flushed);
bp, &last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, c);
@ -2503,3 +2533,40 @@ int bch2_fs_ec_init(struct bch_fs *c)
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
struct bkey_s_c k,
struct bkey_buf *last_flushed)
{
if (k.k->type != KEY_TYPE_stripe)
return 0;
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
u64 lru_idx = stripe_lru_pos(s.v);
if (lru_idx) {
int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
k.k->p.offset, lru_idx, k, last_flushed);
if (ret)
return ret;
}
return 0;
}
int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
{
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
}

View File

@ -92,6 +92,31 @@ static inline void stripe_csum_set(struct bch_stripe *s,
memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
}
#define STRIPE_LRU_POS_EMPTY 1
static inline u64 stripe_lru_pos(const struct bch_stripe *s)
{
if (!s)
return 0;
unsigned blocks_empty = 0, blocks_nonempty = 0;
for (unsigned i = 0; i < s->nr_blocks; i++) {
blocks_empty += !stripe_blockcount_get(s, i);
blocks_nonempty += !!stripe_blockcount_get(s, i);
}
/* Will be picked up by the stripe_delete worker */
if (!blocks_nonempty)
return STRIPE_LRU_POS_EMPTY;
if (!blocks_empty)
return 0;
/* invert: more blocks empty = reuse first */
return LRU_TIME_MAX - blocks_empty;
}
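As a worked example of the ordering above (a standalone sketch; the plain array stands in for stripe_blockcount_get() and the numbers are invented): a stripe with 3 of 6 blocks empty maps to LRU_TIME_MAX - 3, which sorts before one with 2 empty blocks at LRU_TIME_MAX - 2, so the emptier stripe is reused first.
static unsigned long long demo_stripe_lru_pos(const unsigned *block_sectors, unsigned nr_blocks)
{
	unsigned blocks_empty = 0, blocks_nonempty = 0;

	for (unsigned i = 0; i < nr_blocks; i++) {
		blocks_empty	+= !block_sectors[i];
		blocks_nonempty	+= !!block_sectors[i];
	}

	if (!blocks_nonempty)			/* fully empty: flagged for the delete worker */
		return 1;			/* STRIPE_LRU_POS_EMPTY */
	if (!blocks_empty)			/* fully used: not on the LRU at all */
		return 0;
	/* partially empty: emptier stripes get smaller keys, so they come off the LRU first */
	return ((1ULL << 48) - 1) - blocks_empty;	/* LRU_TIME_MAX - blocks_empty */
}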
static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
@ -132,6 +157,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
m->sectors);
}
static inline void gc_stripe_unlock(struct gc_stripe *s)
{
BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
}
static inline void gc_stripe_lock(struct gc_stripe *s)
{
wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
TASK_UNINTERRUPTIBLE);
}
struct bch_read_bio;
struct ec_stripe_buf {
@ -268,4 +307,6 @@ void bch2_fs_ec_exit(struct bch_fs *);
void bch2_fs_ec_init_early(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
int bch2_check_stripe_to_lru_refs(struct bch_fs *);
#endif /* _BCACHEFS_EC_H */

View File

@ -20,12 +20,11 @@ struct stripe {
};
struct gc_stripe {
u8 lock;
unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 sectors;
u8 nr_blocks;
u8 nr_redundant;
unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];

View File

@ -530,35 +530,59 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
subvol_inum inum, u64 offset)
{
u32 restart_count = trans->restart_count;
int ret = 0;
/* XXX: we don't yet attempt to print paths when we don't know the subvol */
if (inum.subvol)
ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
if (inum.subvol) {
ret = bch2_inum_to_path(trans, inum, out);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
}
if (!inum.subvol || ret)
prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
prt_printf(out, " offset %llu: ", offset);
return trans_was_restarted(trans, restart_count);
}
int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
subvol_inum inum, u64 offset)
{
int ret = bch2_inum_err_msg_trans(trans, out, inum);
prt_printf(out, " offset %llu: ", offset);
return ret;
}
void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
{
bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
}
void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
subvol_inum inum, u64 offset)
{
bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
}
int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bpos pos)
{
struct bch_fs *c = trans->c;
int ret = 0;
if (!bch2_snapshot_is_leaf(c, pos.snapshot))
prt_str(out, "(multiple snapshots) ");
subvol_inum inum = {
.subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot),
.inum = pos.inode,
};
if (inum.subvol) {
ret = bch2_inum_to_path(trans, inum, out);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
}
if (!inum.subvol || ret)
prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot);
prt_printf(out, " offset %llu: ", pos.offset << 8);
return 0;
}
void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
struct bpos pos)
{
bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}

View File

@ -238,10 +238,11 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
_ret; \
})
int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
#endif /* _BCACHEFS_ERROR_H */

View File

@ -148,87 +148,97 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr
return cmp(a, b, priv);
}
static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
cmp_r_func_t cmp_func, const void *priv,
size_t l, size_t r)
{
return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
base1 + inorder_to_eytzinger1(r, n) * size,
cmp_func, priv);
}
static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
swap_r_func_t swap_func, const void *priv,
size_t l, size_t r)
{
do_swap(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
base1 + inorder_to_eytzinger1(r, n) * size,
size, swap_func, priv);
}
static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
cmp_r_func_t cmp_func,
swap_r_func_t swap_func,
const void *priv)
{
unsigned i, j, k;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
if (is_aligned(base1, size, 8))
swap_func = SWAP_WORDS_64;
else if (is_aligned(base1, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
for (i = n / 2; i >= 1; --i) {
/* Find the sift-down path all the way to the leaves. */
for (j = i; k = j * 2, k < n;)
j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
if (j * 2 == n)
j *= 2;
/* Backtrack to the correct location. */
while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
j /= 2;
/* Shift the element into its correct place. */
for (k = j; j != i;) {
j /= 2;
eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
}
}
/* sort */
for (i = n; i > 1; --i) {
eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
/* Find the sift-down path all the way to the leaves. */
for (j = 1; k = j * 2, k + 1 < i;)
j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
if (j * 2 + 1 == i)
j *= 2;
/* Backtrack to the correct location. */
while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
j /= 2;
/* Shift the element into its correct place. */
for (k = j; j > 1;) {
j /= 2;
eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
}
}
}
void eytzinger0_sort_r(void *base, size_t n, size_t size,
cmp_r_func_t cmp_func,
swap_r_func_t swap_func,
const void *priv)
{
int i, j, k;
void *base1 = base - size;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
if (is_aligned(base, size, 8))
swap_func = SWAP_WORDS_64;
else if (is_aligned(base, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
for (i = n / 2 - 1; i >= 0; --i) {
/* Find the sift-down path all the way to the leaves. */
for (j = i; k = j * 2 + 1, k + 1 < n;)
j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
if (j * 2 + 2 == n)
j = j * 2 + 1;
/* Backtrack to the correct location. */
while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
j = (j - 1) / 2;
/* Shift the element into its correct place. */
for (k = j; j != i;) {
j = (j - 1) / 2;
eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
}
}
/* sort */
for (i = n - 1; i > 0; --i) {
eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
/* Find the sift-down path all the way to the leaves. */
for (j = 0; k = j * 2 + 1, k + 1 < i;)
j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
if (j * 2 + 2 == i)
j = j * 2 + 1;
/* Backtrack to the correct location. */
while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
j = (j - 1) / 2;
/* Shift the element into its correct place. */
for (k = j; j;) {
j = (j - 1) / 2;
eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
}
}
return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
}
void eytzinger0_sort(void *base, size_t n, size_t size,

View File

@ -6,6 +6,7 @@
#include <linux/log2.h>
#ifdef EYTZINGER_DEBUG
#include <linux/bug.h>
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size)
return rounddown_pow_of_two(size + 1) - 1;
}
/*
* eytzinger1_next() and eytzinger1_prev() have the nice properties that
*
* eytzinger1_next(0) == eytzinger1_first())
* eytzinger1_prev(0) == eytzinger1_last())
*
* eytzinger1_prev(eytzinger1_first()) == 0
* eytzinger1_next(eytzinger1_last()) == 0
*/
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
EYTZINGER_BUG_ON(i > size);
EYTZINGER_BUG_ON(i == 0 || i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
i <<= __fls(size + 1) - __fls(i);
i <<= __fls(size) - __fls(i);
i >>= i > size;
} else {
i >>= ffz(i) + 1;
@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
EYTZINGER_BUG_ON(i > size);
EYTZINGER_BUG_ON(i == 0 || i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
i <<= __fls(size + 1) - __fls(i);
i <<= __fls(size) - __fls(i);
i -= 1;
i >>= i > size;
} else {
@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
#define eytzinger0_for_each_prev(_i, _size) \
for (unsigned (_i) = eytzinger0_last((_size)); \
(_i) != -1; \
(_i) = eytzinger0_prev((_i), (_size)))
/* return greatest node <= @search, or -1 if not found */
static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
void *base1 = base - size;
unsigned n = 1;
if (!nr)
return -1;
do {
i = n;
n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
/*
* @i was greater than @search, return previous node:
*
* if @i was leftmost/smallest element,
* eytzinger0_prev(eytzinger0_first())) returns -1, as expected
*/
return eytzinger0_prev(i, nr);
} else {
return i;
}
while (n <= nr)
n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
n >>= __ffs(n) + 1;
return n - 1;
}
/* return smallest node > @search, or -1 if not found */
static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
void *base1 = base - size;
unsigned n = 1;
/*
* if eytzinger0_find_le() returned -1 - no element was <= search - we
* want to return the first element; next/prev identities mean this works
* as expected
*
* similarly if find_le() returns last element, we should return -1;
* identities mean this all works out:
*/
return eytzinger0_next(idx, nr);
while (n <= nr)
n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
n >>= __ffs(n + 1) + 1;
return n - 1;
}
/* return smallest node >= @search, or -1 if not found */
static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
void *base1 = base - size;
unsigned n = 1;
if (idx < nr && !cmp(base + idx * size, search))
return idx;
return eytzinger0_next(idx, nr);
while (n <= nr)
n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
n >>= __ffs(n + 1) + 1;
return n - 1;
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
size_t _size = (size); \
void *_base1 = (void *)(base) - _size; \
const void *_search = (search); \
size_t _nr = (nr); \
size_t _size = (size); \
size_t _i = 0; \
size_t _i = 1; \
int _res; \
\
while (_i < _nr && \
(_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
while (_i <= _nr && \
(_res = _cmp(_search, _base1 + _i * _size))) \
_i = eytzinger1_child(_i, _res > 0); \
_i - 1; \
})
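A worked trace of the new 1-based search (standalone sketch; the array contents and demo_cmp are invented for the example and are not part of the patch):
/* {1,3,5,7,9,11,13} laid out in 0-based eytzinger (BFS) order: */
static const unsigned demo_ey[] = { 7, 3, 11, 1, 5, 9, 13 };

static int demo_cmp(const void *a, const void *b)
{
	unsigned l = *(const unsigned *) a, r = *(const unsigned *) b;

	return l < r ? -1 : l > r;
}

/*
 * eytzinger0_find_le(demo_ey, 7, sizeof(demo_ey[0]), demo_cmp, &(unsigned){ 6 })
 * walks 1-based nodes 1 -> 2 -> 5 (elements 7, 3, 5), taking the right child
 * whenever the element is <= 6, and stops at n = 11 (binary 1011).  The final
 * n >>= __ffs(n) + 1 drops the trailing left-turn bits plus the last right-turn
 * bit, landing back on the node where that right turn was taken: n = 5, i.e.
 * 0-based index 4, the value 5, the greatest element <= 6.  If no right turn
 * is ever taken, n collapses to 0 and the result is -1.
 */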
void eytzinger0_sort_r(void *, size_t, size_t,

View File

@ -47,6 +47,10 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
/* Inherit casefold state from parent. */
if (S_ISDIR(mode))
new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded;
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
@ -152,18 +156,15 @@ int bch2_create_trans(struct btree_trans *trans,
if (is_subdir_for_nlink(new_inode))
dir_u->bi_nlink++;
dir_u->bi_mtime = dir_u->bi_ctime = now;
dir_u->bi_size += dirent_occupied_size(name);
ret = bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
ret = bch2_dirent_create(trans, dir, &dir_hash,
dir_type,
name,
dir_target,
&dir_offset,
STR_HASH_must_create|BTREE_ITER_with_updates);
ret = bch2_dirent_create(trans, dir, &dir_hash,
dir_type,
name,
dir_target,
&dir_offset,
&dir_u->bi_size,
STR_HASH_must_create|BTREE_ITER_with_updates) ?:
bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
@ -221,13 +222,14 @@ int bch2_link_trans(struct btree_trans *trans,
}
dir_u->bi_mtime = dir_u->bi_ctime = now;
dir_u->bi_size += dirent_occupied_size(name);
dir_hash = bch2_hash_info_init(c, dir_u);
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum.inum, &dir_offset,
name, inum.inum,
&dir_offset,
&dir_u->bi_size,
STR_HASH_must_create);
if (ret)
goto err;
@ -266,8 +268,16 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_intent);
struct bkey_s_c dirent_k =
bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
&dir_hash, dir, name, BTREE_ITER_intent);
ret = bkey_err(dirent_k);
if (ret)
goto err;
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(dirent_k), &inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
goto err;
@ -324,7 +334,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
dir_u->bi_size -= dirent_occupied_size(name);
dir_u->bi_size -= bkey_bytes(dirent_k.k);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
@ -420,8 +430,8 @@ int bch2_rename_trans(struct btree_trans *trans,
}
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
src_dir, &src_hash, &src_dir_u->bi_size,
dst_dir, &dst_hash, &dst_dir_u->bi_size,
src_name, &src_inum, &src_offset,
dst_name, &dst_inum, &dst_offset,
mode);
@ -463,14 +473,6 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
if (mode == BCH_RENAME) {
src_dir_u->bi_size -= dirent_occupied_size(src_name);
dst_dir_u->bi_size += dirent_occupied_size(dst_name);
}
if (mode == BCH_RENAME_OVERWRITE)
src_dir_u->bi_size -= dirent_occupied_size(src_name);
if (src_inode_u->bi_parent_subvol)
src_inode_u->bi_parent_subvol = dst_dir.subvol;

View File

@ -110,11 +110,18 @@ static int readpage_bio_extend(struct btree_trans *trans,
if (!get_more)
break;
unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
break;
unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
folio = xa_load(&iter->mapping->i_pages, folio_offset);
if (folio && !xa_is_value(folio))
break;
folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
if (!folio)
break;
@ -230,7 +237,8 @@ err:
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
prt_printf(&buf, "read error %i from btree lookup", ret);
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);

View File

@ -54,6 +54,31 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
(newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
if ((newflags ^ oldflags) & BCH_INODE_casefolded) {
#ifdef CONFIG_UNICODE
int ret = 0;
/* Not supported on individual files. */
if (!S_ISDIR(bi->bi_mode))
return -EOPNOTSUPP;
/*
* Make sure the dir is empty, as otherwise we'd need to
* rehash everything and update the dirent keys.
*/
ret = bch2_empty_dir_trans(trans, inode_inum(inode));
if (ret < 0)
return ret;
if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding))
return -EOPNOTSUPP;
bch2_check_set_feature(c, BCH_FEATURE_casefolding);
#else
printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
return -EOPNOTSUPP;
#endif
}
if (s->set_projinherit) {
bi->bi_fields_set &= ~(1 << Inode_opt_project);
bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);

View File

@ -6,19 +6,21 @@
/* bcachefs inode flags -> vfs inode flags: */
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
[__BCH_INODE_sync] = S_SYNC,
[__BCH_INODE_immutable] = S_IMMUTABLE,
[__BCH_INODE_append] = S_APPEND,
[__BCH_INODE_noatime] = S_NOATIME,
[__BCH_INODE_sync] = S_SYNC,
[__BCH_INODE_immutable] = S_IMMUTABLE,
[__BCH_INODE_append] = S_APPEND,
[__BCH_INODE_noatime] = S_NOATIME,
[__BCH_INODE_casefolded] = S_CASEFOLD,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
[__BCH_INODE_sync] = FS_SYNC_FL,
[__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
[__BCH_INODE_append] = FS_APPEND_FL,
[__BCH_INODE_nodump] = FS_NODUMP_FL,
[__BCH_INODE_noatime] = FS_NOATIME_FL,
[__BCH_INODE_sync] = FS_SYNC_FL,
[__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
[__BCH_INODE_append] = FS_APPEND_FL,
[__BCH_INODE_nodump] = FS_NODUMP_FL,
[__BCH_INODE_noatime] = FS_NOATIME_FL,
[__BCH_INODE_casefolded] = FS_CASEFOLD_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */

View File

@ -698,6 +698,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
if (IS_ERR(inode))
inode = NULL;
#ifdef CONFIG_UNICODE
if (!inode && IS_CASEFOLDED(vdir)) {
/*
* Do not cache a negative dentry in casefolded directories
* as it would need to be invalidated in the following situation:
* - Lookup file "blAH" in a casefolded directory
* - Creation of file "BLAH" in a casefolded directory
* - Lookup file "blAH" in a casefolded directory
* which would fail if we had a negative dentry.
*
* We should come back to this when VFS has a method to handle
* this edgecase.
*/
return NULL;
}
#endif
return d_splice_alias(&inode->v, dentry);
}
@ -1802,7 +1819,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans,
break;
}
mapping_set_large_folios(inode->v.i_mapping);
mapping_set_folio_min_order(inode->v.i_mapping,
get_order(trans->c->opts.block_size));
}
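With a block size larger than the page size, the page cache has to hand out folios of at least one block; a minimal stand-in for that order calculation (get_order() is the real helper, demo_folio_order() below is invented for illustration):
static unsigned demo_folio_order(unsigned block_size, unsigned page_size)
{
	unsigned pages = (block_size + page_size - 1) / page_size;
	unsigned order = 0;

	while ((1U << order) < pages)
		order++;

	/* e.g. 4K blocks on 4K pages -> order 0, 16K blocks -> order 2 */
	return order;
}
The readpage_bio_extend() hunk earlier applies the same idea, only extending readahead while at least one minimum-order folio's worth of sectors remains and sizing each allocated folio from the remaining sector count.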
static void bch2_free_inode(struct inode *vinode)

View File

@ -823,6 +823,7 @@ struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
u64 count;
u64 i_size;
};
struct inode_walker {
@ -910,8 +911,9 @@ found:
if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
new.snapshot = k.k->p.snapshot;
new.count = 0;
new.snapshot = k.k->p.snapshot;
new.count = 0;
new.i_size = 0;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
@ -1116,37 +1118,6 @@ err:
return ret;
}
static int check_directory_size(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
struct bkey_s_c inode_k, bool *write_inode)
{
struct btree_iter iter;
struct bkey_s_c k;
u64 new_size = 0;
int ret;
for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
POS(inode_k.k->p.offset, U64_MAX),
0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
struct qstr name = bch2_dirent_get_name(dirent);
new_size += dirent_occupied_size(&name);
}
bch2_trans_iter_exit(trans, &iter);
if (!ret && inode_u->bi_size != new_size) {
inode_u->bi_size = new_size;
*write_inode = true;
}
return ret;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_journal_seq = journal_cur_seq(&c->journal);
do_update = true;
}
if (S_ISDIR(u.bi_mode)) {
ret = check_directory_size(trans, &u, k, &do_update);
fsck_err_on(ret,
trans, directory_size_mismatch,
"directory inode %llu:%u with the mismatch directory size",
u.bi_inum, k.k->p.snapshot);
ret = 0;
}
do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u);
@ -2017,10 +1978,31 @@ fsck_err:
return ret;
}
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
int ret = 0;
darray_for_each(w->inodes, i)
if (fsck_err_on(i->inode.bi_size != i->i_size,
trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_size: got %llu, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) {
i->inode.bi_size = i->i_size;
ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
fsck_err:
bch_err_fn(c, ret);
return ret;
}
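In this scheme a directory's bi_size is simply the running sum of bkey_bytes() over its dirents, so create, rename and unlink adjust it by the size of the key they add or drop, and fsck recomputes it per snapshot as above; a toy version of that recomputation (names invented for the sketch):
static unsigned long long demo_dir_i_size(const unsigned *dirent_key_bytes, unsigned nr)
{
	unsigned long long size = 0;

	for (unsigned i = 0; i < nr; i++)
		size += dirent_key_bytes[i];	/* bkey_bytes(dirent.k) per entry */

	return size;
}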
static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}
@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
ret = check_subdir_count(trans, dir);
ret = check_subdir_dirents_count(trans, dir);
if (ret)
goto err;
}
@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
if (d.v->d_type == DT_DIR)
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
if (d.v->d_type == DT_DIR)
i->count++;
i->i_size += bkey_bytes(d.k);
}
out:
err:
fsck_err:

View File

@ -137,7 +137,8 @@ enum inode_opt_id {
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8) \
x(has_child_snapshot, 9)
x(has_child_snapshot, 9) \
x(casefolded, 10)
/* bits 20+ reserved for packed fields below: */

View File

@ -115,7 +115,8 @@ err:
bch2_increment_clock(c, sectors_allocated, WRITE);
if (should_print_err(ret)) {
struct printbuf buf = PRINTBUF;
bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);

View File

@ -181,7 +181,7 @@ static noinline void promote_start(struct bch_read_bio *rbio)
{
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
trace_and_count(op->write.op.c, read_promote, &rbio->bio);
trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
INIT_WORK(&op->work, promote_start_work);
queue_work(rbio->c->write_ref_wq, &op->work);
@ -243,7 +243,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
orig->opts,
&orig->opts,
update_opts,
btree_id, k);
/*
@ -320,7 +320,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
*read_full = promote_full;
return promote;
nopromote:
trace_read_nopromote(c, ret);
trace_io_read_nopromote(c, ret);
return NULL;
}
@ -329,9 +329,10 @@ nopromote:
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_read_bio *rbio, struct bpos read_pos)
{
return bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { rbio->subvol, read_pos.inode },
read_pos.offset << 9);
return lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { rbio->subvol, read_pos.inode },
read_pos.offset << 9));
}
static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@ -463,7 +464,9 @@ static void bch2_rbio_retry(struct work_struct *work)
};
struct bch_io_failures failed = { .nr = 0 };
trace_and_count(c, read_retry, &rbio->bio);
trace_io_read_retry(&rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
bvec_iter_sectors(rbio->bvec_iter));
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
@ -802,7 +805,7 @@ static void bch2_read_endio(struct bio *bio)
if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
(ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
trace_and_count(c, read_reuse_race, &rbio->bio);
trace_and_count(c, io_read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_retry_if_stale)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
@ -891,6 +894,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
swap(iter.bi_size, bytes);
bio_advance_iter(&orig->bio, &iter, bytes);
zero_fill_bio_iter(&orig->bio, iter);
this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
bvec_iter_sectors(iter));
goto out_read_done;
}
retry_pick:
@ -979,6 +984,7 @@ retry_pick:
*/
struct data_update *u = container_of(orig, struct data_update, rbio);
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
BUG();
if (ca)
percpu_ref_put(&ca->io_ref);
goto hole;
@ -1077,10 +1083,12 @@ retry_pick:
rbio->bio.bi_end_io = bch2_read_endio;
if (rbio->bounce)
trace_and_count(c, read_bounce, &rbio->bio);
trace_and_count(c, io_read_bounce, &rbio->bio);
if (!(flags & BCH_READ_data_update))
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
else
this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
/*
@ -1093,7 +1101,7 @@ retry_pick:
if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
bio_inc_remaining(&orig->bio);
trace_and_count(c, read_split, &orig->bio);
trace_and_count(c, io_read_split, &orig->bio);
}
if (!rbio->pick.idx) {
@ -1170,6 +1178,8 @@ err:
goto out_read_done;
hole:
this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
bvec_iter_sectors(iter));
/*
* won't normally happen in the BCH_READ_data_update
* (bch2_move_extent()) path, but if we retry and the extent we wanted
@ -1270,7 +1280,9 @@ err:
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum,
bvec_iter.bi_sector << 9));
prt_printf(&buf, "read error %i from btree lookup", ret);
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);

View File

@ -396,19 +396,61 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
u64 offset)
void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
bch2_inum_offset_err_msg(op->c, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
prt_printf(out, "write error%s: ",
op->flags & BCH_WRITE_move ? "(internal move)" : "");
if (op->subvol)
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9));
else {
struct bpos pos = op->pos;
pos.offset = offset;
lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}
prt_str(out, "write error: ");
va_list args;
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op);
prt_printf(out, "\n from internal move ");
bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
}
}
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset,
const char *fmt, ...)
{
__bch2_write_op_error(out, op, op->pos.offset);
if (op->subvol)
bch2_inum_offset_err_msg(op->c, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
else {
struct bpos pos = op->pos;
pos.offset = offset;
bch2_inum_snap_offset_err_msg(op->c, out, pos);
}
prt_str(out, "write error: ");
va_list args;
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op);
prt_printf(out, "\n from internal move ");
bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
}
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@ -551,8 +593,8 @@ static void __bch2_write_index(struct bch_write_op *op)
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
struct printbuf buf = PRINTBUF;
__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k),
"btree update error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@ -1104,8 +1146,8 @@ do_write:
csum_err:
{
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
bch2_write_op_error(&buf, op, op->pos.offset,
"error verifying existing checksum while rewriting existing data (memory corruption?)");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@ -1201,8 +1243,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
struct printbuf buf = PRINTBUF;
__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k),
"btree update error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@ -1369,8 +1411,8 @@ err:
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
bch2_write_op_error(&buf, op, op->pos.offset,
"%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
op->error = ret;
@ -1492,8 +1534,8 @@ err:
if (unlikely(ret < 0)) {
if (!(op->flags & BCH_WRITE_alloc_nowait)) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
bch2_write_op_error(&buf, op, op->pos.offset,
"%s(): %s", __func__, bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@ -1624,8 +1666,8 @@ CLOSURE_CALLBACK(bch2_write)
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "misaligned write");
bch2_write_op_error(&buf, op, op->pos.offset,
"misaligned write");
printbuf_exit(&buf);
op->error = -EIO;
goto err;

View File

@ -20,7 +20,13 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
__printf(5, 6)
void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_write_op *op, u64, const char *, ...);
__printf(4, 5)
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64,
const char *, ...);
#define BCH_WRITE_FLAGS() \
x(alloc_nowait) \

View File

@ -56,11 +56,18 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);
prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
if (!buf->write_started)
prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
prt_printf(out, "size:\t");
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
struct closure *cl = &buf->io;
int r = atomic_read(&cl->remaining);
prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
if (buf->data) {
prt_printf(out, "size:\t");
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
}
prt_printf(out, "expires:\t");
prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
@ -87,6 +94,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
lockdep_assert_held(&j->lock);
out->atomic++;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
@ -95,6 +105,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq++)
bch2_journal_buf_to_text(out, j, seq);
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
--out->atomic;
}
static inline struct journal_buf *
@ -104,10 +116,8 @@ journal_seq_to_buf(struct journal *j, u64 seq)
EBUG_ON(seq > journal_cur_seq(j));
if (journal_seq_unwritten(j, seq)) {
if (journal_seq_unwritten(j, seq))
buf = j->buf + (seq & JOURNAL_BUF_MASK);
EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
}
return buf;
}
@ -195,7 +205,8 @@ void bch2_journal_do_writes(struct journal *j)
if (w->write_started)
continue;
if (!journal_state_count(j->reservations, idx)) {
if (!journal_state_seq_count(j, j->reservations, seq)) {
j->seq_write_started = seq;
w->write_started = true;
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
@ -391,6 +402,9 @@ static int journal_entry_open(struct journal *j)
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
return JOURNAL_ERR_max_in_flight;
if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
return JOURNAL_ERR_max_open;
if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
if (bch2_fs_emergency_read_only_locked(c))
@ -398,8 +412,16 @@ static int journal_entry_open(struct journal *j)
return JOURNAL_ERR_insufficient_devices; /* -EROFS */
}
if (!j->free_buf && !buf->data)
return JOURNAL_ERR_enomem; /* will retry after write completion frees up a buf */
BUG_ON(!j->cur_entry_sectors);
if (!buf->data) {
swap(buf->data, j->free_buf);
swap(buf->buf_size, j->free_buf_size);
}
buf->expires =
(journal_cur_seq(j) == j->flushed_seq_ondisk
? jiffies
@ -464,7 +486,7 @@ static int journal_entry_open(struct journal *j)
new.idx++;
BUG_ON(journal_state_count(new, new.idx));
BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
journal_state_inc(&new);
@ -514,6 +536,33 @@ static void journal_write_work(struct work_struct *work)
spin_unlock(&j->lock);
}
static void journal_buf_prealloc(struct journal *j)
{
if (j->free_buf &&
j->free_buf_size >= j->buf_size_want)
return;
unsigned buf_size = j->buf_size_want;
spin_unlock(&j->lock);
void *buf = kvmalloc(buf_size, GFP_NOFS);
spin_lock(&j->lock);
if (buf &&
(!j->free_buf ||
buf_size > j->free_buf_size)) {
swap(buf, j->free_buf);
swap(buf_size, j->free_buf_size);
}
if (unlikely(buf)) {
spin_unlock(&j->lock);
/* kvfree can sleep */
kvfree(buf);
spin_lock(&j->lock);
}
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
@ -544,6 +593,8 @@ retry:
spin_lock(&j->lock);
journal_buf_prealloc(j);
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
@ -571,20 +622,43 @@ unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
out:
if (likely(!ret))
return 0;
if (ret == JOURNAL_ERR_retry)
goto retry;
if (!ret)
return 0;
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
if (ret == JOURNAL_ERR_max_in_flight &&
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_printbuf_make_room(&buf, 4096);
spin_lock(&j->lock);
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
spin_unlock(&j->lock);
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
}
if (ret == JOURNAL_ERR_max_open &&
track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_printbuf_make_room(&buf, 4096);
spin_lock(&j->lock);
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
spin_unlock(&j->lock);
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
@ -907,7 +981,7 @@ int bch2_journal_meta(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
return -EROFS;
return -BCH_ERR_erofs_no_writes;
int ret = __bch2_journal_meta(j);
bch2_write_ref_put(c, BCH_WRITE_REF_journal);
@ -951,7 +1025,8 @@ static void __bch2_journal_block(struct journal *j)
new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
} while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
}
}
@ -992,7 +1067,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
*blocked = true;
}
ret = journal_state_count(s, idx) > open
ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
? ERR_PTR(-EAGAIN)
: buf;
break;
@ -1342,6 +1417,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
j->seq_write_started = cur_seq - 1;
j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
@ -1474,6 +1550,7 @@ void bch2_fs_journal_exit(struct journal *j)
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
kvfree(j->buf[i].data);
kvfree(j->free_buf);
free_fifo(&j->pin);
}
@ -1500,13 +1577,13 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
if (!j->free_buf)
return -BCH_ERR_ENOMEM_journal_buf;
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
@ -1556,6 +1633,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "average write size:\t");
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
prt_newline(out);
prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);

View File

@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j)
closure_wake_up(&j->async_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
return j->buf + j->reservations.idx;
}
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j)
return j->seq_ondisk + 1;
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
unsigned idx = (journal_cur_seq(j) &
JOURNAL_BUF_MASK &
~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
return j->buf + idx;
}
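A small sketch of the new buffer indexing (assuming JOURNAL_BUF_NR = 16 and JOURNAL_STATE_BUF_NR = 4 as in journal_types.h below; the demo name and numbers are illustrative only):
static unsigned demo_journal_buf_idx(unsigned long long cur_seq, unsigned state_idx)
{
	const unsigned buf_mask   = 16 - 1;	/* JOURNAL_BUF_MASK */
	const unsigned state_mask =  4 - 1;	/* JOURNAL_STATE_BUF_MASK */

	/*
	 * The high bits of the sequence select a group of four buffers, the
	 * two-bit reservation index selects the slot within the group:
	 * seq 42 with state idx 2 -> buffer 10 (== 42 & buf_mask).
	 */
	return (cur_seq & buf_mask & ~state_mask) + state_idx;
}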
static inline int journal_state_count(union journal_res_state s, int idx)
{
switch (idx) {
@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx)
BUG();
}
static inline int journal_state_seq_count(struct journal *j,
union journal_res_state s, u64 seq)
{
if (journal_cur_seq(j) - seq <= JOURNAL_STATE_BUF_NR)
return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
else
return 0;
}
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
@ -269,7 +282,7 @@ void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
{
unsigned idx = seq & JOURNAL_BUF_MASK;
unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;
s = journal_state_buf_put(j, idx);
@ -279,7 +292,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
{
unsigned idx = seq & JOURNAL_BUF_MASK;
unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;
s = journal_state_buf_put(j, idx);
@ -365,9 +378,7 @@ static inline int journal_res_get_fast(struct journal *j,
res->ref = true;
res->offset = old.cur_entry_offset;
res->seq = journal_cur_seq(j);
res->seq -= (res->seq - old.idx) & JOURNAL_BUF_MASK;
EBUG_ON(res->seq != le64_to_cpu(j->buf[old.idx].data->seq));
res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
return 1;
}
@ -394,6 +405,7 @@ out:
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,
NULL, _THIS_IP_);
EBUG_ON(!res->ref);
BUG_ON(!res->seq);
}
return 0;
}

View File

@ -1515,7 +1515,7 @@ static void __journal_write_alloc(struct journal *j,
* @j: journal object
* @w: journal buf (entry to be written)
*
* Returns: 0 on success, or -EROFS on failure
* Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure
*/
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
@ -1624,8 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done)
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
if (bch2_mark_replicas(c, &replicas.e))
err = -EIO;
err = bch2_mark_replicas(c, &replicas.e);
}
if (err)
@ -1640,6 +1639,21 @@ static CLOSURE_CALLBACK(journal_write_done)
j->err_seq = seq;
w->write_done = true;
if (!j->free_buf || j->free_buf_size < w->buf_size) {
swap(j->free_buf, w->data);
swap(j->free_buf_size, w->buf_size);
}
if (w->data) {
void *buf = w->data;
w->data = NULL;
w->buf_size = 0;
spin_unlock(&j->lock);
kvfree(buf);
spin_lock(&j->lock);
}
bool completed = false;
for (seq = journal_last_unwritten_seq(j);
@ -1649,7 +1663,7 @@ static CLOSURE_CALLBACK(journal_write_done)
if (!w->write_done)
break;
if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
if (!j->err_seq && !w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
@ -1973,7 +1987,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* write anything at all.
*/
if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
return -EIO;
return error;
if (error ||
w->noflush ||

View File

@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
fn == bch2_btree_node_flush1)
return JOURNAL_PIN_TYPE_btree;
else if (fn == bch2_btree_key_cache_journal_flush)
fn == bch2_btree_node_flush1) {
unsigned idx = fn == bch2_btree_node_flush1;
struct btree *b = container_of(pin, struct btree, writes[idx].journal);
return JOURNAL_PIN_TYPE_btree0 - b->c.level;
} else if (fn == bch2_btree_key_cache_journal_flush)
return JOURNAL_PIN_TYPE_key_cache;
else
return JOURNAL_PIN_TYPE_other;
@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j,
bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
BIT(JOURNAL_PIN_TYPE_key_cache)|
BIT(JOURNAL_PIN_TYPE_other))) {
*did_work = true;
goto unlock;
}
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
BIT(JOURNAL_PIN_TYPE_btree))) {
*did_work = true;
goto unlock;
}
for (int type = JOURNAL_PIN_TYPE_NR - 1;
type >= 0;
--type)
if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
*did_work = true;
goto unlock;
}
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);

View File

@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
unsigned i;
for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
src = bl->start;
eytzinger0_for_each(i, nr) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
src++;
}
unsigned new_nr = dst - bl->start;

View File

@ -12,7 +12,11 @@
/* btree write buffer steals 8 bits for its own purposes: */
#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
#define JOURNAL_BUF_BITS 2
#define JOURNAL_STATE_BUF_BITS 2
#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
#define JOURNAL_BUF_BITS 4
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
@ -53,7 +57,10 @@ struct journal_buf {
*/
enum journal_pin_type {
JOURNAL_PIN_TYPE_btree,
JOURNAL_PIN_TYPE_btree3,
JOURNAL_PIN_TYPE_btree2,
JOURNAL_PIN_TYPE_btree1,
JOURNAL_PIN_TYPE_btree0,
JOURNAL_PIN_TYPE_key_cache,
JOURNAL_PIN_TYPE_other,
JOURNAL_PIN_TYPE_NR,
@ -150,9 +157,11 @@ enum journal_flags {
x(retry) \
x(blocked) \
x(max_in_flight) \
x(max_open) \
x(journal_full) \
x(journal_pin_full) \
x(journal_stuck) \
x(enomem) \
x(insufficient_devices)
enum journal_errors {
@ -215,6 +224,8 @@ struct journal {
* other is possibly being written out.
*/
struct journal_buf buf[JOURNAL_BUF_NR];
void *free_buf;
unsigned free_buf_size;
spinlock_t lock;
@ -232,6 +243,7 @@ struct journal {
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
u64 seq_write_started;
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;

View File

@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
}
int bch2_lru_change(struct btree_trans *trans,
u16 lru_id, u64 dev_bucket,
u64 old_time, u64 new_time)
int __bch2_lru_change(struct btree_trans *trans,
u16 lru_id, u64 dev_bucket,
u64 old_time, u64 new_time)
{
if (old_time == new_time)
return 0;
@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = {
};
int bch2_lru_check_set(struct btree_trans *trans,
u16 lru_id, u64 time,
u16 lru_id,
u64 dev_bucket,
u64 time,
struct bkey_s_c referring_k,
struct bkey_buf *last_flushed)
{
@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
struct btree_iter lru_iter;
struct bkey_s_c lru_k =
bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
lru_pos(lru_id,
bucket_to_u64(referring_k.k->p),
time), 0);
lru_pos(lru_id, dev_bucket, time), 0);
int ret = bkey_err(lru_k);
if (ret)
return ret;
@ -104,7 +105,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
" %s",
bch2_lru_types[lru_type(lru_k)],
(bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
if (ret)
goto err;
}
@ -116,49 +117,73 @@ fsck_err:
return ret;
}
static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
{
enum bch_lru_type type = lru_type(lru_k);
switch (type) {
case BCH_LRU_read:
case BCH_LRU_fragmentation:
return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
case BCH_LRU_stripes:
return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
default:
BUG();
}
}
static u64 bkey_lru_type_idx(struct bch_fs *c,
enum bch_lru_type type,
struct bkey_s_c k)
{
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
switch (type) {
case BCH_LRU_read:
a = bch2_alloc_to_v4(k, &a_convert);
return alloc_lru_idx_read(*a);
case BCH_LRU_fragmentation: {
a = bch2_alloc_to_v4(k, &a_convert);
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
u64 idx = ca
? alloc_lru_idx_fragmentation(*a, ca)
: 0;
rcu_read_unlock();
return idx;
}
case BCH_LRU_stripes:
return k.k->type == KEY_TYPE_stripe
? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
: 0;
default:
BUG();
}
}
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
enum bch_lru_type type = lru_type(lru_k);
struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
u64 idx;
int ret;
struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
struct bbpos bp = lru_pos_to_bp(lru_k);
if (fsck_err_on(!ca,
trans, lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
ret = bkey_err(k);
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
int ret = bkey_err(k);
if (ret)
goto err;
a = bch2_alloc_to_v4(k, &a_convert);
enum bch_lru_type type = lru_type(lru_k);
u64 idx = bkey_lru_type_idx(c, type, k);
switch (type) {
case BCH_LRU_read:
idx = alloc_lru_idx_read(*a);
break;
case BCH_LRU_fragmentation:
idx = alloc_lru_idx_fragmentation(*a, ca);
break;
}
if (lru_k.k->type != KEY_TYPE_set ||
lru_pos_time(lru_k.k->p) != idx) {
if (lru_pos_time(lru_k.k->p) != idx) {
ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
if (ret)
goto err;
@ -176,7 +201,6 @@ static int bch2_check_lru_key(struct btree_trans *trans,
err:
fsck_err:
bch2_trans_iter_exit(trans, &iter);
bch2_dev_put(ca);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;

View File

@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
if (lru_id == BCH_LRU_FRAGMENTATION_START)
switch (lru_id) {
case BCH_LRU_BUCKET_FRAGMENTATION:
return BCH_LRU_fragmentation;
return BCH_LRU_read;
case BCH_LRU_STRIPE_FRAGMENTATION:
return BCH_LRU_stripes;
default:
return BCH_LRU_read;
}
}
int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
static inline int bch2_lru_change(struct btree_trans *trans,
u16 lru_id, u64 dev_bucket,
u64 old_time, u64 new_time)
{
return old_time != new_time
? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
: 0;
}
struct bkey_buf;
int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
int bch2_check_lrus(struct bch_fs *);

View File

@ -9,7 +9,8 @@ struct bch_lru {
#define BCH_LRU_TYPES() \
x(read) \
x(fragmentation)
x(fragmentation) \
x(stripes)
enum bch_lru_type {
#define x(n) BCH_LRU_##n,
@ -17,7 +18,8 @@ enum bch_lru_type {
#undef x
};
#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)

View File

@ -15,6 +15,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
#include "progress.h"
#include "replicas.h"
#include "super-io.h"
@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
static int bch2_dev_usrdata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
unsigned dev_idx, int flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
}));
if (ret)
break;
}
@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
static int bch2_dev_metadata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
unsigned dev_idx, int flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@ -125,6 +132,8 @@ retry:
while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
@ -169,6 +178,11 @@ err:
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
bch2_dev_metadata_drop(c, dev_idx, flags);
struct progress_indicator_state progress;
bch2_progress_init(&progress, c,
BIT_ULL(BTREE_ID_extents)|
BIT_ULL(BTREE_ID_reflink));
return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
}

View File

@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
if (trace_io_move_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
trace_io_move(c, buf.buf);
printbuf_exit(&buf);
}
}
static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
if (trace_move_extent_read_enabled()) {
if (trace_io_move_read_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
trace_move_extent_read(c, buf.buf);
trace_io_move_read(c, buf.buf);
printbuf_exit(&buf);
}
}
@ -89,7 +89,12 @@ static void move_free(struct moving_io *io)
wake_up(&ctxt->wait);
mutex_unlock(&ctxt->lock);
bch2_data_update_exit(&io->write);
if (!io->write.data_opts.scrub) {
bch2_data_update_exit(&io->write);
} else {
bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
kfree(io->write.bvecs);
}
kfree(io);
}
@ -127,12 +132,12 @@ static void move_write(struct moving_io *io)
return;
}
if (trace_move_extent_write_enabled()) {
if (trace_io_move_write_enabled()) {
struct bch_fs *c = io->write.op.c;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
trace_move_extent_write(c, buf.buf);
trace_io_move_write(c, buf.buf);
printbuf_exit(&buf);
}
@ -268,7 +273,8 @@ int bch2_move_extent(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
int ret = -ENOMEM;
trace_move_extent2(c, k, &io_opts, &data_opts);
trace_io_move2(c, k, &io_opts, &data_opts);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
@ -300,15 +306,21 @@ int bch2_move_extent(struct moving_context *ctxt,
if (!data_opts.scrub) {
ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
io_opts, data_opts, iter->btree_id, k);
&io_opts, data_opts, iter->btree_id, k);
if (ret)
goto err_free;
io->write.op.end_io = move_write_done;
} else {
bch2_bkey_buf_init(&io->write.k);
bch2_bkey_buf_reassemble(&io->write.k, c, k);
io->write.op.c = c;
io->write.data_opts = data_opts;
ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
if (ret)
goto err_free;
}
io->write.rbio.bio.bi_end_io = move_read_endio;
@ -327,9 +339,7 @@ int bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&io->b->count);
}
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
trace_move_extent_read2(c, k);
trace_io_move_read2(c, k);
mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
@ -363,15 +373,15 @@ err:
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
count_event(c, move_extent_start_fail);
count_event(c, io_move_start_fail);
if (trace_move_extent_start_fail_enabled()) {
if (trace_io_move_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, ": ");
prt_str(&buf, bch2_err_str(ret));
trace_move_extent_start_fail(c, buf.buf);
trace_io_move_start_fail(c, buf.buf);
printbuf_exit(&buf);
}
return ret;
@ -764,6 +774,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (!(data_types & BIT(bp.v->data_type)))
goto next;
if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
goto next;
k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -849,6 +862,7 @@ static int bch2_move_data_phys(struct bch_fs *c,
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
ctxt.stats->phys = true;
ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
@ -1038,14 +1052,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
static bool migrate_btree_pred(struct bch_fs *c, void *arg,
struct btree *b,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
/*
* Ancient versions of bcachefs produced packed formats which could represent
* keys that the in memory format cannot represent; this checks for those
@ -1174,6 +1180,12 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_scrub:
/*
* prevent tests from spuriously failing, make sure we see all
* btree nodes that need to be repaired
*/
bch2_btree_interior_updates_flush(c);
ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
op.scrub.data_types,
NULL,
@ -1202,14 +1214,14 @@ int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
~0,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_rewrite_old_nodes:

View File

@ -168,8 +168,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
bch2_trans_begin(trans);
ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
0, k, ({
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
int ret2 = 0;

View File

@ -197,7 +197,7 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \

libbcachefs/progress.c Normal file
View File

@ -0,0 +1,63 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bbpos.h"
#include "disk_accounting.h"
#include "progress.h"
void bch2_progress_init(struct progress_indicator_state *s,
struct bch_fs *c,
u64 btree_id_mask)
{
memset(s, 0, sizeof(*s));
s->next_print = jiffies + HZ * 10;
for (unsigned i = 0; i < BTREE_ID_NR; i++) {
if (!(btree_id_mask & BIT_ULL(i)))
continue;
struct disk_accounting_pos acc = {
.type = BCH_DISK_ACCOUNTING_btree,
.btree.id = i,
};
u64 v;
bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
s->nodes_total += div64_ul(v, btree_sectors(c));
}
}
static inline bool progress_update_p(struct progress_indicator_state *s)
{
bool ret = time_after_eq(jiffies, s->next_print);
if (ret)
s->next_print = jiffies + HZ * 10;
return ret;
}
void bch2_progress_update_iter(struct btree_trans *trans,
struct progress_indicator_state *s,
struct btree_iter *iter,
const char *msg)
{
struct bch_fs *c = trans->c;
struct btree *b = path_l(btree_iter_path(trans, iter))->b;
s->nodes_seen += b != s->last_node;
s->last_node = b;
if (progress_update_p(s)) {
struct printbuf buf = PRINTBUF;
unsigned percent = s->nodes_total
? div64_u64(s->nodes_seen * 100, s->nodes_total)
: 0;
prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
msg, percent, s->nodes_seen, s->nodes_total);
bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
bch_info(c, "%s", buf.buf);
printbuf_exit(&buf);
}
}

libbcachefs/progress.h Normal file
View File

@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_PROGRESS_H
#define _BCACHEFS_PROGRESS_H
/*
* Lame progress indicators
*
* We don't like to use these because they print to the dmesg console, which is
spammy - we much prefer to be wired up to a userspace program (e.g. via
* thread_with_file) and have it print the progress indicator.
*
* But some code is old and doesn't support that, or runs in a context where
* that's not yet practical (mount).
*/
struct progress_indicator_state {
unsigned long next_print;
u64 nodes_seen;
u64 nodes_total;
struct btree *last_node;
};
void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
void bch2_progress_update_iter(struct btree_trans *,
struct progress_indicator_state *,
struct btree_iter *,
const char *);
#endif /* _BCACHEFS_PROGRESS_H */
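
A quick illustration of how the two new calls fit together: bch2_progress_init() reads the per-btree sector counts from disk accounting for the btrees named in the bitmask and converts them to a node total, and bch2_progress_update_iter() then logs a percentage to dmesg (at most every ten seconds) as an iterator walks those btrees. The sketch below is not part of this patch; it simply mirrors the bch2_dev_usrdata_drop() hunk above, and example_walk_extents() with its empty per-key body is a hypothetical caller.

/*
 * Sketch only: a hypothetical caller wiring the new progress API into a
 * btree walk, modelled on the migrate.c changes in this commit.
 */
#include "bcachefs.h"
#include "btree_update.h"
#include "progress.h"

static int example_walk_extents(struct bch_fs *c)
{
	struct progress_indicator_state progress;
	struct btree_trans *trans = bch2_trans_get(c);

	/* Mask of the btrees this walk will visit, so the node totals add up */
	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

	int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN,
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		/* prints "example walk: N%, done x/y nodes, at ..." to the log */
		bch2_progress_update_iter(trans, &progress, &iter, "example walk");
		0;	/* per-key work would go here */
	}));

	bch2_trans_put(trans);
	return ret;
}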

View File

@ -172,7 +172,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
bool should_commit)
{
if (REFLINK_P_ERROR(p.v))
return -BCH_ERR_missing_indirect_extent;
return 0;
struct bch_fs *c = trans->c;
u64 live_start = REFLINK_P_IDX(p.v);
@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
BUG_ON(missing_start < refd_start);
BUG_ON(missing_end > refd_end);
if (fsck_err(trans, reflink_p_to_missing_reflink_v,
"pointer to missing indirect extent\n"
" %s\n"
" missing range %llu-%llu",
(bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
missing_start, missing_end)) {
struct bpos missing_pos = bkey_start_pos(p.k);
missing_pos.offset += missing_start - live_start;
prt_printf(&buf, "pointer to missing indirect extent in ");
ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
if (ret)
goto err;
prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9);
bch2_bkey_val_to_text(&buf, c, p.s_c);
prt_printf(&buf, "\n missing reflink btree range %llu-%llu",
missing_start, missing_end);
if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
@ -259,8 +268,6 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
return k;
if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
bch2_trans_iter_exit(trans, iter);
unsigned size = min((u64) k.k->size,
REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
reflink_offset);
@ -268,14 +275,16 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
k.k->p.offset, should_commit);
if (ret)
if (ret) {
bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
}
} else if (unlikely(REFLINK_P_ERROR(p.v))) {
bch2_trans_iter_exit(trans, iter);
int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
if (ret)
if (ret) {
bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
}
}
*offset_into_extent = reflink_offset - bkey_start_offset(k.k);
@ -300,7 +309,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
if (ret)
return ret;
if (bkey_deleted(k.k)) {
if (!bkey_refcount_c(k)) {
if (!(flags & BTREE_TRIGGER_overwrite))
ret = -BCH_ERR_missing_indirect_extent;
goto next;

View File

@ -9,8 +9,20 @@ enum counters_flags {
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0, TYPE_SECTORS) \
x(io_read_inline, 80, TYPE_SECTORS) \
x(io_read_hole, 81, TYPE_SECTORS) \
x(io_read_promote, 30, TYPE_COUNTER) \
x(io_read_bounce, 31, TYPE_COUNTER) \
x(io_read_split, 33, TYPE_COUNTER) \
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
x(io_move_write, 36, TYPE_SECTORS) \
x(io_move_finish, 37, TYPE_SECTORS) \
x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
x(bucket_discard_fast, 79, TYPE_COUNTER) \
@ -39,16 +51,6 @@ enum counters_flags {
x(journal_reclaim_finish, 27, TYPE_COUNTER) \
x(journal_reclaim_start, 28, TYPE_COUNTER) \
x(journal_write, 29, TYPE_COUNTER) \
x(read_promote, 30, TYPE_COUNTER) \
x(read_bounce, 31, TYPE_COUNTER) \
x(read_split, 33, TYPE_COUNTER) \
x(read_retry, 32, TYPE_COUNTER) \
x(read_reuse_race, 34, TYPE_COUNTER) \
x(move_extent_read, 35, TYPE_SECTORS) \
x(move_extent_write, 36, TYPE_SECTORS) \
x(move_extent_finish, 37, TYPE_SECTORS) \
x(move_extent_fail, 38, TYPE_COUNTER) \
x(move_extent_start_fail, 39, TYPE_COUNTER) \
x(copygc, 40, TYPE_COUNTER) \
x(copygc_wait, 41, TYPE_COUNTER) \
x(gc_gens_end, 42, TYPE_COUNTER) \

View File

@ -92,8 +92,14 @@
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(directory_size, \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
BCH_FSCK_ERR_directory_size_mismatch) \
x(cached_backpointers, \
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
BCH_FSCK_ERR_ptr_to_missing_backpointer) \
x(stripe_backpointers, \
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
BCH_FSCK_ERR_ptr_to_missing_backpointer)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \

View File

@ -180,9 +180,9 @@ enum bch_fsck_flags {
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
x(reflink_p_to_missing_reflink_v, 166, 0) \
x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
x(reflink_refcount_underflow, 293, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \
@ -314,7 +314,9 @@ enum bch_fsck_flags {
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
x(MAX, 304, 0)
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
x(MAX, 306, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

View File

@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
goto out;
}
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
if (likely(ancestor >= IS_ANCESTOR_BITMAP))
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
ret = id && id < ancestor
? test_ancestor_bitmap(t, id, ancestor)
@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
return 0;
}
static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
u32 id = snapshot_root;
u32 subvol = 0, s;

View File

@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
return id;
}
u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32);
u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)

View File

@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
for (unsigned i = 0; i < 1000; i++) {
unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
old_name.len, old_name.name, i);
unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
if (u64s > U8_MAX)
return -EINVAL;

View File

@ -34,6 +34,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u8 type;
struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@ -47,6 +48,9 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
/* XXX ick */
struct bch_hash_info info = {
.type = INODE_STR_HASH(bi),
#ifdef CONFIG_UNICODE
.cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL,
#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};

View File

@ -387,12 +387,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
block_size = le16_to_cpu(sb->block_size);
if (block_size > PAGE_SECTORS) {
prt_printf(out, "Block size too big (got %u, max %u)",
block_size, PAGE_SECTORS);
return -BCH_ERR_invalid_sb_block_size;
}
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;

View File

@ -837,6 +837,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
#ifdef CONFIG_UNICODE
/* Default encoding until we can potentially have more as an option. */
c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
if (IS_ERR(c->cf_encoding)) {
printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
ret = -EINVAL;
goto err;
}
#else
if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
ret = -EINVAL;
goto err;
}
#endif
pr_uuid(&name, c->sb.user_uuid.b);
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)

View File

@ -146,6 +146,7 @@ write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
read_attribute(uuid);
@ -411,6 +412,9 @@ STORE(bch2_fs)
/* Debugging: */
if (attr == &sysfs_trigger_btree_updates)
queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
@ -580,6 +584,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_btree_updates,
&sysfs_gc_gens_pos,

View File

@ -295,12 +295,12 @@ TRACE_EVENT(write_super,
/* io.c: */
DEFINE_EVENT(bio, read_promote,
DEFINE_EVENT(bio, io_read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
TRACE_EVENT(read_nopromote,
TRACE_EVENT(io_read_nopromote,
TP_PROTO(struct bch_fs *c, int ret),
TP_ARGS(c, ret),
@ -319,22 +319,22 @@ TRACE_EVENT(read_nopromote,
__entry->ret)
);
DEFINE_EVENT(bio, read_bounce,
DEFINE_EVENT(bio, io_read_bounce,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
DEFINE_EVENT(bio, read_split,
DEFINE_EVENT(bio, io_read_split,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
DEFINE_EVENT(bio, read_retry,
DEFINE_EVENT(bio, io_read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
DEFINE_EVENT(bio, read_reuse_race,
DEFINE_EVENT(bio, io_read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@ -797,32 +797,32 @@ TRACE_EVENT(bucket_invalidate,
/* Moving IO */
DEFINE_EVENT(fs_str, move_extent,
DEFINE_EVENT(fs_str, io_move,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_read,
DEFINE_EVENT(fs_str, io_move_read,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_write,
DEFINE_EVENT(fs_str, io_move_write,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_finish,
DEFINE_EVENT(fs_str, io_move_finish,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_fail,
DEFINE_EVENT(fs_str, io_move_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(fs_str, move_extent_start_fail,
DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);

View File

@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
u64 last_q = 0;
prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
eytzinger0_for_each(j, NR_QUANTILES) {
bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
u64 q = max(quantiles->entries[j].m, last_q);
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
@ -701,9 +701,9 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
#if 0
void eytzinger1_test(void)
{
unsigned inorder, eytz, size;
unsigned inorder, size;
pr_info("1 based eytzinger test:");
pr_info("1 based eytzinger test:\n");
for (size = 2;
size < 65536;
@ -711,13 +711,7 @@ void eytzinger1_test(void)
unsigned extra = eytzinger1_extra(size);
if (!(size % 4096))
pr_info("tree size %u", size);
BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
pr_info("tree size %u\n", size);
inorder = 1;
eytzinger1_for_each(eytz, size) {
@ -728,15 +722,16 @@ void eytzinger1_test(void)
inorder++;
}
BUG_ON(inorder - 1 != size);
}
}
void eytzinger0_test(void)
{
unsigned inorder, eytz, size;
unsigned inorder, size;
pr_info("0 based eytzinger test:");
pr_info("0 based eytzinger test:\n");
for (size = 1;
size < 65536;
@ -744,13 +739,7 @@ void eytzinger0_test(void)
unsigned extra = eytzinger0_extra(size);
if (!(size % 4096))
pr_info("tree size %u", size);
BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
pr_info("tree size %u\n", size);
inorder = 0;
eytzinger0_for_each(eytz, size) {
@ -761,37 +750,171 @@ void eytzinger0_test(void)
inorder++;
}
BUG_ON(inorder != size);
inorder = size - 1;
eytzinger0_for_each_prev(eytz, size) {
BUG_ON(eytz != eytzinger0_first(size) &&
eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
inorder--;
}
BUG_ON(inorder != -1);
}
}
static inline int cmp_u16(const void *_l, const void *_r, size_t size)
static inline int cmp_u16(const void *_l, const void *_r)
{
const u16 *l = _l, *r = _r;
return (*l > *r) - (*r - *l);
return (*l > *r) - (*r > *l);
}
static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
{
int i, c1 = -1, c2 = -1;
ssize_t r;
int r, s;
bool bad;
r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r >= 0)
c1 = test_array[r];
for (i = 0; i < nr; i++)
if (test_array[i] <= search && test_array[i] > c2)
c2 = test_array[i];
if (c1 != c2) {
eytzinger0_for_each(i, nr)
pr_info("[%3u] = %12u", i, test_array[i]);
pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
i, r, c1, c2);
if (r >= 0) {
if (test_array[r] > search) {
bad = true;
} else {
s = eytzinger0_next(r, nr);
bad = s >= 0 && test_array[s] <= search;
}
} else {
s = eytzinger0_last(nr);
bad = s >= 0 && test_array[s] <= search;
}
if (bad) {
s = -1;
eytzinger0_for_each_prev(j, nr) {
if (test_array[j] <= search) {
s = j;
break;
}
}
eytzinger0_for_each(j, nr)
pr_info("[%3u] = %12u\n", j, test_array[j]);
pr_info("find_le(%12u) = %3i should be %3i\n",
search, r, s);
BUG();
}
}
static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
{
int r, s;
bool bad;
r = eytzinger0_find_gt(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r >= 0) {
if (test_array[r] <= search) {
bad = true;
} else {
s = eytzinger0_prev(r, nr);
bad = s >= 0 && test_array[s] > search;
}
} else {
s = eytzinger0_first(nr);
bad = s >= 0 && test_array[s] > search;
}
if (bad) {
s = -1;
eytzinger0_for_each(j, nr) {
if (test_array[j] > search) {
s = j;
break;
}
}
eytzinger0_for_each(j, nr)
pr_info("[%3u] = %12u\n", j, test_array[j]);
pr_info("find_gt(%12u) = %3i should be %3i\n",
search, r, s);
BUG();
}
}
static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
{
int r, s;
bool bad;
r = eytzinger0_find_ge(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r >= 0) {
if (test_array[r] < search) {
bad = true;
} else {
s = eytzinger0_prev(r, nr);
bad = s >= 0 && test_array[s] >= search;
}
} else {
s = eytzinger0_first(nr);
bad = s >= 0 && test_array[s] >= search;
}
if (bad) {
s = -1;
eytzinger0_for_each(j, nr) {
if (test_array[j] >= search) {
s = j;
break;
}
}
eytzinger0_for_each(j, nr)
pr_info("[%3u] = %12u\n", j, test_array[j]);
pr_info("find_ge(%12u) = %3i should be %3i\n",
search, r, s);
BUG();
}
}
static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
{
unsigned r;
int s;
bool bad;
r = eytzinger0_find(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
if (r < nr) {
bad = test_array[r] != search;
} else {
s = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
bad = s >= 0 && test_array[s] == search;
}
if (bad) {
eytzinger0_for_each(j, nr)
pr_info("[%3u] = %12u\n", j, test_array[j]);
pr_info("find(%12u) = %3i is incorrect\n",
search, r);
BUG();
}
}
static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
{
eytzinger0_find_test_le(test_array, nr, search);
eytzinger0_find_test_gt(test_array, nr, search);
eytzinger0_find_test_ge(test_array, nr, search);
eytzinger0_find_test_eq(test_array, nr, search);
}
void eytzinger0_find_test(void)
@ -800,15 +923,18 @@ void eytzinger0_find_test(void)
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
for (nr = 1; nr < allocated; nr++) {
pr_info("testing %u elems", nr);
u16 prev = 0;
pr_info("testing %u elems\n", nr);
get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
/* verify array is sorted correctly: */
eytzinger0_for_each(i, nr)
BUG_ON(i != eytzinger0_last(nr) &&
test_array[i] > test_array[eytzinger0_next(i, nr)]);
eytzinger0_for_each(j, nr) {
BUG_ON(test_array[j] < prev);
prev = test_array[j];
}
for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);