Update bcachefs sources to f026e4e024

Kent Overstreet 2017-04-14 20:38:49 -08:00
parent 03bc9d71b1
commit 819f2dde79
36 changed files with 962 additions and 435 deletions

View File

@@ -1 +1 @@
-3b4024f94489e4d8dc8eb7f1278754a2545f8026
+f026e4e0243cc10e721504a8bfaa131ea8aa4c91

View File

@@ -78,7 +78,7 @@ SRCS=bcachefs.c \
 	libbcachefs/dirent.c \
 	libbcachefs/error.c \
 	libbcachefs/extents.c \
-	libbcachefs/fs-gc.c \
+	libbcachefs/fsck.c \
 	libbcachefs/inode.c \
 	libbcachefs/io.c \
 	libbcachefs/journal.c \

View File

@@ -288,8 +288,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 {
 }
 
-extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
-			       struct bio *src, struct bvec_iter src_iter);
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);

View File

@@ -458,6 +458,7 @@ enum {
 	BCH_FS_BDEV_MOUNTED,
 	BCH_FS_ERROR,
 	BCH_FS_FSCK_FIXED_ERRORS,
+	BCH_FS_FSCK_DONE,
 	BCH_FS_FIXED_GENS,
 };
@@ -724,6 +725,11 @@ struct bch_fs {
 	struct work_struct	read_retry_work;
 	spinlock_t		read_retry_lock;
 
+	/* ERRORS */
+	struct list_head	fsck_errors;
+	struct mutex		fsck_error_lock;
+	bool			fsck_alloc_err;
+
 	/* FILESYSTEM */
 	wait_queue_head_t	writeback_wait;
 	atomic_t		writeback_pages;
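
Note: the new ERRORS fields back the ratelimited fsck error reporting added to error.c further down in this commit. Their initialization isn't visible in the hunks shown on this page, so the init site is an assumption; a minimal sketch of what has to happen at filesystem allocation time:

	/* sketch only: the actual init site (e.g. bch2_fs_alloc()) is not shown here */
	INIT_LIST_HEAD(&c->fsck_errors);
	mutex_init(&c->fsck_error_lock);
	c->fsck_alloc_err = false;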

View File

@@ -89,18 +89,20 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 	ops->key_debugcheck(c, b, k);
 }
 
-void bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
-		      char *buf, size_t size, struct bkey_s_c k)
+char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+		       char *buf, size_t size, struct bkey_s_c k)
 {
 	const struct bkey_ops *ops = bch2_bkey_ops[type];
 
 	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
 	    ops->val_to_text)
 		ops->val_to_text(c, buf, size, k);
+
+	return buf;
 }
 
-void bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
-			   char *buf, size_t size, struct bkey_s_c k)
+char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+			    char *buf, size_t size, struct bkey_s_c k)
 {
 	const struct bkey_ops *ops = bch2_bkey_ops[type];
 	char *out = buf, *end = buf + size;
@@ -109,9 +111,11 @@ void bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type,
 	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
 	    ops->val_to_text) {
-		out += scnprintf(out, end - out, " -> ");
+		out += scnprintf(out, end - out, ": ");
 		ops->val_to_text(c, out, end - out, k);
 	}
+
+	return buf;
 }
 
 void bch2_bkey_swab(enum bkey_type type,

View File

@@ -67,10 +67,10 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
 				    struct bkey_s_c);
 
 void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch2_val_to_text(struct bch_fs *, enum bkey_type,
-		      char *, size_t, struct bkey_s_c);
-void bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
-			   char *, size_t, struct bkey_s_c);
+char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
+		       char *, size_t, struct bkey_s_c);
+char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+			    char *, size_t, struct bkey_s_c);
 
 void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
 		    struct bkey_packed *);

View File

@@ -91,6 +91,7 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
 	six_lock_init(&b->lock);
 	INIT_LIST_HEAD(&b->list);
 	INIT_LIST_HEAD(&b->write_blocked);
+	INIT_LIST_HEAD(&b->reachable);
 
 	mca_data_alloc(c, b, gfp);
 	return b->data ? b : NULL;

View File

@@ -605,10 +605,12 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 		bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
 
 	/* Repack everything with @new_format and sort down to one bset */
-	for (i = 0; i < nr_old_nodes; i++)
+	for (i = 0; i < nr_old_nodes; i++) {
 		new_nodes[i] =
 			__bch2_btree_node_alloc_replacement(c, old_nodes[i],
 							    new_format, res);
+		list_add(&new_nodes[i]->reachable, &as->reachable_list);
+	}
 
 	/*
 	 * Conceptually we concatenate the nodes together and slice them
@@ -645,6 +647,7 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 			set_btree_bset_end(n1, n1->set);
 
+			list_del_init(&n2->reachable);
 			six_unlock_write(&n2->lock);
 			bch2_btree_node_free_never_inserted(c, n2);
 			six_unlock_intent(&n2->lock);

View File

@@ -872,32 +872,57 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
 		    vstruct_end(i) - (void *) i->_data);
 }
 
-#define btree_node_error(b, c, ptr, fmt, ...)				\
-	bch2_fs_inconsistent(c,						\
-		"btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
-		(b)->btree_id, (b)->level, btree_node_root(c, b)	\
-			    ? btree_node_root(c, b)->level : -1,	\
-		PTR_BUCKET_NR(ca, ptr), (b)->written,			\
-		le16_to_cpu((i)->u64s), ##__VA_ARGS__)
+#define btree_node_error(c, b, ptr, msg, ...)				\
+do {									\
+	if (write == READ &&						\
+	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {		\
+		mustfix_fsck_err(c,					\
+			"btree node read error at btree %u level %u/%u\n"\
+			"sector %llu node offset %u bset u64s %u: " msg,\
+			(b)->btree_id, (b)->level,			\
+			(c)->btree_roots[(b)->btree_id].level,		\
+			(u64) ptr->offset, (b)->written,		\
+			le16_to_cpu((i)->u64s), ##__VA_ARGS__);		\
+	} else {							\
+		bch_err(c, "%s at btree %u level %u/%u\n"		\
+			"sector %llu node offset %u bset u64s %u: " msg,\
+			write == WRITE					\
+			? "corrupt metadata in btree node write"	\
+			: "btree node error",				\
+			(b)->btree_id, (b)->level,			\
+			(c)->btree_roots[(b)->btree_id].level,		\
+			(u64) ptr->offset, (b)->written,		\
+			le16_to_cpu((i)->u64s), ##__VA_ARGS__);		\
+		ret = BCH_FSCK_ERRORS_NOT_FIXED;			\
+		goto fsck_err;						\
+	}								\
+} while (0)
 
-static const char *validate_bset(struct bch_fs *c, struct btree *b,
-				 struct bch_dev *ca,
-				 const struct bch_extent_ptr *ptr,
-				 struct bset *i, unsigned sectors,
-				 unsigned *whiteout_u64s)
+static int validate_bset(struct bch_fs *c, struct btree *b,
+			 const struct bch_extent_ptr *ptr,
+			 struct bset *i, unsigned sectors,
+			 unsigned *whiteout_u64s,
+			 int write)
 {
 	struct bkey_packed *k, *prev = NULL;
 	struct bpos prev_pos = POS_MIN;
 	bool seen_non_whiteout = false;
+	int ret = 0;
 
-	if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
-		return "unsupported bset version";
+	if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
+		btree_node_error(c, b, ptr, "unsupported bset version");
+		i->u64s = 0;
+		return 0;
+	}
 
-	if (b->written + sectors > c->sb.btree_node_size)
-		return "bset past end of btree node";
+	if (b->written + sectors > c->sb.btree_node_size) {
+		btree_node_error(c, b, ptr, "bset past end of btree node");
+		i->u64s = 0;
+		return 0;
+	}
 
-	if (i != &b->data->keys && !i->u64s)
-		btree_node_error(b, c, ptr, "empty set");
+	if (b->written && !i->u64s)
+		btree_node_error(c, b, ptr, "empty set");
 
 	if (!BSET_SEPARATE_WHITEOUTS(i)) {
 		seen_non_whiteout = true;
@@ -911,7 +936,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
 		const char *invalid;
 
 		if (!k->u64s) {
-			btree_node_error(b, c, ptr,
+			btree_node_error(c, b, ptr,
 				"KEY_U64s 0: %zu bytes of metadata lost",
 				vstruct_end(i) - (void *) k);
@@ -920,7 +945,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
 		}
 
 		if (bkey_next(k) > vstruct_last(i)) {
-			btree_node_error(b, c, ptr,
+			btree_node_error(c, b, ptr,
 				"key extends past end of bset");
 
 			i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -928,7 +953,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
 		}
 
 		if (k->format > KEY_FORMAT_CURRENT) {
-			btree_node_error(b, c, ptr,
+			btree_node_error(c, b, ptr,
 				"invalid bkey format %u", k->format);
 
 			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -947,8 +972,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
 			char buf[160];
 
 			bch2_bkey_val_to_text(c, btree_node_type(b),
					      buf, sizeof(buf), u);
-			btree_node_error(b, c, ptr,
-				"invalid bkey %s: %s", buf, invalid);
+			btree_node_error(c, b, ptr,
+				"invalid bkey %s: %s", buf, invalid);
 
 			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -969,7 +994,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
 			*whiteout_u64s = k->_data - i->_data;
 			seen_non_whiteout = true;
 		} else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
-			btree_node_error(b, c, ptr,
+			btree_node_error(c, b, ptr,
 				"keys out of order: %llu:%llu > %llu:%llu",
 				prev_pos.inode,
 				prev_pos.offset,
@@ -984,7 +1009,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
 	}
 
 	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-	return NULL;
+fsck_err:
+	return ret;
 }
 
 static bool extent_contains_ptr(struct bkey_s_c_extent e,
@@ -1012,7 +1038,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
 	const char *err;
 	struct bch_csum csum;
 	struct nonce nonce;
-	int ret;
+	int ret, write = READ;
 
 	iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
 	__bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
@@ -1115,9 +1141,10 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
 			sectors = vstruct_sectors(bne, c->block_bits);
 		}
 
-		err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
-		if (err)
-			goto err;
+		ret = validate_bset(c, b, ptr, i, sectors,
+				    &whiteout_u64s, READ);
+		if (ret)
+			goto fsck_err;
 
 		b->written += sectors;
@@ -1172,8 +1199,10 @@ out:
 	mempool_free(iter, &c->fill_iter);
 	return;
 err:
+	btree_node_error(c, b, ptr, "%s", err);
+fsck_err:
+	bch2_inconsistent_error(c);
 	set_btree_node_read_error(b);
-	btree_node_error(b, c, ptr, "%s", err);
 	goto out;
 }
 
@@ -1309,6 +1338,23 @@ static void btree_node_write_endio(struct bio *bio)
 	}
 }
 
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+				   struct bset *i, unsigned sectors)
+{
+	const struct bch_extent_ptr *ptr;
+	unsigned whiteout_u64s = 0;
+	int ret;
+
+	extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
+		break;
+
+	ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
+	if (ret)
+		bch2_fatal_error(c);
+
+	return ret;
+}
+
 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 			     struct closure *parent,
 			     enum six_lock_type lock_type_held)
@@ -1343,18 +1389,24 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		if (!(old & (1 << BTREE_NODE_dirty)))
 			return;
 
+		if (b->written &&
+		    !btree_node_may_write(b))
+			return;
+
 		if (old & (1 << BTREE_NODE_write_in_flight)) {
 			btree_node_wait_on_io(b);
 			continue;
 		}
 
 		new &= ~(1 << BTREE_NODE_dirty);
+		new &= ~(1 << BTREE_NODE_need_write);
 		new |=  (1 << BTREE_NODE_write_in_flight);
 		new |=  (1 << BTREE_NODE_just_written);
 		new ^=  (1 << BTREE_NODE_write_idx);
 	} while (cmpxchg_acquire(&b->flags, old, new) != old);
 
 	BUG_ON(!list_empty(&b->write_blocked));
+	BUG_ON(!list_empty_careful(&b->reachable) != !b->written);
 
 	BUG_ON(b->written >= c->sb.btree_node_size);
 	BUG_ON(bset_written(b, btree_bset_last(b)));
@@ -1430,13 +1482,17 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	clear_needs_whiteout(i);
 
-	if (b->written && !i->u64s) {
-		/* Nothing to write: */
-		btree_bounce_free(c, order, used_mempool, data);
-		btree_node_write_done(c, b);
-		return;
-	}
+	/* do we have data to write? */
+	if (b->written && !i->u64s)
+		goto nowrite;
+
+	bytes_to_write = vstruct_end(i) - data;
+	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+	memset(data + bytes_to_write, 0,
+	       (sectors_to_write << 9) - bytes_to_write);
+
+	BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
 
 	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
 	BUG_ON(i->seq != b->data->keys.seq);
@@ -1445,6 +1501,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	nonce = btree_nonce(b, i, b->written << 9);
 
+	/* if we're going to be encrypting, check metadata validity first: */
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+	    validate_bset_for_write(c, b, i, sectors_to_write))
+		goto err;
+
 	if (bn) {
 		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
 			     &bn->flags,
@@ -1464,15 +1525,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 	}
 
-	bytes_to_write = vstruct_end(i) - data;
-	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
-	memset(data + bytes_to_write, 0,
-	       (sectors_to_write << 9) - bytes_to_write);
-
-	BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
-
-	trace_btree_write(b, bytes_to_write, sectors_to_write);
+	/* if we're not encrypting, check metadata after checksumming: */
+	if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+	    validate_bset_for_write(c, b, i, sectors_to_write))
+		goto err;
 
 	/*
 	 * We handle btree write errors by immediately halting the journal -
@@ -1488,14 +1544,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	 * break:
 	 */
 	if (bch2_journal_error(&c->journal) ||
-	    c->opts.nochanges) {
-		set_btree_node_noevict(b);
-		b->written += sectors_to_write;
-		btree_bounce_free(c, order, used_mempool, data);
-		btree_node_write_done(c, b);
-		return;
-	}
+	    c->opts.nochanges)
+		goto err;
+
+	trace_btree_write(b, bytes_to_write, sectors_to_write);
 
 	bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
@@ -1543,6 +1595,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	b->written += sectors_to_write;
 
 	bch2_submit_wbio_replicas(wbio, c, &k.key);
+	return;
+err:
+	set_btree_node_noevict(b);
+	b->written += sectors_to_write;
+nowrite:
+	btree_bounce_free(c, order, used_mempool, data);
+	btree_node_write_done(c, b);
 }
 
 /*
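
Note: both validate_bset() and the rewritten btree_node_error() macro rely on the calling convention established by the fsck_err macros in error.h: the expansion assigns a local ret and jumps to a local fsck_err label, so every caller must provide both. A minimal sketch of that contract (the function name is illustrative, not from this commit):

	static int example_check(struct bch_fs *c)
	{
		int ret = 0;	/* the macros assign this on fatal errors */

		if (mustfix_fsck_err(c, "example inconsistency")) {
			/* error should be repaired here */
		}
	fsck_err:		/* jump target the macros expect */
		return ret;
	}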

View File

@@ -27,7 +27,8 @@ static inline void btree_node_wait_on_io(struct btree *b)
 
 static inline bool btree_node_may_write(struct btree *b)
 {
-	return list_empty_careful(&b->write_blocked);
+	return list_empty_careful(&b->write_blocked) &&
+		list_empty_careful(&b->reachable);
 }
 
 enum compact_mode {
@@ -80,6 +81,8 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *,
 #define bch2_btree_node_write_dirty(_c, _b, _cl, cond)			\
 do {									\
 	while ((_b)->written && btree_node_dirty(_b) && (cond)) {	\
+		set_btree_node_need_write(_b);				\
+									\
 		if (!btree_node_may_write(_b))				\
 			break;						\
 									\
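
Note: the need_write flag and the extended btree_node_may_write() form a deferred-write pattern: record the intent to write first, then only issue the write once the node is neither blocked nor still on a reachable list. Illustrative only, mirroring what bch2_btree_node_write_dirty() above expands to:

	six_lock_read(&b->lock);
	if (b->written && btree_node_dirty(b)) {
		set_btree_node_need_write(b);	/* record the intent */
		if (btree_node_may_write(b))	/* unblocked and reachable */
			bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);
	}
	six_unlock_read(&b->lock);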

View File

@@ -1109,6 +1109,26 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
 	prefetch(c->btree_roots[btree_id].b);
 }
 
+void bch2_btree_iter_unlink(struct btree_iter *iter)
+{
+	struct btree_iter *linked;
+
+	__bch2_btree_iter_unlock(iter);
+
+	if (!btree_iter_linked(iter))
+		return;
+
+	for_each_linked_btree_iter(iter, linked) {
+		if (linked->next == iter) {
+			linked->next = iter->next;
+			return;
+		}
+	}
+
+	BUG();
+}
+
 void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
 {
 	BUG_ON(btree_iter_linked(new));
@@ -1128,7 +1148,7 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
 
 void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
 {
-	bch2_btree_iter_unlock(dst);
+	__bch2_btree_iter_unlock(dst);
 	memcpy(dst, src, offsetof(struct btree_iter, next));
 	dst->nodes_locked = dst->nodes_intent_locked = 0;
 }
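
Note: bch2_btree_iter_unlink() assumes linked iterators form a circular singly linked list through ->next, so removal means walking the ring to find the predecessor; the BUG() fires if the iterator wasn't actually on its own ring. The same structure reduced to a standalone sketch (the self-pointing convention for an unlinked node is an assumption based on how btree_iter_linked() is used):

	struct ring { struct ring *next; };

	/* splice n out of its ring; O(ring length), like the real code */
	static void ring_unlink(struct ring *n)
	{
		struct ring *p;

		for (p = n->next; p->next != n; p = p->next)
			;
		p->next = n->next;
		n->next = n;	/* an unlinked node points at itself */
	}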

View File

@@ -185,6 +185,7 @@ static inline void bch2_btree_iter_init_intent(struct btree_iter *iter,
 }
 
 void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
+void bch2_btree_iter_unlink(struct btree_iter *);
 void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
 
 static inline struct bpos btree_type_successor(enum btree_id id,

View File

@@ -110,6 +110,14 @@ struct btree {
 	 */
 	struct list_head	write_blocked;
 
+	/*
+	 * Also for asynchronous splits/interior node updates:
+	 * If a btree node isn't reachable yet, we don't want to kick off
+	 * another write - because that write also won't yet be reachable and
+	 * marking it as completed before it's reachable would be incorrect:
+	 */
+	struct list_head	reachable;
+
 	struct open_bucket	*ob;
 
 	/* lru list */
@@ -136,6 +144,7 @@ enum btree_flags {
 	BTREE_NODE_read_error,
 	BTREE_NODE_write_error,
 	BTREE_NODE_dirty,
+	BTREE_NODE_need_write,
 	BTREE_NODE_noevict,
 	BTREE_NODE_write_idx,
 	BTREE_NODE_accessed,
@@ -146,6 +155,7 @@ enum btree_flags {
 BTREE_FLAG(read_error);
 BTREE_FLAG(write_error);
 BTREE_FLAG(dirty);
+BTREE_FLAG(need_write);
 BTREE_FLAG(noevict);
 BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
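
Note: BTREE_FLAG(need_write) presumably generates the helpers used elsewhere in this commit (btree_node_need_write(), set_btree_node_need_write(), clear_btree_node_need_write()); a sketch of what the generator likely looks like, assuming the flags word is b->flags:

	#define BTREE_FLAG(flag)					\
	static inline bool btree_node_ ## flag(struct btree *b)	\
	{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }	\
									\
	static inline void set_btree_node_ ## flag(struct btree *b)	\
	{	set_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
	static inline void clear_btree_node_ ## flag(struct btree *b)	\
	{	clear_bit(BTREE_NODE_ ## flag, &b->flags); }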

View File

@@ -162,9 +162,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
 	trace_btree_node_free(c, b);
 
 	BUG_ON(btree_node_dirty(b));
+	BUG_ON(btree_node_need_write(b));
 	BUG_ON(b == btree_node_root(c, b));
 	BUG_ON(b->ob);
 	BUG_ON(!list_empty(&b->write_blocked));
+	BUG_ON(!list_empty(&b->reachable));
 
 	clear_btree_node_noevict(b);
@@ -589,7 +591,6 @@ struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
 	unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
 
 	return __bch2_btree_reserve_get(c, nr_nodes, flags, cl);
-
 }
 
 int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@@ -598,6 +599,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
 	struct closure cl;
 	struct btree_reserve *reserve;
 	struct btree *b;
+	LIST_HEAD(reachable_list);
 
 	closure_init_stack(&cl);
@@ -614,11 +616,14 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
 	}
 
 	b = __btree_root_alloc(c, 0, id, reserve);
+	list_add(&b->reachable, &reachable_list);
 
 	bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
 	bch2_btree_set_root_initial(c, b, reserve);
 	bch2_btree_open_bucket_put(c, b);
+
+	list_del_init(&b->reachable);
 	six_unlock_intent(&b->lock);
 
 	bch2_btree_reserve_put(c, reserve);
@@ -659,6 +664,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_iter *iter,
 
 	bch2_btree_bset_insert_key(iter, b, node_iter, insert);
 	set_btree_node_dirty(b);
+	set_btree_node_need_write(b);
 }
 
 /* Inserting into a given leaf node (last stage of insert): */
@@ -798,12 +804,6 @@ void bch2_btree_journal_key(struct btree_insert *trans,
 	u64 seq = trans->journal_res.seq;
 	bool needs_whiteout = insert->k.needs_whiteout;
 
-	/*
-	 * have a bug where we're seeing an extent with an invalid crc
-	 * entry in the journal, trying to track it down:
-	 */
-	BUG_ON(bch2_bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
-
 	/* ick */
 	insert->k.needs_whiteout = false;
 	bch2_journal_add_keys(j, &trans->journal_res,
@@ -878,6 +878,8 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
 	closure_init(&as->cl, &c->cl);
 	as->c		= c;
 	as->mode	= BTREE_INTERIOR_NO_UPDATE;
+	INIT_LIST_HEAD(&as->write_blocked_list);
+	INIT_LIST_HEAD(&as->reachable_list);
 
 	bch2_keylist_init(&as->parent_keys, as->inline_keys,
 			  ARRAY_SIZE(as->inline_keys));
@@ -908,6 +910,18 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
 
 	mutex_lock(&c->btree_interior_update_lock);
 
+	while (!list_empty(&as->reachable_list)) {
+		struct btree *b = list_first_entry(&as->reachable_list,
+						   struct btree, reachable);
+		list_del_init(&b->reachable);
+		mutex_unlock(&c->btree_interior_update_lock);
+
+		six_lock_read(&b->lock);
+		bch2_btree_node_write_dirty(c, b, NULL, btree_node_need_write(b));
+		six_unlock_read(&b->lock);
+		mutex_lock(&c->btree_interior_update_lock);
+	}
+
 	for (i = 0; i < as->nr_pending; i++)
 		bch2_btree_node_free_ondisk(c, &as->pending[i]);
 	as->nr_pending = 0;
@@ -929,6 +943,7 @@ static void btree_interior_update_nodes_written(struct closure *cl)
 
 	if (bch2_journal_error(&c->journal)) {
 		/* XXX what? */
+		/* we don't want to free the nodes on disk, that's what */
 	}
 
 	/* XXX: missing error handling, damnit */
@@ -962,7 +977,8 @@ retry:
 		list_del(&as->write_blocked_list);
 		mutex_unlock(&c->btree_interior_update_lock);
 
-		bch2_btree_node_write_dirty(c, b, NULL, true);
+		bch2_btree_node_write_dirty(c, b, NULL,
+					    btree_node_need_write(b));
 		six_unlock_read(&b->lock);
 		break;
@@ -1135,6 +1151,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 	}
 
 	clear_btree_node_dirty(b);
+	clear_btree_node_need_write(b);
 
 	w = btree_current_write(b);
 
 	llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
@@ -1152,6 +1169,8 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
 			       &as->journal, interior_update_flush);
 	bch2_journal_pin_drop(&c->journal, &w->journal);
 
+	if (!list_empty(&b->reachable))
+		list_del_init(&b->reachable);
+
 	mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -1265,7 +1284,8 @@ bch2_btree_insert_keys_interior(struct btree *b,
  * node)
  */
 static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
-					struct btree_reserve *reserve)
+					struct btree_reserve *reserve,
+					struct btree_interior_update *as)
 {
 	size_t nr_packed = 0, nr_unpacked = 0;
 	struct btree *n2;
@@ -1273,6 +1293,8 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
 	struct bkey_packed *k, *prev = NULL;
 
 	n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
+	list_add(&n2->reachable, &as->reachable_list);
+
 	n2->data->max_key	= n1->data->max_key;
 	n2->data->format	= n1->format;
 	n2->key.k.p = n1->key.k.p;
@@ -1421,13 +1443,15 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
 	bch2_btree_interior_update_will_free_node(c, as, b);
 
 	n1 = bch2_btree_node_alloc_replacement(c, b, reserve);
+	list_add(&n1->reachable, &as->reachable_list);
+
 	if (b->level)
 		btree_split_insert_keys(iter, n1, insert_keys, reserve);
 
 	if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
 		trace_btree_node_split(c, b, b->nr.live_u64s);
 
-		n2 = __btree_split_node(iter, n1, reserve);
+		n2 = __btree_split_node(iter, n1, reserve, as);
 
 		bch2_btree_build_aux_trees(n2);
 		bch2_btree_build_aux_trees(n1);
@@ -1449,6 +1473,8 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
 			n3 = __btree_root_alloc(c, b->level + 1,
 						iter->btree_id,
 						reserve);
+			list_add(&n3->reachable, &as->reachable_list);
+
 			n3->sib_u64s[0] = U16_MAX;
 			n3->sib_u64s[1] = U16_MAX;
@@ -1748,6 +1774,8 @@ retry:
 	bch2_btree_interior_update_will_free_node(c, as, m);
 
 	n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
+	list_add(&n->reachable, &as->reachable_list);
+
 	n->data->min_key	= prev->data->min_key;
 	n->data->max_key	= next->data->max_key;
 	n->data->format		= new_f;
@@ -1914,8 +1942,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
 	int ret;
 
 	trans_for_each_entry(trans, i) {
-		EBUG_ON(i->iter->level);
-		EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+		BUG_ON(i->iter->level);
+		BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
 	}
 
 	sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
@@ -2076,6 +2104,19 @@ err:
 	goto out;
 }
 
+int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
+{
+	struct bkey_i k;
+
+	bkey_init(&k.k);
+	k.k.p = iter->pos;
+
+	return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
+				    BTREE_INSERT_NOFAIL|
+				    BTREE_INSERT_USE_RESERVE|flags,
+				    BTREE_INSERT_ENTRY(iter, &k));
+}
+
 int bch2_btree_insert_list_at(struct btree_iter *iter,
 			      struct keylist *keys,
 			      struct disk_reservation *disk_res,
@@ -2104,45 +2145,6 @@ int bch2_btree_insert_list_at(struct btree_iter *iter,
 	return 0;
 }
 
-/**
- * bch_btree_insert_check_key - insert dummy key into btree
- *
- * We insert a random key on a cache miss, then compare exchange on it
- * once the cache promotion or backing device read completes. This
- * ensures that if this key is written to after the read, the read will
- * lose and not overwrite the key with stale data.
- *
- * Return values:
- * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
- * -EINTR: btree node was changed while upgrading to write lock
- */
-int bch2_btree_insert_check_key(struct btree_iter *iter,
-				struct bkey_i *check_key)
-{
-	struct bpos saved_pos = iter->pos;
-	struct bkey_i_cookie *cookie;
-	BKEY_PADDED(key) tmp;
-	int ret;
-
-	BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
-
-	check_key->k.type = KEY_TYPE_COOKIE;
-	set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
-
-	cookie = bkey_i_to_cookie(check_key);
-	get_random_bytes(&cookie->v, sizeof(cookie->v));
-
-	bkey_copy(&tmp.key, check_key);
-
-	ret = bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
-				   BTREE_INSERT_ATOMIC,
-				   BTREE_INSERT_ENTRY(iter, &tmp.key));
-
-	bch2_btree_iter_rewind(iter, saved_pos);
-
-	return ret;
-}
-
 /**
  * bch_btree_insert - insert keys into the extent btree
  * @c: pointer to struct bch_fs
@@ -2310,6 +2312,7 @@ int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
 	bch2_btree_interior_update_will_free_node(c, as, b);
 
 	n = bch2_btree_node_alloc_replacement(c, b, reserve);
+	list_add(&n->reachable, &as->reachable_list);
 
 	bch2_btree_build_aux_trees(n);
 	six_unlock_write(&n->lock);
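
Note: bch2_btree_delete_at() deletes by inserting an empty key at the iterator's current position (bkey_init() produces a zero-size value with the deleted key type), and passes BTREE_INSERT_NOFAIL|BTREE_INSERT_USE_RESERVE so that repair paths cannot fail for lack of space. Usage, as in the new fsck code:

	/* drop whatever key the iterator currently points at */
	ret = bch2_btree_delete_at(&iter, 0);
	if (ret)
		goto err;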

View File

@@ -64,7 +64,7 @@ struct pending_btree_node_free {
  */
 struct btree_interior_update {
 	struct closure			cl;
 	struct bch_fs			*c;
 	struct list_head		list;
@@ -86,6 +86,7 @@ struct btree_interior_update {
 	 */
 	struct btree			*b;
 
 	struct list_head		write_blocked_list;
+	struct list_head		reachable_list;
 
 	/*
	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
@@ -317,7 +318,6 @@ struct btree_insert {
 
 int __bch2_btree_insert_at(struct btree_insert *);
 
-
 #define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
 #define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
@@ -380,6 +380,8 @@ int __bch2_btree_insert_at(struct btree_insert *);
  */
 #define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
 
+int bch2_btree_delete_at(struct btree_iter *, unsigned);
+
 int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
 			      struct disk_reservation *,
 			      struct extent_insert_hook *, u64 *, unsigned);
@@ -403,7 +405,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
 	return u64s <= trans->journal_res.u64s;
 }
 
-int bch2_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
 		      struct disk_reservation *,
 		      struct extent_insert_hook *, u64 *, int flags);

View File

@@ -15,7 +15,7 @@
 #include "debug.h"
 #include "error.h"
 #include "extents.h"
-#include "fs-gc.h"
+#include "fsck.h"
 #include "inode.h"
 #include "io.h"
 #include "super.h"

View File

@@ -20,6 +20,11 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
 	return len;
 }
 
+static unsigned dirent_val_u64s(unsigned len)
+{
+	return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64));
+}
+
 static u64 bch2_dirent_hash(const struct bch_hash_info *info,
 			    const struct qstr *name)
 {
@@ -64,7 +69,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 	return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
 }
 
-static const struct bch_hash_desc dirent_hash_desc = {
+const struct bch_hash_desc bch2_dirent_hash_desc = {
 	.btree_id = BTREE_ID_DIRENTS,
 	.key_type = BCH_DIRENT,
 	.whiteout_type = BCH_DIRENT_WHITEOUT,
@@ -77,12 +82,30 @@ static const struct bch_hash_desc dirent_hash_desc = {
 static const char *bch2_dirent_invalid(const struct bch_fs *c,
 				       struct bkey_s_c k)
 {
+	struct bkey_s_c_dirent d;
+	unsigned len;
+
 	switch (k.k->type) {
 	case BCH_DIRENT:
-		return bkey_val_bytes(k.k) < sizeof(struct bch_dirent)
-			? "value too small"
-			: NULL;
+		if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+			return "value too small";
+
+		d = bkey_s_c_to_dirent(k);
+		len = bch2_dirent_name_bytes(d);
+
+		if (!len)
+			return "empty name";
+
+		if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
+			return "value too big";
+
+		if (len > NAME_MAX)
+			return "dirent name too big";
+
+		if (memchr(d.v->d_name, '/', len))
+			return "dirent name has invalid characters";
+
+		return NULL;
 	case BCH_DIRENT_WHITEOUT:
 		return bkey_val_bytes(k.k) != 0
 			? "value size should be zero"
@@ -97,21 +120,15 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
 			       size_t size, struct bkey_s_c k)
 {
 	struct bkey_s_c_dirent d;
+	size_t n = 0;
 
 	switch (k.k->type) {
 	case BCH_DIRENT:
 		d = bkey_s_c_to_dirent(k);
 
-		if (size) {
-			unsigned n = min_t(unsigned, size,
-					   bch2_dirent_name_bytes(d));
-			memcpy(buf, d.v->d_name, n);
-			buf[size - 1] = '\0';
-			buf += n;
-			size -= n;
-		}
-
-		scnprintf(buf, size, " -> %llu", d.v->d_inum);
+		n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
+				   bch2_dirent_name_bytes(d));
+		n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
 		break;
 	case BCH_DIRENT_WHITEOUT:
 		scnprintf(buf, size, "whiteout");
@@ -128,9 +145,7 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
 					       const struct qstr *name, u64 dst)
 {
 	struct bkey_i_dirent *dirent;
-	unsigned u64s = BKEY_U64s +
-		DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
-			     sizeof(u64));
+	unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
 
 	dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
 	if (!dirent)
@@ -163,7 +178,7 @@ int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
 	if (!dirent)
 		return -ENOMEM;
 
-	ret = bch2_hash_set(dirent_hash_desc, hash_info, c, dir_inum,
+	ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
 			    journal_seq, &dirent->k_i, flags);
 	kfree(dirent);
@@ -223,13 +238,13 @@ retry:
 	 * from the original hashed position (like we do when creating dirents,
 	 * in bch_hash_set) - we never move existing dirents to different slot:
 	 */
-	old_src = bch2_hash_lookup_at(dirent_hash_desc,
+	old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc,
 				      &src_ei->str_hash,
 				      &src_iter, src_name);
 	if ((ret = btree_iter_err(old_src)))
 		goto err;
 
-	ret = bch2_hash_needs_whiteout(dirent_hash_desc,
+	ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc,
 				       &src_ei->str_hash,
 				       &whiteout_iter, &src_iter);
 	if (ret < 0)
@@ -242,8 +257,8 @@ retry:
 	 * to do that check for us for correctness:
 	 */
 	old_dst = mode == BCH_RENAME
-		? bch2_hash_hole_at(dirent_hash_desc, &dst_iter)
-		: bch2_hash_lookup_at(dirent_hash_desc,
+		? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter)
+		: bch2_hash_lookup_at(bch2_dirent_hash_desc,
 				      &dst_ei->str_hash,
 				      &dst_iter, dst_name);
 	if ((ret = btree_iter_err(old_dst)))
@@ -330,7 +345,7 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
 			const struct qstr *name,
 			u64 *journal_seq)
 {
-	return bch2_hash_delete(dirent_hash_desc, hash_info,
+	return bch2_hash_delete(bch2_dirent_hash_desc, hash_info,
 				c, dir_inum, journal_seq, name);
 }
@@ -342,7 +357,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
 	struct bkey_s_c k;
 	u64 inum;
 
-	k = bch2_hash_lookup(dirent_hash_desc, hash_info, c,
+	k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c,
 			     dir_inum, &iter, name);
 	if (IS_ERR(k.k)) {
 		bch2_btree_iter_unlock(&iter);
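
Note: dirent_val_u64s() centralizes the value-size arithmetic now shared by bch2_dirent_invalid() and dirent_create_key(): the fixed header plus the name length, rounded up to whole u64s. A worked example (the 8-byte header size is illustrative; the real one is sizeof(struct bch_dirent)):

	/* with an 8-byte header and a 20-byte name:
	 *   DIV_ROUND_UP(8 + 20, sizeof(u64)) = DIV_ROUND_UP(28, 8) = 4 u64s
	 * a dirent key whose value occupies more u64s than this is now
	 * rejected as "value too big" by bch2_dirent_invalid().
	 */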

View File

@@ -1,6 +1,9 @@
 #ifndef _BCACHE_DIRENT_H
 #define _BCACHE_DIRENT_H
 
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
 extern const struct bkey_ops bch2_bkey_dirent_ops;
 
 struct qstr;

View File

@@ -49,3 +49,102 @@ void bch2_nonfatal_io_error(struct bch_dev *ca)
 {
 	queue_work(system_long_wq, &ca->io_error_work);
 }
+
+#ifdef __KERNEL__
+#define ask_yn()	false
+#else
+#include "tools-util.h"
+#endif
+
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
+				const char *fmt, ...)
+{
+	struct fsck_err_state *s;
+	va_list args;
+	bool fix = false, print = true, suppressing = false;
+	char _buf[sizeof(s->buf)], *buf = _buf;
+
+	mutex_lock(&c->fsck_error_lock);
+
+	if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+		goto print;
+
+	list_for_each_entry(s, &c->fsck_errors, list)
+		if (s->fmt == fmt)
+			goto found;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		if (!c->fsck_alloc_err)
+			bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+		c->fsck_alloc_err = true;
+		buf = _buf;
+		goto print;
+	}
+
+	INIT_LIST_HEAD(&s->list);
+	s->fmt = fmt;
+found:
+	list_move(&s->list, &c->fsck_errors);
+	s->nr++;
+	suppressing	= s->nr == 10;
+	print		= s->nr <= 10;
+	buf		= s->buf;
+print:
+	va_start(args, fmt);
+	vscnprintf(buf, sizeof(_buf), fmt, args);
+	va_end(args);
+
+	if (flags & FSCK_CAN_FIX) {
+		if (c->opts.fix_errors == FSCK_ERR_ASK) {
+			printk(KERN_ERR "%s: fix?", buf);
+			fix = ask_yn();
+		} else if (c->opts.fix_errors == FSCK_ERR_YES ||
+			   (c->opts.nochanges &&
+			    !(flags & FSCK_CAN_IGNORE))) {
+			if (print)
+				bch_err(c, "%s, fixing", buf);
+			fix = true;
+		} else {
+			if (print)
+				bch_err(c, "%s, not fixing", buf);
+			fix = false;
+		}
+	} else if (flags & FSCK_NEED_FSCK) {
+		if (print)
+			bch_err(c, "%s (run fsck to correct)", buf);
+	} else {
+		if (print)
+			bch_err(c, "%s (repair unimplemented)", buf);
+	}
+
+	if (suppressing)
+		bch_err(c, "Ratelimiting new instances of previous error");
+
+	mutex_unlock(&c->fsck_error_lock);
+
+	if (fix)
+		set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
+
+	return fix				? FSCK_ERR_FIX
+		: flags & FSCK_CAN_IGNORE	? FSCK_ERR_IGNORE
+						: FSCK_ERR_EXIT;
+}
+
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+	struct fsck_err_state *s, *n;
+
+	mutex_lock(&c->fsck_error_lock);
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+	list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+		if (s->nr > 10)
+			bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
+
+		list_del(&s->list);
+		kfree(s);
+	}
+
+	mutex_unlock(&c->fsck_error_lock);
+}
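
Note: bch2_fsck_err() keys its ratelimit state on the format-string pointer: the first ten occurrences of a given message print, the tenth also triggers the ratelimiting notice, and bch2_flush_fsck_errs() reports totals once fsck finishes. Callers normally go through the wrapper macros in error.h; a direct call would look like this (illustrative; inum is a hypothetical variable):

	if (bch2_fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE,
			  "dirent points to missing inode %llu",
			  inum) == FSCK_ERR_FIX) {
		/* caller performs the actual repair */
	}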

View File

@@ -95,62 +95,38 @@ enum {
 	BCH_FSCK_UNKNOWN_VERSION	= 4,
 };
 
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
 enum fsck_err_opts {
 	FSCK_ERR_NO,
 	FSCK_ERR_YES,
 	FSCK_ERR_ASK,
 };
 
-#ifdef __KERNEL__
-#define __fsck_err_should_fix(c, msg, ...)				\
-({									\
-	bool _fix = (c)->opts.fix_errors;				\
-	bch_err(c, msg ", %sfixing", ##__VA_ARGS__, _fix ? "" : "not ");\
-	_fix;								\
-})
-#else
-#include "tools-util.h"
-
-#define __fsck_err_should_fix(c, msg, ...)				\
-({									\
-	bool _fix = false;						\
-	switch ((c)->opts.fix_errors) {					\
-	case FSCK_ERR_ASK:						\
-		printf(msg ": fix?", ##__VA_ARGS__);			\
-		_fix = ask_yn();					\
-		break;							\
-	case FSCK_ERR_YES:						\
-		bch_err(c, msg ", fixing", ##__VA_ARGS__);		\
-		_fix = true;						\
-		break;							\
-	case FSCK_ERR_NO:						\
-		bch_err(c, msg, ##__VA_ARGS__);				\
-		_fix = false;						\
-		break;							\
-	}								\
-	_fix;								\
-})
-#endif
-
-#define __fsck_err(c, _can_fix, _can_ignore, _nofix_msg, msg, ...)	\
+enum fsck_err_ret {
+	FSCK_ERR_IGNORE	= 0,
+	FSCK_ERR_FIX	= 1,
+	FSCK_ERR_EXIT	= 2,
+};
+
+struct fsck_err_state {
+	struct list_head	list;
+	const char		*fmt;
+	u64			nr;
+	char			buf[512];
+};
+
+#define FSCK_CAN_FIX	(1 << 0)
+#define FSCK_CAN_IGNORE	(1 << 1)
+#define FSCK_NEED_FSCK	(1 << 2)
+
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
+				unsigned, const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, msg, ...)					\
 ({									\
-	bool _fix;							\
+	int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);	\
 									\
-	if (_can_fix) {							\
-		_fix = __fsck_err_should_fix(c, msg, ##__VA_ARGS__);	\
-	} else {							\
-		bch_err(c, msg " ("_nofix_msg")", ##__VA_ARGS__);	\
-		_fix = false;						\
-	}								\
-									\
-	if (_fix)							\
-		set_bit(BCH_FS_FSCK_FIXED_ERRORS, &(c)->flags);		\
-									\
-	if (!_fix && !_can_ignore) {					\
+	if (_fix == FSCK_ERR_EXIT) {					\
 		bch_err(c, "Unable to continue, halting");		\
 		ret = BCH_FSCK_ERRORS_NOT_FIXED;			\
 		goto fsck_err;						\
@@ -159,24 +135,27 @@ enum fsck_err_opts {
 	_fix;								\
 })
 
-#define __fsck_err_on(cond, c, _can_fix, _can_ignore, _nofix_msg, ...)	\
-	((cond) ? __fsck_err(c, _can_fix, _can_ignore,			\
-			     _nofix_msg, ##__VA_ARGS__) : false)
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, ...)				\
+	((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
 
 #define unfixable_fsck_err_on(cond, c, ...)				\
-	__fsck_err_on(cond, c, false, true, "repair unimplemented", ##__VA_ARGS__)
+	__fsck_err_on(cond, c, FSCK_CAN_IGNORE, ##__VA_ARGS__)
 
 #define need_fsck_err_on(cond, c, ...)					\
-	__fsck_err_on(cond, c, false, true, "run fsck to correct", ##__VA_ARGS__)
+	__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
 
 #define mustfix_fsck_err(c, ...)					\
-	__fsck_err(c, true, false, "not fixing", ##__VA_ARGS__)
+	__fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
 
 #define mustfix_fsck_err_on(cond, c, ...)				\
-	__fsck_err_on(cond, c, true, false, "not fixing", ##__VA_ARGS__)
+	__fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
 
 #define fsck_err_on(cond, c, ...)					\
-	__fsck_err_on(cond, c, true, true, "not fixing", ##__VA_ARGS__)
+	__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
 
 /*
  * Fatal errors: these don't indicate a bug, but we can't continue running in RW
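
Note: putting the flag bits together, fsck_err_on(cond, c, msg) behaves roughly like the following (illustrative expansion; the real macro is the statement expression above):

	if (cond) {
		switch (bch2_fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, msg)) {
		case FSCK_ERR_FIX:	/* evaluates true: caller repairs */
			break;
		case FSCK_ERR_IGNORE:	/* evaluates false: logged, continue */
			break;
		case FSCK_ERR_EXIT:	/* sets ret and jumps to fsck_err */
			break;
		}
	}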

View File

@@ -5,8 +5,8 @@
 #include "clock.h"
 #include "error.h"
 #include "fs.h"
-#include "fs-gc.h"
 #include "fs-io.h"
+#include "fsck.h"
 #include "inode.h"
 #include "journal.h"
 #include "io.h"

View File

@@ -7,8 +7,8 @@
 #include "dirent.h"
 #include "extents.h"
 #include "fs.h"
-#include "fs-gc.h"
 #include "fs-io.h"
+#include "fsck.h"
 #include "inode.h"
 #include "journal.h"
 #include "keylist.h"

View File

@ -4,10 +4,11 @@
#include "dirent.h" #include "dirent.h"
#include "error.h" #include "error.h"
#include "fs.h" #include "fs.h"
#include "fs-gc.h" #include "fsck.h"
#include "inode.h" #include "inode.h"
#include "keylist.h" #include "keylist.h"
#include "super.h" #include "super.h"
#include "xattr.h"
#include <linux/dcache.h> /* struct qstr */ #include <linux/dcache.h> /* struct qstr */
#include <linux/generic-radix-tree.h> #include <linux/generic-radix-tree.h>
@ -37,12 +38,16 @@ static int remove_dirent(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter); bch2_btree_iter_unlock(iter);
ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
if (ret) if (ret) {
bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
goto err; goto err;
}
dir_hash_info = bch2_hash_info_init(c, &dir_inode); dir_hash_info = bch2_hash_info_init(c, &dir_inode);
ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
if (ret)
bch_err(c, "remove_dirent: err %i deleting dirent", ret);
err: err:
kfree(buf); kfree(buf);
return ret; return ret;
@ -108,6 +113,118 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
return 0; return 0;
} }
struct hash_check {
struct bch_hash_info info;
struct btree_iter chain;
struct btree_iter iter;
u64 next;
};
static void hash_check_init(const struct bch_hash_desc desc,
struct hash_check *h, struct bch_fs *c)
{
bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN);
bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN);
}
static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
const struct bch_inode_unpacked *bi)
{
h->info = bch2_hash_info_init(c, bi);
h->next = -1;
}
static int hash_redo_key(const struct bch_hash_desc desc,
struct hash_check *h, struct bch_fs *c,
struct btree_iter *k_iter, struct bkey_s_c k,
u64 hashed)
{
struct bkey_i *tmp;
int ret = 0;
tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
if (!tmp)
return -ENOMEM;
bkey_reassemble(tmp, k);
ret = bch2_btree_delete_at(k_iter, 0);
if (ret)
goto err;
bch2_btree_iter_unlock(k_iter);
bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL,
tmp, BCH_HASH_SET_MUST_CREATE);
err:
kfree(tmp);
return ret;
}
static int hash_check_key(const struct bch_hash_desc desc,
struct hash_check *h, struct bch_fs *c,
struct btree_iter *k_iter, struct bkey_s_c k)
{
char buf[200];
u64 hashed;
int ret = 0;
if (k.k->type != desc.whiteout_type &&
k.k->type != desc.key_type)
return 0;
if (k.k->p.offset != h->next) {
if (!btree_iter_linked(&h->chain)) {
bch2_btree_iter_link(k_iter, &h->chain);
bch2_btree_iter_link(k_iter, &h->iter);
}
bch2_btree_iter_copy(&h->chain, k_iter);
}
h->next = k.k->p.offset + 1;
if (k.k->type != desc.key_type)
return 0;
hashed = desc.hash_bkey(&h->info, k);
if (fsck_err_on(hashed < h->chain.pos.offset ||
hashed > k.k->p.offset, c,
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset,
bch2_bkey_val_to_text(c, desc.btree_id,
buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) {
bch_err(c, "hash_redo_key err %i", ret);
return ret;
}
return 1;
}
if (!bkey_cmp(h->chain.pos, k_iter->pos))
return 0;
bch2_btree_iter_copy(&h->iter, &h->chain);
while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) {
struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter);
if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s",
bch2_bkey_val_to_text(c, desc.btree_id,
buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret)
return ret;
return 1;
}
bch2_btree_iter_advance_pos(&h->iter);
}
fsck_err:
return ret;
}
/* /*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and * Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size an i_sectors are consistent * that i_size an i_sectors are consistent
@ -130,14 +247,18 @@ static int check_extents(struct bch_fs *c)
if (ret) if (ret)
break; break;
unfixable_fsck_err_on(!w.have_inode, c, if (fsck_err_on(!w.have_inode, c,
"extent type %u for missing inode %llu", "extent type %u for missing inode %llu",
k.k->type, k.k->p.inode); k.k->type, k.k->p.inode) ||
fsck_err_on(w.have_inode &&
unfixable_fsck_err_on(w.have_inode &&
!S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c, !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c,
"extent type %u for non regular file, inode %llu mode %o", "extent type %u for non regular file, inode %llu mode %o",
k.k->type, k.k->p.inode, w.inode.i_mode); k.k->type, k.k->p.inode, w.inode.i_mode)) {
ret = bch2_btree_delete_at(&iter, 0);
if (ret)
goto err;
continue;
}
unfixable_fsck_err_on(w.first_this_inode && unfixable_fsck_err_on(w.first_this_inode &&
w.have_inode && w.have_inode &&
@ -154,6 +275,7 @@ static int check_extents(struct bch_fs *c)
"extent type %u offset %llu past end of inode %llu, i_size %llu", "extent type %u offset %llu past end of inode %llu, i_size %llu",
k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size); k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size);
} }
err:
fsck_err: fsck_err:
return bch2_btree_iter_unlock(&iter) ?: ret; return bch2_btree_iter_unlock(&iter) ?: ret;
} }
@ -166,10 +288,15 @@ noinline_for_stack
static int check_dirents(struct bch_fs *c) static int check_dirents(struct bch_fs *c)
{ {
struct inode_walker w = inode_walker_init(); struct inode_walker w = inode_walker_init();
struct hash_check h;
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
unsigned name_len;
char buf[200];
int ret = 0; int ret = 0;
hash_check_init(bch2_dirent_hash_desc, &h, c);
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
POS(BCACHE_ROOT_INO, 0), k) { POS(BCACHE_ROOT_INO, 0), k) {
struct bkey_s_c_dirent d; struct bkey_s_c_dirent d;
@ -181,13 +308,32 @@ static int check_dirents(struct bch_fs *c)
if (ret) if (ret)
break; break;
unfixable_fsck_err_on(!w.have_inode, c, if (fsck_err_on(!w.have_inode, c,
"dirent in nonexisting directory %llu", "dirent in nonexisting directory:\n%s",
k.k->p.inode); bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k)) ||
fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
"dirent in non directory inode type %u:\n%s",
mode_to_type(w.inode.i_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
ret = bch2_btree_delete_at(&iter, 0);
if (ret)
goto err;
continue;
}
unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c, if (w.first_this_inode && w.have_inode)
"dirent in non directory inode %llu, type %u", hash_check_set_inode(&h, c, &w.inode);
k.k->p.inode, mode_to_type(w.inode.i_mode));
ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k);
if (ret > 0) {
ret = 0;
continue;
}
if (ret)
goto fsck_err;
if (k.k->type != BCH_DIRENT) if (k.k->type != BCH_DIRENT)
continue; continue;
@ -195,8 +341,25 @@ static int check_dirents(struct bch_fs *c)
d = bkey_s_c_to_dirent(k); d = bkey_s_c_to_dirent(k);
d_inum = le64_to_cpu(d.v->d_inum); d_inum = le64_to_cpu(d.v->d_inum);
name_len = bch2_dirent_name_bytes(d);
if (fsck_err_on(!name_len, c, "empty dirent") ||
fsck_err_on(name_len == 1 &&
!memcmp(d.v->d_name, ".", 1), c,
". dirent") ||
fsck_err_on(name_len == 2 &&
!memcmp(d.v->d_name, "..", 2), c,
".. dirent")) {
ret = remove_dirent(c, &iter, d);
if (ret)
goto err;
continue;
}
if (fsck_err_on(d_inum == d.k->p.inode, c, if (fsck_err_on(d_inum == d.k->p.inode, c,
"dirent points to own directory")) { "dirent points to own directory:\n%s",
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
ret = remove_dirent(c, &iter, d); ret = remove_dirent(c, &iter, d);
if (ret) if (ret)
goto err; goto err;
@ -211,8 +374,9 @@ static int check_dirents(struct bch_fs *c)
ret = 0; ret = 0;
if (fsck_err_on(!have_target, c, if (fsck_err_on(!have_target, c,
"dirent points to missing inode %llu, type %u filename %s", "dirent points to missing inode:\n%s",
d_inum, d.v->d_type, d.v->d_name)) { bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
ret = remove_dirent(c, &iter, d); ret = remove_dirent(c, &iter, d);
if (ret) if (ret)
goto err; goto err;
@ -222,10 +386,10 @@ static int check_dirents(struct bch_fs *c)
if (fsck_err_on(have_target && if (fsck_err_on(have_target &&
d.v->d_type != d.v->d_type !=
mode_to_type(le16_to_cpu(target.i_mode)), c, mode_to_type(le16_to_cpu(target.i_mode)), c,
"incorrect d_type: got %u should be %u, filename %s", "incorrect d_type: should be %u:\n%s",
d.v->d_type,
mode_to_type(le16_to_cpu(target.i_mode)), mode_to_type(le16_to_cpu(target.i_mode)),
d.v->d_name)) { bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
struct bkey_i_dirent *n; struct bkey_i_dirent *n;
n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
@ -248,6 +412,8 @@ static int check_dirents(struct bch_fs *c)
} }
err: err:
fsck_err: fsck_err:
bch2_btree_iter_unlock(&h.chain);
bch2_btree_iter_unlock(&h.iter);
return bch2_btree_iter_unlock(&iter) ?: ret; return bch2_btree_iter_unlock(&iter) ?: ret;
} }
@@ -258,21 +424,39 @@ noinline_for_stack
 static int check_xattrs(struct bch_fs *c)
 {
 	struct inode_walker w = inode_walker_init();
+	struct hash_check h;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret = 0;
 
+	hash_check_init(bch2_xattr_hash_desc, &h, c);
+
 	for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
 			   POS(BCACHE_ROOT_INO, 0), k) {
 		ret = walk_inode(c, &w, k.k->p.inode);
 		if (ret)
 			break;
 
-		unfixable_fsck_err_on(!w.have_inode, c,
-			"xattr for missing inode %llu",
-			k.k->p.inode);
+		if (fsck_err_on(!w.have_inode, c,
+				"xattr for missing inode %llu",
+				k.k->p.inode)) {
+			ret = bch2_btree_delete_at(&iter, 0);
+			if (ret)
+				goto err;
+			continue;
+		}
+
+		if (w.first_this_inode && w.have_inode)
+			hash_check_set_inode(&h, c, &w.inode);
+
+		ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k);
+		if (ret)
+			goto fsck_err;
 	}
+err:
 fsck_err:
+	bch2_btree_iter_unlock(&h.chain);
+	bch2_btree_iter_unlock(&h.iter);
 	return bch2_btree_iter_unlock(&iter) ?: ret;
 }
@@ -445,6 +629,8 @@ static int check_directory_structure(struct bch_fs *c,
 
 	/* DFS: */
 restart_dfs:
+	had_unreachable = false;
+
 	ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO);
 	if (ret)
 		goto err;
@@ -478,7 +664,8 @@ next:
 		d_inum = le64_to_cpu(dirent.v->d_inum);
 
 		if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
-				"directory with multiple hardlinks")) {
+				"directory %llu has multiple hardlinks",
+				d_inum)) {
 			ret = remove_dirent(c, &iter, dirent);
 			if (ret)
 				goto err;
@@ -503,8 +690,6 @@ up:
 		path.nr--;
 	}
 
-	had_unreachable = false;
-
 	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
 		if (k.k->type != BCH_INODE_FS ||
 		    !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode)))
@@ -640,7 +825,7 @@ static int bch2_gc_do_inode(struct bch_fs *c,
 	ret = bch2_inode_unpack(inode, &u);
 	if (bch2_fs_inconsistent_on(ret, c,
-			"error unpacking inode %llu in fs-gc",
+			"error unpacking inode %llu in fsck",
 			inode.k->p.inode))
 		return ret;
@@ -894,36 +1079,59 @@ int bch2_fsck(struct bch_fs *c, bool full_fsck)
 	struct bch_inode_unpacked root_inode, lostfound_inode;
 	int ret;
 
-	ret = check_root(c, &root_inode);
-	if (ret)
-		return ret;
-
-	ret = check_lostfound(c, &root_inode, &lostfound_inode);
-	if (ret)
-		return ret;
-
-	if (!full_fsck)
-		goto check_nlinks;
-
-	ret = check_extents(c);
-	if (ret)
-		return ret;
-
-	ret = check_dirents(c);
-	if (ret)
-		return ret;
-
-	ret = check_xattrs(c);
-	if (ret)
-		return ret;
-
-	ret = check_directory_structure(c, &lostfound_inode);
-	if (ret)
-		return ret;
-check_nlinks:
-	ret = check_inode_nlinks(c, &lostfound_inode);
-	if (ret)
-		return ret;
+	if (full_fsck) {
+		bch_verbose(c, "checking extents");
+		ret = check_extents(c);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking dirents");
+		ret = check_dirents(c);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking xattrs");
+		ret = check_xattrs(c);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking root directory");
+		ret = check_root(c, &root_inode);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking lost+found");
+		ret = check_lostfound(c, &root_inode, &lostfound_inode);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking directory structure");
+		ret = check_directory_structure(c, &lostfound_inode);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking inode nlinks");
+		ret = check_inode_nlinks(c, &lostfound_inode);
+		if (ret)
+			return ret;
+	} else {
+		bch_verbose(c, "checking root directory");
+		ret = check_root(c, &root_inode);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking lost+found");
+		ret = check_lostfound(c, &root_inode, &lostfound_inode);
+		if (ret)
+			return ret;
+
+		bch_verbose(c, "checking inode nlinks");
+		ret = check_inode_nlinks(c, &lostfound_inode);
+		if (ret)
+			return ret;
+	}
+
+	bch2_flush_fsck_errs(c);
 
 	return 0;
 }

View File

@@ -25,14 +25,12 @@ static const u8 bits_table[8] = {
 	13 * 8 - 8,
 };
 
-static int inode_encode_field(u8 *out, u8 *end, const u64 in[2])
+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
 {
-	unsigned bytes, bits, shift;
-
-	if (likely(!in[1]))
-		bits = fls64(in[0]);
-	else
-		bits = fls64(in[1]) + 64;
+	__be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
+	unsigned shift, bytes, bits = likely(!hi)
+		? fls64(lo)
+		: fls64(hi) + 64;
 
 	for (shift = 1; shift <= 8; shift++)
 		if (bits < bits_table[shift - 1])
@@ -44,17 +42,7 @@ got_shift:
 
 	BUG_ON(out + bytes > end);
 
-	if (likely(bytes <= 8)) {
-		u64 b = cpu_to_be64(in[0]);
-
-		memcpy(out, (void *) &b + 8 - bytes, bytes);
-	} else {
-		u64 b = cpu_to_be64(in[1]);
-
-		memcpy(out, (void *) &b + 16 - bytes, bytes);
-		put_unaligned_be64(in[0], out + bytes - 8);
-	}
+	memcpy(out, (u8 *) in + 16 - bytes, bytes);
 
 	*out |= (1 << 8) >> shift;
 
 	return bytes;
@@ -63,7 +51,9 @@ got_shift:
 static int inode_decode_field(const u8 *in, const u8 *end,
 			      u64 out[2], unsigned *out_bits)
 {
-	unsigned bytes, bits, shift;
+	__be64 be[2] = { 0, 0 };
+	unsigned bytes, shift;
+	u8 *p;
 
 	if (in >= end)
 		return -1;
@@ -77,29 +67,18 @@ static int inode_decode_field(const u8 *in, const u8 *end,
 	 */
 	shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
 	bytes = byte_table[shift - 1];
-	bits = bytes * 8 - shift;
 
 	if (in + bytes > end)
 		return -1;
 
-	/*
-	 * we're assuming it's safe to deref up to 7 bytes < in; this will work
-	 * because keys always start quite a bit more than 7 bytes after the
-	 * start of the btree node header:
-	 */
-	if (likely(bytes <= 8)) {
-		out[0] = get_unaligned_be64(in + bytes - 8);
-		out[0] <<= 64 - bits;
-		out[0] >>= 64 - bits;
-		out[1] = 0;
-	} else {
-		out[0] = get_unaligned_be64(in + bytes - 8);
-		out[1] = get_unaligned_be64(in + bytes - 16);
-		out[1] <<= 128 - bits;
-		out[1] >>= 128 - bits;
-	}
+	p = (u8 *) be + 16 - bytes;
+	memcpy(p, in, bytes);
+	*p ^= (1 << 8) >> shift;
+
+	out[0] = be64_to_cpu(be[0]);
+	out[1] = be64_to_cpu(be[1]);
+	*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
 
 	return bytes;
 }
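
The rewritten pair above is a prefix varint: the position of the first set bit in the leading byte selects an entry in byte_table[]/bits_table[] (the field's total width), and decoding is now a memcpy into a zeroed big-endian buffer followed by XORing the tag bit away. A self-contained sketch of the same idea — illustrative only, using a simplified 1/2/4/8-byte width table and glibc's endian.h rather than bcachefs's tables and kernel helpers:

/* prefix-varint sketch -- illustrative, not the bcachefs code */
#include <assert.h>
#include <endian.h>	/* htobe64/be64toh: glibc-specific */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const unsigned byte_tbl[4] = { 1, 2, 4, 8 };	/* total width */

static unsigned encode_field(uint8_t *out, uint64_t v)
{
	unsigned bits = v ? 64 - __builtin_clzll(v) : 0;
	unsigned shift, bytes;
	uint64_t be;

	/* smallest width whose payload (bytes * 8 - shift bits) fits v */
	for (shift = 1; shift <= 4; shift++)
		if (bits < byte_tbl[shift - 1] * 8 - shift)
			break;
	assert(shift <= 4);	/* this toy caps values at 59 bits */

	bytes = byte_tbl[shift - 1];
	be = htobe64(v);
	memcpy(out, (uint8_t *) &be + 8 - bytes, bytes);
	out[0] |= 0x100 >> shift;	/* tag: shift - 1 zero bits, then a 1 */
	return bytes;
}

static unsigned decode_field(const uint8_t *in, uint64_t *v)
{
	unsigned shift = 8 - (31 - __builtin_clz(in[0]));	/* tag position */
	unsigned bytes = byte_tbl[shift - 1];
	uint64_t be = 0;
	uint8_t *p = (uint8_t *) &be + 8 - bytes;

	memcpy(p, in, bytes);
	*p ^= 0x100 >> shift;	/* strip the tag; payload bits above it are 0 */
	*v = be64toh(be);
	return bytes;
}

int main(void)
{
	uint64_t vals[] = { 0, 1, 127, 128, 1ULL << 20, (1ULL << 59) - 1 };
	uint8_t buf[8];

	for (unsigned i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		uint64_t got;
		unsigned n = encode_field(buf, vals[i]);

		assert(decode_field(buf, &got) == n && got == vals[i]);
		printf("%llu -> %u byte(s)\n", (unsigned long long) vals[i], n);
	}
	return 0;
}

The XOR trick works because the chosen width guarantees every payload bit at or above the tag position is zero, so flipping the tag bit off recovers the original bytes exactly.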
@@ -109,7 +88,6 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
 	u8 *out = packed->inode.v.fields;
 	u8 *end = (void *) &packed[1];
 	u8 *last_nonzero_field = out;
-	u64 field[2];
 	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
 
 	bkey_inode_init(&packed->inode.k_i);
@@ -119,12 +97,10 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
 	packed->inode.v.i_mode = cpu_to_le16(inode->i_mode);
 
 #define BCH_INODE_FIELD(_name, _bits)					\
-	field[0] = inode->_name;					\
-	field[1] = 0;							\
-	out += inode_encode_field(out, end, field);			\
+	out += inode_encode_field(out, end, 0, inode->_name);		\
 	nr_fields++;							\
 									\
-	if (field[0] | field[1]) {					\
+	if (inode->_name) {						\
 		last_nonzero_field = out;				\
 		last_nonzero_fieldnr = nr_fields;			\
 	}
@@ -187,7 +163,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
 	if (field_bits > sizeof(unpacked->_name) * 8)			\
 		return -1;						\
 									\
-	unpacked->_name = field[0];					\
+	unpacked->_name = field[1];					\
 	in += ret;
 
 	BCH_INODE_FIELDS()
@@ -449,3 +425,32 @@ int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid,
 	bch2_btree_iter_unlock(&iter);
 	return -ENOENT;
 }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void)
+{
+	struct bch_inode_unpacked *u, test_inodes[] = {
+		{
+			.i_atime	= U64_MAX,
+			.i_ctime	= U64_MAX,
+			.i_mtime	= U64_MAX,
+			.i_otime	= U64_MAX,
+			.i_size		= U64_MAX,
+			.i_sectors	= U64_MAX,
+			.i_uid		= U32_MAX,
+			.i_gid		= U32_MAX,
+			.i_nlink	= U32_MAX,
+			.i_generation	= U32_MAX,
+			.i_dev		= U32_MAX,
+		},
+	};
+
+	for (u = test_inodes;
+	     u < test_inodes + ARRAY_SIZE(test_inodes);
+	     u++) {
+		struct bkey_inode_buf p;
+
+		bch2_inode_pack(&p, u);
+	}
+}
+#endif

View File

@@ -54,4 +54,10 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
 	return div_s64(ns, c->sb.time_precision);
 }
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void);
+#else
+static inline void bch2_inode_pack_test(void) {}
+#endif
+
 #endif

View File

@@ -910,8 +910,8 @@ static int bio_checksum_uncompress(struct bch_fs *c,
 		bch2_encrypt_bio(c, rbio->crc.csum_type,
 				 nonce, src);
 
-		bio_copy_data_iter(dst, dst_iter,
-				   src, src->bi_iter);
+		bio_copy_data_iter(dst, &dst_iter,
+				   src, &src->bi_iter);
 	} else {
 		bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
 	}

View File

@@ -527,62 +527,34 @@ fsck_err:
 #define JOURNAL_ENTRY_NONE	6
 #define JOURNAL_ENTRY_BAD	7
 
-static int journal_entry_validate(struct bch_fs *c,
-				  struct jset *j, u64 sector,
-				  unsigned bucket_sectors_left,
-				  unsigned sectors_read)
+#define journal_entry_err(c, msg, ...)					\
+({									\
+	if (write == READ) {						\
+		mustfix_fsck_err(c, msg, ##__VA_ARGS__);		\
+	} else {							\
+		bch_err(c, "detected corrupt metadata before write:\n"	\
+			msg, ##__VA_ARGS__);				\
+		ret = BCH_FSCK_ERRORS_NOT_FIXED;			\
+		goto fsck_err;						\
+	}								\
+	true;								\
+})
+
+#define journal_entry_err_on(cond, c, msg, ...)				\
+	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+
+static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
+				    int write)
 {
 	struct jset_entry *entry;
-	size_t bytes = vstruct_bytes(j);
-	struct bch_csum csum;
 	int ret = 0;
 
-	if (le64_to_cpu(j->magic) != jset_magic(c))
-		return JOURNAL_ENTRY_NONE;
-
-	if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
-		bch_err(c, "unknown journal entry version %u",
-			le32_to_cpu(j->version));
-		return BCH_FSCK_UNKNOWN_VERSION;
-	}
-
-	if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
-			"journal entry too big (%zu bytes), sector %lluu",
-			bytes, sector)) {
-		/* XXX: note we might have missing journal entries */
-		return JOURNAL_ENTRY_BAD;
-	}
-
-	if (bytes > sectors_read << 9)
-		return JOURNAL_ENTRY_REREAD;
-
-	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
-			"journal entry with unknown csum type %llu sector %lluu",
-			JSET_CSUM_TYPE(j), sector))
-		return JOURNAL_ENTRY_BAD;
-
-	csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
-	if (mustfix_fsck_err_on(bch2_crc_cmp(csum, j->csum), c,
-			"journal checksum bad, sector %llu", sector)) {
-		/* XXX: retry IO, when we start retrying checksum errors */
-		/* XXX: note we might have missing journal entries */
-		return JOURNAL_ENTRY_BAD;
-	}
-
-	bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
-		     j->encrypted_start,
-		     vstruct_end(j) - (void *) j->encrypted_start);
-
-	if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
-			"invalid journal entry: last_seq > seq"))
-		j->last_seq = j->seq;
-
 	vstruct_for_each(j, entry) {
 		struct bkey_i *k;
 
-		if (mustfix_fsck_err_on(vstruct_next(entry) >
-					vstruct_last(j), c,
-				"journal entry extents past end of jset")) {
+		if (journal_entry_err_on(vstruct_next(entry) >
+					 vstruct_last(j), c,
+				"journal entry extends past end of jset")) {
 			j->u64s = cpu_to_le64((u64 *) entry - j->_data);
 			break;
 		}
@@ -602,7 +574,7 @@ static int journal_entry_validate(struct bch_fs *c,
 		case JOURNAL_ENTRY_BTREE_ROOT:
 			k = entry->start;
 
-			if (mustfix_fsck_err_on(!entry->u64s ||
+			if (journal_entry_err_on(!entry->u64s ||
 					le16_to_cpu(entry->u64s) != k->k.u64s, c,
 					"invalid btree root journal entry: wrong number of keys")) {
 				journal_entry_null_range(entry,
@@ -620,7 +592,7 @@ static int journal_entry_validate(struct bch_fs *c,
 			break;
 
 		case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
-			if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
+			if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
 				"invalid journal seq blacklist entry: bad size")) {
 				journal_entry_null_range(entry,
 						vstruct_next(entry));
@@ -628,7 +600,7 @@ static int journal_entry_validate(struct bch_fs *c,
 			break;
 		default:
-			mustfix_fsck_err(c, "invalid journal entry type %llu",
-					 JOURNAL_ENTRY_TYPE(entry));
+			journal_entry_err(c, "invalid journal entry type %llu",
+					  JOURNAL_ENTRY_TYPE(entry));
 			journal_entry_null_range(entry, vstruct_next(entry));
 			break;
@@ -639,6 +611,61 @@ fsck_err:
 	return ret;
 }
 
+static int journal_entry_validate(struct bch_fs *c,
+				  struct jset *j, u64 sector,
+				  unsigned bucket_sectors_left,
+				  unsigned sectors_read,
+				  int write)
+{
+	size_t bytes = vstruct_bytes(j);
+	struct bch_csum csum;
+	int ret = 0;
+
+	if (le64_to_cpu(j->magic) != jset_magic(c))
+		return JOURNAL_ENTRY_NONE;
+
+	if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
+		bch_err(c, "unknown journal entry version %u",
+			le32_to_cpu(j->version));
+		return BCH_FSCK_UNKNOWN_VERSION;
+	}
+
+	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
+			"journal entry too big (%zu bytes), sector %lluu",
+			bytes, sector)) {
+		/* XXX: note we might have missing journal entries */
+		return JOURNAL_ENTRY_BAD;
+	}
+
+	if (bytes > sectors_read << 9)
+		return JOURNAL_ENTRY_REREAD;
+
+	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+			"journal entry with unknown csum type %llu sector %lluu",
+			JSET_CSUM_TYPE(j), sector))
+		return JOURNAL_ENTRY_BAD;
+
+	csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+	if (journal_entry_err_on(bch2_crc_cmp(csum, j->csum), c,
+			"journal checksum bad, sector %llu", sector)) {
+		/* XXX: retry IO, when we start retrying checksum errors */
+		/* XXX: note we might have missing journal entries */
+		return JOURNAL_ENTRY_BAD;
+	}
+
+	bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+		     j->encrypted_start,
+		     vstruct_end(j) - (void *) j->encrypted_start);
+
+	if (journal_entry_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+			"invalid journal entry: last_seq > seq"))
+		j->last_seq = j->seq;
+
+	return __journal_entry_validate(c, j, write);
+fsck_err:
+	return ret;
+}
+
 struct journal_read_buf {
 	void		*data;
 	size_t		size;
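
journal_entry_err() is deliberately non-hygienic: it expands inside __journal_entry_validate() and reaches for the caller's write, ret, and fsck_err label, so the same checks become repairable fsck errors on the read path but abort the submission path before corrupt metadata reaches disk. A minimal userspace model of that pattern, using GCC statement expressions and hypothetical names (a sketch, not the kernel code):

/* model of a non-hygienic error macro; needs GCC/clang ({ }) extensions */
#include <stdio.h>

#define READ	0
#define WRITE	1

#define entry_err(msg)							\
({									\
	if (write == READ) {						\
		fprintf(stderr, "fixing: %s\n", msg);			\
	} else {							\
		fprintf(stderr, "fatal before write: %s\n", msg);	\
		ret = -1;						\
		goto err;	/* jumps out into the caller */		\
	}								\
	1;	/* "error was present (and handled)" */			\
})

static int validate(int write, int bad)
{
	int ret = 0;

	if (bad && entry_err("truncated entry"))
		bad = 0;	/* read side: repair and carry on */
	(void) bad;
err:
	return ret;
}

int main(void)
{
	printf("read pass:  %d\n", validate(READ, 1));	/* 0, repaired */
	printf("write pass: %d\n", validate(WRITE, 1));	/* -1, aborted */
	return 0;
}

The write-path call sites added below (before or after encryption in journal_write(), depending on the checksum type) rely on exactly this: a validation failure funnels to the new err: label and bch2_fatal_error().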
@@ -705,7 +732,8 @@ reread:		sectors_read = min_t(unsigned,
 		}
 
 		ret = journal_entry_validate(c, j, offset,
-					end - offset, sectors_read);
+					end - offset, sectors_read,
+					READ);
 		switch (ret) {
 		case BCH_FSCK_OK:
 			break;
@@ -2274,6 +2302,10 @@ static void journal_write(struct closure *cl)
 	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
 	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
+	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+	    __journal_entry_validate(c, jset, WRITE))
+		goto err;
+
 	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
 		     jset->encrypted_start,
 		     vstruct_end(jset) - (void *) jset->encrypted_start);
@@ -2281,6 +2313,10 @@ static void journal_write(struct closure *cl)
 	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
 				  journal_nonce(jset), jset);
 
+	if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+	    __journal_entry_validate(c, jset, WRITE))
+		goto err;
+
 	sectors = vstruct_sectors(jset, c->block_bits);
 	BUG_ON(sectors > j->prev_buf_sectors);
@@ -2349,6 +2385,9 @@ no_io:
 			ptr->offset += sectors;
 
 	closure_return_with_destructor(cl, journal_write_done);
+err:
+	bch2_fatal_error(c);
+	closure_return_with_destructor(cl, journal_write_done);
 }
 
 static void journal_write_work(struct work_struct *work)

View File

@@ -2,7 +2,9 @@
 #define _BCACHE_STR_HASH_H
 
 #include "btree_iter.h"
+#include "btree_update.h"
 #include "checksum.h"
+#include "error.h"
 #include "inode.h"
 #include "siphash.h"
 #include "super.h"
@@ -341,6 +343,36 @@ err:
 	return ret;
 }
 
+static inline int bch2_hash_delete_at(const struct bch_hash_desc desc,
+				      const struct bch_hash_info *info,
+				      struct btree_iter *iter,
+				      u64 *journal_seq)
+{
+	struct btree_iter whiteout_iter;
+	struct bkey_i delete;
+	int ret = -ENOENT;
+
+	bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
+			     iter->pos);
+	bch2_btree_iter_link(iter, &whiteout_iter);
+
+	ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
+	if (ret < 0)
+		goto err;
+
+	bkey_init(&delete.k);
+	delete.k.p = iter->pos;
+	delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+	ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq,
+				   BTREE_INSERT_NOFAIL|
+				   BTREE_INSERT_ATOMIC,
+				   BTREE_INSERT_ENTRY(iter, &delete));
+err:
+	bch2_btree_iter_unlink(&whiteout_iter);
+	return ret;
+}
+
 static inline int bch2_hash_delete(const struct bch_hash_desc desc,
 				   const struct bch_hash_info *info,
 				   struct bch_fs *c, u64 inode,
@@ -348,7 +380,6 @@ static inline int bch2_hash_delete(const struct bch_hash_desc desc,
 {
 	struct btree_iter iter, whiteout_iter;
 	struct bkey_s_c k;
-	struct bkey_i delete;
 	int ret = -ENOENT;
 
 	bch2_btree_iter_init_intent(&iter, c, desc.btree_id,
@@ -361,18 +392,7 @@ retry:
 	if ((ret = btree_iter_err(k)))
 		goto err;
 
-	ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
-	if (ret < 0)
-		goto err;
-
-	bkey_init(&delete.k);
-	delete.k.p = k.k->p;
-	delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
-
-	ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
-				   BTREE_INSERT_NOFAIL|
-				   BTREE_INSERT_ATOMIC,
-				   BTREE_INSERT_ENTRY(&iter, &delete));
+	ret = bch2_hash_delete_at(desc, info, &iter, journal_seq);
 err:
 	if (ret == -EINTR)
 		goto retry;
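
Factoring bch2_hash_delete_at() out of bch2_hash_delete() lets callers that already hold an iterator — fsck's hash_check_key(), for instance — delete in place instead of re-looking the key up. The whiteout decision it preserves matters because these are on-disk hash tables probed by hash value: removing a slot outright would cut the probe chain for keys that had collided past it. A toy in-memory model of the tombstone idea (illustrative, not the bcachefs code):

/* toy linear-probing table: why deletes need whiteouts (tombstones) */
#include <assert.h>
#include <stdio.h>

#define N 8
enum slot_state { EMPTY, USED, WHITEOUT };

struct slot { enum slot_state state; int key; };

static int probe(struct slot *t, int key, int for_insert)
{
	for (int i = 0, h = key % N; i < N; i++, h = (h + 1) % N) {
		if (t[h].state == USED && t[h].key == key)
			return h;
		if (t[h].state == EMPTY)
			return for_insert ? h : -1;
		if (t[h].state == WHITEOUT && for_insert)
			return h;	/* reusable on insert... */
		/* ...but lookups must keep probing past it */
	}
	return -1;
}

int main(void)
{
	struct slot t[N] = { 0 };

	t[probe(t, 0, 1)] = (struct slot) { USED, 0 };
	t[probe(t, 8, 1)] = (struct slot) { USED, 8 };	/* collides -> slot 1 */

	t[probe(t, 0, 0)].state = WHITEOUT;	/* delete 0: tombstone it */

	assert(probe(t, 8, 0) == 1);	/* still found past the whiteout */
	printf("8 found at slot %d\n", probe(t, 8, 0));
	return 0;
}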

View File

@@ -19,7 +19,7 @@
 #include "debug.h"
 #include "error.h"
 #include "fs.h"
-#include "fs-gc.h"
+#include "fsck.h"
 #include "inode.h"
 #include "io.h"
 #include "journal.h"
@@ -513,6 +513,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
 	mutex_init(&c->zlib_workspace_lock);
 
+	INIT_LIST_HEAD(&c->fsck_errors);
+	mutex_init(&c->fsck_error_lock);
+
 	seqcount_init(&c->gc_pos_lock);
 
 	c->prio_clock[READ].hand = 1;
@@ -875,12 +878,12 @@ err:
 	switch (ret) {
 	case BCH_FSCK_ERRORS_NOT_FIXED:
 		bch_err(c, "filesystem contains errors: please report this to the developers");
-		pr_cont("mount with -o fix_errors to repair");
+		pr_cont("mount with -o fix_errors to repair\n");
 		err = "fsck error";
 		break;
 	case BCH_FSCK_REPAIR_UNIMPLEMENTED:
 		bch_err(c, "filesystem contains errors: please report this to the developers");
-		pr_cont("repair unimplemented: inform the developers so that it can be added");
+		pr_cont("repair unimplemented: inform the developers so that it can be added\n");
 		err = "fsck error";
 		break;
 	case BCH_FSCK_REPAIR_IMPOSSIBLE:
@@ -979,8 +982,8 @@ static void bch2_dev_free(struct bch_dev *ca)
 	kvpfree(ca->disk_buckets, bucket_bytes(ca));
 	kfree(ca->prio_buckets);
 	kfree(ca->bio_prio);
-	vfree(ca->buckets);
-	vfree(ca->oldest_gens);
+	kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
+	kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
 	free_heap(&ca->heap);
 	free_fifo(&ca->free_inc);
@@ -1140,10 +1143,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	    !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
 	    !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
 	    !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
-	    !(ca->oldest_gens = vzalloc(sizeof(u8) *
-					ca->mi.nbuckets)) ||
-	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
-				    ca->mi.nbuckets)) ||
+	    !(ca->oldest_gens = kvpmalloc(ca->mi.nbuckets *
+					  sizeof(u8),
+					  GFP_KERNEL|__GFP_ZERO)) ||
+	    !(ca->buckets = kvpmalloc(ca->mi.nbuckets *
+				      sizeof(struct bucket),
+				      GFP_KERNEL|__GFP_ZERO)) ||
 	    !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
 					 2, GFP_KERNEL)) ||
 	    !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
@@ -1871,6 +1876,7 @@ static void bcachefs_exit(void)
 static int __init bcachefs_init(void)
 {
 	bch2_bkey_pack_test();
+	bch2_inode_pack_test();
 
 	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
 	    bch2_chardev_init() ||

View File

@@ -512,7 +512,7 @@ STORE(bch2_fs_opts_dir)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
 	const struct bch_option *opt;
-	enum bch_opt_id id;
+	int id;
 	u64 v;
 
 	id = bch2_parse_sysfs_opt(attr->name, buf, &v);

View File

@@ -417,3 +417,17 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
 		dst += bv.bv_len;
 	}
 }
+
+size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len)
+{
+	size_t n;
+
+	if (!size)
+		return 0;
+
+	n = min(size - 1, len);
+	memcpy(buf, src, n);
+	buf[n] = '\0';
+
+	return n;
+}
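
bch_scnmemcpy() is scnprintf()'s memcpy sibling: it copies from a length-delimited, possibly non-NUL-terminated source, always terminates the destination, and returns the byte count so calls chain with n += — exactly how the reworked bch2_xattr_to_text() below uses it. A userspace check of those semantics, with the kernel min() spelled out:

#include <stdio.h>
#include <string.h>

/* same logic as the bch_scnmemcpy() added above */
static size_t scnmemcpy(char *buf, size_t size, const char *src, size_t len)
{
	size_t n;

	if (!size)
		return 0;

	n = len < size - 1 ? len : size - 1;	/* min(size - 1, len) */
	memcpy(buf, src, n);
	buf[n] = '\0';
	return n;
}

int main(void)
{
	const char name[4] = { 'u', 's', 'e', 'r' };	/* no NUL terminator */
	char out[64];
	size_t n = 0;

	n += scnmemcpy(out + n, sizeof(out) - n, name, sizeof(name));
	n += snprintf(out + n, sizeof(out) - n, ":%d", 42);
	printf("%s (%zu bytes)\n", out, n);	/* prints: user:42 (7 bytes) */
	return 0;
}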

View File

@@ -93,7 +93,8 @@ static inline void kvpfree(void *p, size_t size)
 static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
 {
 	return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
-		: (void *) __get_free_pages(gfp_mask, get_order(size))
+		: (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+					    get_order(size))
 		?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
 }
 
@@ -750,4 +751,6 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio,
 #define bio_for_each_contig_segment(bv, bio, iter)			\
 	__bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
 
+size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
+
 #endif /* _BCACHE_UTIL_H */
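
Adding __GFP_NOWARN only to the page-allocator tier makes sense because failure there is expected and non-fatal: the GNU ?: chain simply falls through to __vmalloc(). A contrived userspace analogue of that tiered-fallback shape (illustrative only, GNU ?: extension):

#include <stdio.h>
#include <stdlib.h>

/* try the cheap contiguous path first, quietly fall back on failure */
static void *tiered_alloc(size_t size, int cheap_tier_fails)
{
	void *p = cheap_tier_fails ? NULL : malloc(size);

	return p ?: calloc(1, size);
}

int main(void)
{
	void *p = tiered_alloc(64, 1);	/* force the fallback tier */

	printf("fallback %s\n", p ? "succeeded" : "failed");
	free(p);
	return 0;
}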

View File

@@ -11,6 +11,16 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
 
+static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+	return DIV_ROUND_UP(sizeof(struct bch_xattr) +
+			    name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr)	((_xattr)->x_name + (_xattr)->x_name_len)
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
 struct xattr_search_key {
 	u8		type;
 	struct qstr	name;
@@ -31,8 +41,6 @@ static u64 bch2_xattr_hash(const struct bch_hash_info *info,
 	return bch2_str_hash_end(&ctx, info);
 }
 
-#define xattr_val(_xattr)	((_xattr)->x_name + (_xattr)->x_name_len)
-
 static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
 {
 	return bch2_xattr_hash(info, key);
@@ -66,7 +74,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 		memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
 }
 
-static const struct bch_hash_desc xattr_hash_desc = {
+const struct bch_hash_desc bch2_xattr_hash_desc = {
 	.btree_id	= BTREE_ID_XATTRS,
 	.key_type	= BCH_XATTR,
 	.whiteout_type	= BCH_XATTR_WHITEOUT,
@@ -79,12 +87,33 @@ static const struct bch_hash_desc xattr_hash_desc = {
 static const char *bch2_xattr_invalid(const struct bch_fs *c,
 				      struct bkey_s_c k)
 {
+	const struct xattr_handler *handler;
+	struct bkey_s_c_xattr xattr;
+	unsigned u64s;
+
 	switch (k.k->type) {
 	case BCH_XATTR:
-		return bkey_val_bytes(k.k) < sizeof(struct bch_xattr)
-			? "value too small"
-			: NULL;
+		if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+			return "value too small";
+
+		xattr = bkey_s_c_to_xattr(k);
+		u64s = xattr_val_u64s(xattr.v->x_name_len,
+				      le16_to_cpu(xattr.v->x_val_len));
+
+		if (bkey_val_u64s(k.k) < u64s)
+			return "value too small";
+
+		if (bkey_val_u64s(k.k) > u64s)
+			return "value too big";
+
+		handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+		if (!handler)
+			return "invalid type";
+
+		if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
+			return "xattr name has invalid characters";
+
+		return NULL;
 	case BCH_XATTR_WHITEOUT:
 		return bkey_val_bytes(k.k) != 0
 			? "value size should be zero"
@@ -98,34 +127,29 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c,
 static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
 			       size_t size, struct bkey_s_c k)
 {
+	const struct xattr_handler *handler;
 	struct bkey_s_c_xattr xattr;
-	int n;
+	size_t n = 0;
 
 	switch (k.k->type) {
 	case BCH_XATTR:
 		xattr = bkey_s_c_to_xattr(k);
 
-		if (size) {
-			n = min_t(unsigned, size, xattr.v->x_name_len);
-			memcpy(buf, xattr.v->x_name, n);
-			buf[size - 1] = '\0';
-			buf += n;
-			size -= n;
-		}
-
-		n = scnprintf(buf, size, " -> ");
-		buf += n;
-		size -= n;
-
-		if (size) {
-			n = min_t(unsigned, size,
-				  le16_to_cpu(xattr.v->x_val_len));
-			memcpy(buf, xattr_val(xattr.v), n);
-			buf[size - 1] = '\0';
-			buf += n;
-			size -= n;
-		}
+		handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+		if (handler && handler->prefix)
+			n += scnprintf(buf + n, size - n, "%s", handler->prefix);
+		else if (handler)
+			n += scnprintf(buf + n, size - n, "(type %u)",
+				       xattr.v->x_type);
+		else
+			n += scnprintf(buf + n, size - n, "(unknown type %u)",
+				       xattr.v->x_type);
+
+		n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name,
+				   xattr.v->x_name_len);
+		n += scnprintf(buf + n, size - n, ":");
+		n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v),
+				   le16_to_cpu(xattr.v->x_val_len));
 		break;
 	case BCH_XATTR_WHITEOUT:
 		scnprintf(buf, size, "whiteout");
@@ -147,7 +171,7 @@ int bch2_xattr_get(struct bch_fs *c, struct inode *inode,
 	struct bkey_s_c_xattr xattr;
 	int ret;
 
-	k = bch2_hash_lookup(xattr_hash_desc, &ei->str_hash, c,
+	k = bch2_hash_lookup(bch2_xattr_hash_desc, &ei->str_hash, c,
 			     ei->vfs_inode.i_ino, &iter,
 			     &X_SEARCH(type, name, strlen(name)));
 	if (IS_ERR(k.k))
@@ -175,15 +199,13 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum,
 	int ret;
 
 	if (!value) {
-		ret = bch2_hash_delete(xattr_hash_desc, hash_info,
+		ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
 				       c, inum,
 				       journal_seq, &search);
 	} else {
 		struct bkey_i_xattr *xattr;
 		unsigned u64s = BKEY_U64s +
-			DIV_ROUND_UP(sizeof(struct bch_xattr) +
-				     search.name.len + size,
-				     sizeof(u64));
+			xattr_val_u64s(search.name.len, size);
 
 		if (u64s > U8_MAX)
 			return -ERANGE;
@@ -200,7 +222,7 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum,
 		memcpy(xattr->v.x_name, search.name.name, search.name.len);
 		memcpy(xattr_val(&xattr->v), value, size);
 
-		ret = bch2_hash_set(xattr_hash_desc, hash_info, c,
+		ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c,
 				    inum, journal_seq,
 				    &xattr->k_i,
 				    (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
@@ -225,8 +247,6 @@ int bch2_xattr_set(struct bch_fs *c, struct inode *inode,
 				 &ei->journal_seq);
 }
 
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
 static size_t bch2_xattr_emit(struct dentry *dentry,
 			      const struct bch_xattr *xattr,
 			      char *buffer, size_t buffer_size)
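
The stricter bch2_xattr_invalid() can demand an exact size because the key itself records both lengths: anything smaller is truncated, anything larger is trailing slack that comparisons might read. A worked example of the rounding — the 8-byte fixed header below is an assumption for illustration, the real size is sizeof(struct bch_xattr):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define HDR_BYTES		8	/* assumed header size, sketch only */

static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
	return DIV_ROUND_UP(HDR_BYTES + name_len + val_len, 8);
}

int main(void)
{
	/* 8-byte name "user.foo" + 5-byte value: 21 bytes -> 3 u64 words */
	printf("%u u64s\n", xattr_val_u64s(8, 5));
	return 0;
}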

View File

@@ -1,6 +1,9 @@
 #ifndef _BCACHE_XATTR_H
 #define _BCACHE_XATTR_H
 
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
 extern const struct bkey_ops bch2_bkey_xattr_ops;
 
 struct dentry;

View File

@@ -21,32 +21,16 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
-			struct bio *src, struct bvec_iter src_iter)
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+			struct bio *src, struct bvec_iter *src_iter)
 {
 	struct bio_vec src_bv, dst_bv;
 	void *src_p, *dst_p;
 	unsigned bytes;
 
-	while (1) {
-		if (!src_iter.bi_size) {
-			src = src->bi_next;
-			if (!src)
-				break;
-
-			src_iter = src->bi_iter;
-		}
-
-		if (!dst_iter.bi_size) {
-			dst = dst->bi_next;
-			if (!dst)
-				break;
-
-			dst_iter = dst->bi_iter;
-		}
-
-		src_bv = bio_iter_iovec(src, src_iter);
-		dst_bv = bio_iter_iovec(dst, dst_iter);
+	while (src_iter->bi_size && dst_iter->bi_size) {
+		src_bv = bio_iter_iovec(src, *src_iter);
+		dst_bv = bio_iter_iovec(dst, *dst_iter);
 
 		bytes = min(src_bv.bv_len, dst_bv.bv_len);
@@ -60,15 +44,27 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
 		kunmap_atomic(dst_p);
 		kunmap_atomic(src_p);
 
-		bio_advance_iter(src, &src_iter, bytes);
-		bio_advance_iter(dst, &dst_iter, bytes);
+		flush_dcache_page(dst_bv.bv_page);
+
+		bio_advance_iter(src, src_iter, bytes);
+		bio_advance_iter(dst, dst_iter, bytes);
 	}
 }
 
+/**
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
 void bio_copy_data(struct bio *dst, struct bio *src)
 {
-	bio_copy_data_iter(dst, dst->bi_iter,
-			   src, src->bi_iter);
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
+
+	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
 }
 
 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
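
Passing the bvec_iters by pointer changes the contract: the caller's iterators are advanced by exactly how much was copied, so a partial copy can be resumed, and bio_copy_data() now takes explicit local copies rather than relying on by-value argument copies. A userspace model of that resumable-iterator contract, with simplified stand-in types:

#include <stdio.h>

struct iter { size_t pos, size; };	/* stand-in for struct bvec_iter */

/* copies min(remaining src, remaining dst); advances both iterators */
static void copy_iter(char *dst, struct iter *di,
		      const char *src, struct iter *si)
{
	while (si->size && di->size) {
		dst[di->pos++] = src[si->pos++];
		si->size--;
		di->size--;
	}
}

int main(void)
{
	const char src[] = "hello, world";
	char dst[16] = { 0 };
	struct iter si = { 0, 5 }, di = { 0, sizeof(dst) - 1 };

	copy_iter(dst, &di, src, &si);	/* copies "hello", iters advance */
	si.size = 7;			/* resume: 7 more source bytes */
	copy_iter(dst, &di, src, &si);
	printf("%s\n", dst);		/* hello, world */
	return 0;
}

The added flush_dcache_page() is the other behavioral change: the copy writes through kmap_atomic(), so on architectures with aliasing D-caches the data must be flushed before the page is read through a different mapping.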