diff --git a/.bcachefs_revision b/.bcachefs_revision index 3f01000e..89d60672 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -6afa1fcb13a8c66b1cafa08027f484a3f846c52d +70b5fb5dafe66482c0d09a37bd547f56ef645bc4 diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 0cde2638..5070caf8 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -340,7 +340,7 @@ btree_err: if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 131d0f7b..fdf3a777 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -218,8 +218,8 @@ #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) fmt "\n" -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) #endif #define bch_info(c, fmt, ...) 
\ @@ -495,6 +495,7 @@ struct bch_dev { enum { /* startup: */ + BCH_FS_INITIALIZED, BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_RUNNING, diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 52212ad1..b115bd1f 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1443,7 +1443,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist * reflink: gates KEY_TYPE_reflink * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_SIPHASH + * new_siphash: gates BCH_STR_HASH_siphash * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE */ #define BCH_SB_FEATURES() \ @@ -1519,12 +1519,17 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash_old, 2) \ + x(siphash, 3) + enum bch_str_hash_type { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH_OLD = 2, - BCH_STR_HASH_SIPHASH = 3, - BCH_STR_HASH_NR = 4, +#define x(t, n) BCH_STR_HASH_##t = n, + BCH_STR_HASH_TYPES() +#undef x + BCH_STR_HASH_NR }; #define BCH_STR_HASH_OPTS() \ @@ -1539,34 +1544,39 @@ enum bch_str_hash_opts { BCH_STR_HASH_OPT_NR }; +#define BCH_CSUM_TYPES() \ + x(none, 0) \ + x(crc32c_nonzero, 1) \ + x(crc64_nonzero, 2) \ + x(chacha20_poly1305_80, 3) \ + x(chacha20_poly1305_128, 4) \ + x(crc32c, 5) \ + x(crc64, 6) \ + x(xxhash, 7) + enum bch_csum_type { - BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C_NONZERO = 1, - BCH_CSUM_CRC64_NONZERO = 2, - BCH_CSUM_CHACHA20_POLY1305_80 = 3, - BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_CRC32C = 5, - BCH_CSUM_CRC64 = 6, - BCH_CSUM_XXHASH = 7, - BCH_CSUM_NR = 8, +#define x(t, n) BCH_CSUM_##t = n, + BCH_CSUM_TYPES() +#undef x + BCH_CSUM_NR }; static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C_NONZERO] = 4, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64_NONZERO] = 8, - 
[BCH_CSUM_CRC64] = 8, - [BCH_CSUM_XXHASH] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, + [BCH_CSUM_none] = 0, + [BCH_CSUM_crc32c_nonzero] = 4, + [BCH_CSUM_crc32c] = 4, + [BCH_CSUM_crc64_nonzero] = 8, + [BCH_CSUM_crc64] = 8, + [BCH_CSUM_xxhash] = 8, + [BCH_CSUM_chacha20_poly1305_80] = 10, + [BCH_CSUM_chacha20_poly1305_128] = 16, }; static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) { switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: return true; default: return false; diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 905237a7..5ae61e5d 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -301,7 +301,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, list_for_each_entry_safe(b, t, &bc->freeable, list) { touched++; - if (freed >= nr) + if (touched >= nr) break; if (++i > 3 && @@ -316,7 +316,7 @@ restart: list_for_each_entry_safe(b, t, &bc->live, list) { touched++; - if (freed >= nr) { + if (touched >= nr) { /* Save position */ if (&t->list != &bc->live) list_move_tail(&bc->live, &t->list); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index f5d8a730..f43044e6 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -49,7 +49,7 @@ static inline int __btree_path_cmp(const struct btree_path *l, unsigned r_level) { return cmp_int(l->btree_id, r_btree_id) ?: - cmp_int(l->cached, r_cached) ?: + cmp_int((int) l->cached, (int) r_cached) ?: bpos_cmp(l->pos, r_pos) ?: -cmp_int(l->level, r_level); } @@ -760,6 +760,43 @@ out: return ret; } +void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + char buf[100]; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: 
+ cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!(path->nodes_locked & 1) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && + bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + return; + } else { + if (!bkey_cmp(pos, path->pos)) + return; + } + } + + bch2_dump_trans_paths_updates(trans); + panic("not locked: %s %s%s\n", + bch2_btree_ids[id], + (bch2_bpos_to_text(&PBUF(buf), pos), buf), + key_cache ? " cached" : ""); +} + #else static inline void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -1647,19 +1684,19 @@ static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btr return NULL; } -static bool have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) { struct btree_path *next; next = prev_btree_path(trans, path); - if (next && path_l(next)->b == path_l(path)->b) - return true; + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; next = next_btree_path(trans, path); - if (next && path_l(next)->b == path_l(path)->b) - return true; + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; - return false; + return NULL; } static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) @@ -1686,11 +1723,20 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte (dup = have_path_at_pos(trans, path))) { dup->preserve = true; path->preserve = false; + goto free; } if (!path->preserve && - have_node_at_pos(trans, path)) - __bch2_path_free(trans, path); + (dup = have_node_at_pos(trans, path))) + goto free; + return; +free: + if (path->should_be_locked && + !btree_node_locked(dup, path->level)) + return; + + dup->should_be_locked |= path->should_be_locked; + __bch2_path_free(trans, path); } 
noinline __cold @@ -1704,11 +1750,13 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) btree_trans_verify_sorted(trans); trans_for_each_path_inorder(trans, path, idx) - printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, - path->preserve ? " preserve" : "", + path->should_be_locked ? " S" : "", + path->preserve ? " P" : "", bch2_btree_ids[path->btree_id], (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated #else diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 2dc58828..33a703c2 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -140,9 +140,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bke #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_trans_verify_locks(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} #endif void bch2_btree_path_fix_key_modified(struct btree_trans *trans, @@ -227,8 +231,6 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.p.offset = iter->pos.offset = new_pos.offset; iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; iter->k.size = 0; - if (iter->path->ref == 1) - iter->path->should_be_locked = false; } static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index affc0e68..0d0a719f 100644 --- a/libbcachefs/btree_types.h +++ 
b/libbcachefs/btree_types.h @@ -338,7 +338,8 @@ struct btree_insert_entry { enum btree_id btree_id:8; u8 level; bool cached:1; - bool trans_triggers_run:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; struct bkey_i *k; struct btree_path *path; unsigned long ip_allocated; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 51f65226..112ac7ca 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -815,10 +815,112 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; + struct bkey unpacked; + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + bool trans_trigger_run; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. 
+ */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->insert_trigger_run || + (i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + continue; + + BUG_ON(i->overwrite_trigger_run); + + i->insert_trigger_run = true; + trans_trigger_run = true; + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { + ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|i->flags); + } + + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; + } + } while (trans_trigger_run); + + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->overwrite_trigger_run || + (i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + continue; + + BUG_ON(!i->insert_trigger_run); + + i->overwrite_trigger_run = true; + trans_trigger_run = true; + + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + ret = bch2_trans_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|i->flags); + + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; + } + } 
while (trans_trigger_run); + } + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; - bool trans_trigger_run; unsigned u64s; int ret = 0; @@ -853,30 +955,9 @@ int __bch2_trans_commit(struct btree_trans *trans) i->btree_id, i->k->k.p); #endif - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - trans_for_each_update(trans, i) { - if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - !i->trans_triggers_run) { - i->trans_triggers_run = true; - trans_trigger_run = true; - - ret = bch2_trans_mark_update(trans, i->path, - i->k, i->flags); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, - i->btree_id, &i->path->pos); - goto out; - } - } - } - } while (trans_trigger_run); + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out; trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); @@ -1285,7 +1366,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (i < trans->updates + trans->nr_updates && !btree_insert_entry_cmp(&n, i)) { - BUG_ON(i->trans_triggers_run); + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); /* * This is a hack to ensure that inode creates update the btree, diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index c3542d3c..6fc93b56 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -117,6 +117,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? 
ca->usage_gc : ca->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -142,6 +144,8 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? c->usage_gc : c->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -360,6 +364,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; + /* + * Hack for bch2_fs_initialize path, where we're first marking sb and + * journal non-transactionally: + */ + if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) + journal_seq = 1; + percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); @@ -1866,41 +1877,6 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, } } -int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *new, - unsigned flags) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - int ret; - - _deleted.p = path->pos; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(path->btree_id)) - return 0; - - old = bch2_btree_path_peek_slot(path, &unpacked); - - if (old.k->type == new->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 8a9b2b56..5ed9441c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ 
-233,8 +233,6 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *, int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_update(struct btree_trans *, struct btree_path *, - struct bkey_i *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index d20924e5..fbe8603c 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -35,18 +35,18 @@ struct bch2_checksum_state { static void bch2_checksum_init(struct bch2_checksum_state *state) { switch (state->type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: state->seed = 0; break; - case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_crc32c_nonzero: state->seed = U32_MAX; break; - case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_crc64_nonzero: state->seed = U64_MAX; break; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: xxh64_reset(&state->h64state, 0); break; default: @@ -57,15 +57,15 @@ static void bch2_checksum_init(struct bch2_checksum_state *state) static u64 bch2_checksum_final(const struct bch2_checksum_state *state) { switch (state->type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return state->seed; - case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_crc32c_nonzero: return state->seed ^ U32_MAX; - case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_crc64_nonzero: return state->seed ^ U64_MAX; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: return xxh64_digest(&state->h64state); default: BUG(); @@ -75,17 +75,17 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state) static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) { switch 
(state->type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC32C: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc32c: state->seed = crc32c(state->seed, data, len); break; - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC64: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc64: state->seed = crc64_be(state->seed, data, len); break; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: xxh64_update(&state->h64state, data, len); break; default: @@ -161,12 +161,12 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, struct nonce nonce, const void *data, size_t len) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_XXHASH: - case BCH_CSUM_CRC64: { + case BCH_CSUM_none: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; @@ -177,8 +177,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; @@ -212,13 +212,13 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, struct bio_vec bv; switch (type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return (struct bch_csum) { 0 }; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_XXHASH: - case BCH_CSUM_CRC64: { + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; @@ -238,8 +238,8 
@@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 6841fb16..f5c1a609 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -13,9 +13,9 @@ static inline bool bch2_checksum_mergeable(unsigned type) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return true; default: return false; @@ -78,13 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, { switch (type) { case BCH_CSUM_OPT_none: - return BCH_CSUM_NONE; + return BCH_CSUM_none; case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_XXHASH; + return BCH_CSUM_xxhash; default: BUG(); } @@ -95,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, { if (c->sb.encryption_type) return c->opts.wide_macs - ? BCH_CSUM_CHACHA20_POLY1305_128 - : BCH_CSUM_CHACHA20_POLY1305_80; + ? 
BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; return bch2_csum_opt_to_type(opt, true); } @@ -104,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) - return BCH_CSUM_CHACHA20_POLY1305_128; + return BCH_CSUM_chacha20_poly1305_128; return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 9267eea8..4dfcc955 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -197,8 +197,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - struct bkey_s_c_dirent d, subvol_inum *target) +int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) { struct bch_subvolume s; int ret = 0; @@ -418,16 +418,15 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } + if (ret) + goto err; d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, dir, d, inum); if (ret > 0) ret = -ENOENT; +err: if (ret) bch2_trans_iter_exit(trans, iter); @@ -448,10 +447,10 @@ retry: ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, name, inum, 0); - - bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 8ae40776..1bb4d802 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -29,6 +29,9 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } +int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + int 
bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index bfa512d7..bca1b8a7 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1154,7 +1154,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; + s->v.csum_type = BCH_CSUM_crc32c; s->v.pad = 0; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 194fbe21..89b5be90 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -969,12 +969,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); break; case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index a2367b3e..d3d48a5b 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -1172,16 +1172,16 @@ static int __bch2_writepage(struct page *page, do_io: s = bch2_page_state_create(page, __GFP_NOFAIL); - ret = bch2_get_page_disk_reservation(c, inode, page, true); - if (ret) { - SetPageError(page); - mapping_set_error(page->mapping, ret); - unlock_page(page); - return 0; - } + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_page_disk_reservation(c, inode, page, false); + BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ + 
spin_lock(&s->lock); orig = *s; + spin_unlock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) { if (s->s[i].state < SECTOR_DIRTY) @@ -1214,7 +1214,7 @@ do_io: offset = 0; while (1) { - unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; while (offset < PAGE_SECTORS && @@ -1224,16 +1224,15 @@ do_io: if (offset == PAGE_SECTORS) break; - sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; - while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state >= SECTOR_DIRTY) + orig.s[offset + sectors].state >= SECTOR_DIRTY) { + reserved_sectors += orig.s[offset + sectors].replicas_reserved; + dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; sectors++; - - for (i = offset; i < offset + sectors; i++) { - reserved_sectors += orig.s[i].replicas_reserved; - dirty_sectors += orig.s[i].state == SECTOR_DIRTY; } + BUG_ON(!sectors); + + sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || @@ -2189,12 +2188,13 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2 = 0; + int ret, ret2, ret3; ret = file_write_and_wait_range(file, start, end); - ret2 = bch2_flush_inode(c, inode_inum(inode)); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode_inum(inode)); - return ret ?: ret2; + return ret ?: ret2 ?: ret3; } /* truncate: */ @@ -2299,6 +2299,14 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, s->s[i].state = SECTOR_UNALLOCATED; } + /* + * Caller needs to know whether this page will be written out by + * writeback - doing an i_size update if necessary - or whether it will + * be responsible for the i_size update: + */ + ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), + PAGE_SIZE) - 1) >> 9].state >= 
SECTOR_DIRTY; + zero_user_segment(page, start_offset, end_offset); /* @@ -2307,8 +2315,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - ret = bch2_get_page_disk_reservation(c, inode, page, false); - BUG_ON(ret); + BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); /* * This removes any writeable userspace mappings; we need to force @@ -2330,6 +2337,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } +static int bch2_truncate_pages(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, + start, end); + + if (ret >= 0 && + start >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + start, end); + return ret; +} + static int bch2_extend(struct user_namespace *mnt_userns, struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, @@ -2420,7 +2441,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, iattr->ia_valid &= ~ATTR_SIZE; ret = bch2_truncate_page(inode, iattr->ia_size); - if (unlikely(ret)) + if (unlikely(ret < 0)) goto err; /* @@ -2486,48 +2507,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode, static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 discard_start = round_up(offset, block_bytes(c)) >> 9; - u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; + u64 end = offset + len; + u64 block_start = round_up(offset, block_bytes(c)); + u64 block_end = round_down(end, block_bytes(c)); + bool truncated_last_page; int ret = 0; - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - 
offset, offset + len); - if (unlikely(ret)) + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) goto err; - if (offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) { - ret = __bch2_truncate_page(inode, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto err; - } + truncated_last_page = ret; - truncate_pagecache_range(&inode->v, offset, offset + len - 1); + truncate_pagecache_range(&inode->v, offset, end - 1); - if (discard_start < discard_end) { + if (block_start < block_end ) { s64 i_sectors_delta = 0; ret = bch2_fpunch(c, inode_inum(inode), - discard_start, discard_end, + block_start >> 9, block_end >> 9, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME) ?: ret; + if (end >= inode->v.i_size && !truncated_last_page) { + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + } else { + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } mutex_unlock(&inode->ei_update_lock); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; } @@ -2547,31 +2559,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - /* - * We need i_mutex to keep the page cache consistent with the extents - * btree, and the btree consistent with i_size - we don't need outside - * locking for the extents btree itself, because we're using linked - * iterators - */ - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - if (insert) { - ret = -EFBIG; if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - goto err; + return -EFBIG; - ret = -EINVAL; if (offset >= inode->v.i_size) - goto err; + return -EINVAL; src_start = U64_MAX; shift = len; } else { - ret 
= -EINVAL; if (offset + len >= inode->v.i_size) - goto err; + return -EINVAL; src_start = offset + len; shift = -len; @@ -2581,7 +2580,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) - goto err; + return ret; if (insert) { i_size_write(&inode->v, new_size); @@ -2598,7 +2597,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, i_sectors_acct(c, inode, NULL, i_sectors_delta); if (ret) - goto err; + return ret; } bch2_bkey_buf_init(&copy); @@ -2711,18 +2710,19 @@ reassemble: bch2_bkey_buf_exit(&copy, c); if (ret) - goto err; + return ret; + mutex_lock(&inode->ei_update_lock); if (!insert) { i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, new_size, ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -2817,6 +2817,17 @@ bkey_err: if (ret == -EINTR) ret = 0; } + + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); + i_sectors_acct(c, inode, &quota_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, &quota_res); + } + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -2825,77 +2836,58 @@ bkey_err: static long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { - struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - loff_t end = offset + len; - loff_t block_start = round_down(offset, block_bytes(c)); - loff_t block_end = 
round_up(end, block_bytes(c)); - int ret; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + u64 end = offset + len; + u64 block_start = round_down(offset, block_bytes(c)); + u64 block_end = round_up(end, block_bytes(c)); + bool truncated_last_page = false; + int ret, ret2 = 0; if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); if (ret) - goto err; + return ret; } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, end); + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) + return ret; - if (!ret && - offset >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - offset, end); - - if (unlikely(ret)) - goto err; + truncated_last_page = ret; truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, block_bytes(c)); + block_end = round_down(end, block_bytes(c)); } ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - if (ret) - goto err; /* - * Do we need to extend the file? 
- * - * If we zeroed up to the end of the file, we dropped whatever writes - * were going to write out the current i_size, so we have to extend - * manually even if FL_KEEP_SIZE was set: + * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, + * so that the VFS cache i_size is consistent with the btree i_size: */ + if (ret && + !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + return ret; + + if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) + end = inode->v.i_size; + if (end >= inode->v.i_size && - (!(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_ZERO_RANGE))) { - - /* - * Sync existing appends before extending i_size, - * as in bch2_extend(): - */ - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); - if (ret) - goto err; - - if (mode & FALLOC_FL_KEEP_SIZE) - end = inode->v.i_size; - else - i_size_write(&inode->v, end); + (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || + !(mode & FALLOC_FL_KEEP_SIZE))) { + spin_lock(&inode->v.i_lock); + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, end, 0); + ret2 = bch2_write_inode_size(c, inode, end, 0); mutex_unlock(&inode->ei_update_lock); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; + + return ret ?: ret2; } long bch2_fallocate_dispatch(struct file *file, int mode, @@ -2908,6 +2900,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, if (!percpu_ref_tryget(&c->writes)) return -EROFS; + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -2919,6 +2915,9 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; + + 
bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); percpu_ref_put(&c->writes); return ret; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 74de7bc1..fc29e6c4 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -36,7 +36,7 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); @@ -92,11 +92,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) __pagecache_lock_get(lock, -1); } -void bch2_inode_update_after_write(struct bch_fs *c, +void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + 0 && c->opts.inodes_use_key_cache); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); @@ -125,6 +133,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, int ret; bch2_trans_init(&trans, c, 0, 512); + trans.ip = _RET_IP_; retry: bch2_trans_begin(&trans); @@ -139,7 +148,7 @@ retry: * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_inode_update_after_write(&trans, inode, &inode_u, fields); bch2_trans_iter_exit(&trans, &iter); @@ -214,6 +223,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; + struct btree_trans trans; int ret; inode = to_bch_ei(iget5_locked(c->vfs_sb, @@ -226,14 +236,19 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + 
bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_trans_exit(&trans); + if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inum, inode, &inode_u); - unlock_new_inode(&inode->v); return &inode->v; @@ -305,7 +320,7 @@ err_before_quota: } if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&dir->ei_update_lock); } @@ -313,7 +328,8 @@ err_before_quota: inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.inum = inode_u.bi_inum; - bch2_vfs_inode_init(c, inum, inode, &inode_u); + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -428,11 +444,9 @@ static int __bch2_link(struct bch_fs *c, &dentry->d_name)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_exit(&trans); @@ -480,11 +494,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, deleting_snapshot)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_MTIME); } @@ -612,18 +624,18 @@ static int bch2_rename2(struct user_namespace *mnt_userns, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - 
bch2_inode_update_after_write(c, src_dir, &src_dir_u, + bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); if (src_dir != dst_dir) - bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, src_inode, &src_inode_u, + bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) - bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: bch2_trans_exit(&trans); @@ -741,7 +753,7 @@ btree_err: if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -1110,53 +1122,237 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; -#if 0 +struct bcachefs_fid { + u64 inum; + u32 subvol; + u32 gen; +} __packed; + +struct bcachefs_fid_with_parent { + struct bcachefs_fid fid; + struct bcachefs_fid dir; +} __packed; + +static int bcachefs_fid_valid(int fh_len, int fh_type) +{ + switch (fh_type) { + case FILEID_BCACHEFS_WITHOUT_PARENT: + return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); + case FILEID_BCACHEFS_WITH_PARENT: + return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); + default: + return false; + } +} + +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) +{ + return (struct bcachefs_fid) { + .inum = inode->ei_inode.bi_inum, + .subvol = inode->ei_subvol, + .gen = inode->ei_inode.bi_generation, + }; +} + +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, + struct inode *vdir) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *dir = to_bch_ei(vdir); + + if (*len < 
sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) + return FILEID_INVALID; + + if (!S_ISDIR(inode->v.i_mode) && dir) { + struct bcachefs_fid_with_parent *fid = (void *) fh; + + fid->fid = bch2_inode_to_fid(inode); + fid->dir = bch2_inode_to_fid(dir); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITH_PARENT; + } else { + struct bcachefs_fid *fid = (void *) fh; + + *fid = bch2_inode_to_fid(inode); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITHOUT_PARENT; + } +} + static struct inode *bch2_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) + struct bcachefs_fid fid) { struct bch_fs *c = sb->s_fs_info; - struct inode *vinode; - - if (ino < BCACHEFS_ROOT_INO) - return ERR_PTR(-ESTALE); - - vinode = bch2_vfs_inode_get(c, ino); - if (IS_ERR(vinode)) - return ERR_CAST(vinode); - if (generation && vinode->i_generation != generation) { - /* we didn't find the right inode.. */ + struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { + .subvol = fid.subvol, + .inum = fid.inum, + }); + if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { iput(vinode); - return ERR_PTR(-ESTALE); + vinode = ERR_PTR(-ESTALE); } return vinode; } -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type)) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); } -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid_with_parent *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, 
fh_type) || + fh_type != FILEID_BCACHEFS_WITH_PARENT) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); +} + +static struct dentry *bch2_get_parent(struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + subvol_inum parent_inum = { + .subvol = inode->ei_inode.bi_parent_subvol ?: + inode->ei_subvol, + .inum = inode->ei_inode.bi_dir, + }; + + if (!parent_inum.inum) + return NULL; + + return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter1; + struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; + unsigned name_len; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); + bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter1, snapshot); + bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { + bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + + k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { + ret = -ENOENT; + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = 
bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } else { + /* + * File with multiple hardlinks and our backref is to the wrong + * directory - linear search: + */ + for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret < 0) + break; + if (ret) + continue; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } + } + + ret = -ENOENT; + goto err; +found: + name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); + bch2_trans_iter_exit(&trans, &iter2); + bch2_trans_exit(&trans); + + return ret; } -#endif static const struct export_operations bch_export_ops = { - //.fh_to_dentry = bch2_fh_to_dentry, - //.fh_to_parent = bch2_fh_to_parent, - //.get_parent = bch2_get_parent, + .encode_fh = bch2_encode_fh, + .fh_to_dentry = bch2_fh_to_dentry, + .fh_to_parent = bch2_fh_to_parent, + .get_parent = bch2_get_parent, + .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { - bch2_inode_update_after_write(c, inode, bi, ~0); + bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; @@ -1655,6 +1851,8 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif + sb->s_shrink.seeks = 0; + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { 
bch_err(c, "error mounting: error getting root inode %i", diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 40212b3d..27aacd7e 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -173,7 +173,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); -void bch2_inode_update_after_write(struct bch_fs *, +void bch2_inode_update_after_write(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index ef1866a7..ffce68a8 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -220,6 +220,7 @@ int bch2_inode_unpack(struct bkey_s_c k, struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= 0; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); @@ -722,9 +723,9 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) { struct btree_iter iter; int ret; diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index d433d48d..723186d8 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -89,6 +89,8 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 7c9ea91d..3a6b4446 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -187,7 +187,6 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct 
bio *bio, int bch2_sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, - bool *maybe_extending, bool *usage_increasing, s64 *i_sectors_delta, s64 *disk_sectors_delta) @@ -199,7 +198,6 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; - *maybe_extending = true; *usage_increasing = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; @@ -226,31 +224,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; - if (bkey_cmp(old.k->p, new->k.p) >= 0) { - /* - * Check if there's already data above where we're - * going to be writing to - this means we're definitely - * not extending the file: - * - * Note that it's not sufficient to check if there's - * data up to the sector offset we're going to be - * writing to, because i_size could be up to one block - * less: - */ - if (!bkey_cmp(old.k->p, new->k.p)) { - old = bch2_btree_iter_next(&iter); - ret = bkey_err(old); - if (ret) - break; - } - - if (old.k && !bkey_err(old) && - old.k->p.inode == extent_iter->pos.inode && - bkey_extent_is_data(old.k)) - *maybe_extending = false; - + if (bkey_cmp(old.k->p, new->k.p) >= 0) break; - } } bch2_trans_iter_exit(trans, &iter); @@ -267,12 +242,10 @@ int bch2_extent_update(struct btree_trans *trans, s64 *i_sectors_delta_total, bool check_enospc) { - /* this must live until after bch2_trans_commit(): */ - struct bkey_inode_buf inode_p; struct btree_iter inode_iter; struct bch_inode_unpacked inode_u; struct bpos next_pos; - bool extending = false, usage_increasing; + bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -290,84 +263,51 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; + new_i_size = min(k->k.p.offset << 9, new_i_size); + next_pos = k->k.p; + ret = bch2_sum_sector_overwrites(trans, iter, 
k, - &extending, &usage_increasing, &i_sectors_delta, &disk_sectors_delta); if (ret) return ret; - if (!usage_increasing) - check_enospc = false; - if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - !check_enospc + !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; } - new_i_size = extending - ? min(k->k.p.offset << 9, new_i_size) - : 0; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT); if (ret) return ret; - /* - * XXX: - * writeback can race a bit with truncate, because truncate - * first updates the inode then truncates the pagecache. This is - * ugly, but lets us preserve the invariant that the in memory - * i_size is always >= the on disk i_size. - * - BUG_ON(new_i_size > inode_u.bi_size && - (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); - */ - BUG_ON(new_i_size > inode_u.bi_size && !extending); - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && new_i_size > inode_u.bi_size) inode_u.bi_size = new_i_size; - else - new_i_size = 0; inode_u.bi_sectors += i_sectors_delta; - if (i_sectors_delta || new_i_size) { - bch2_inode_pack(trans->c, &inode_p, &inode_u); - - inode_p.inode.k.p.snapshot = iter->snapshot; - - ret = bch2_trans_update(trans, &inode_iter, - &inode_p.inode.k_i, 0); - } - + ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); bch2_trans_iter_exit(trans, &inode_iter); if (ret) return ret; - next_pos = k->k.p; - - ret = bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, journal_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL); - BUG_ON(ret == -ENOSPC); - if (ret) - return ret; - - bch2_btree_iter_set_pos(iter, next_pos); - if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; + 
bch2_btree_iter_set_pos(iter, next_pos); + return 0; } @@ -385,26 +325,31 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, int ret = 0, ret2 = 0; u32 snapshot; - while (1) { + while (!ret || ret == -EINTR) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + if (ret) + ret2 = ret; + bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - goto btree_err; + continue; bch2_btree_iter_set_snapshot(iter, snapshot); k = bch2_btree_iter_peek(iter); - if (bkey_cmp(iter->pos, end_pos) >= 0) + if (bkey_cmp(iter->pos, end_pos) >= 0) { + bch2_btree_iter_set_pos(iter, end_pos); break; + } ret = bkey_err(k); if (ret) - goto btree_err; + continue; bkey_init(&delete.k); delete.k.p = iter->pos; @@ -417,18 +362,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, &disk_res, NULL, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; } - if (bkey_cmp(iter->pos, end_pos) > 0) - bch2_btree_iter_set_pos(iter, end_pos); - return ret ?: ret2; } @@ -2104,7 +2039,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_NONE && + (pick.crc.csum_type != BCH_CSUM_none && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || diff --git a/libbcachefs/io.h b/libbcachefs/io.h index fbe46660..1aa422dc 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -56,7 +56,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) } int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, bool *, s64 *, s64 *); + struct bkey_i *, bool *, s64 *, s64 *); int 
bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64 *, u64, s64 *, bool); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index a2b26d5b..14bea8a2 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -107,7 +107,12 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - j->err_seq = journal_cur_seq(j); + /* + * XXX: we're not using j->lock here because this can be called from + * interrupt context, this can race with journal_write_done() + */ + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } @@ -551,7 +556,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); + if (WARN_ONCE(seq > journal_cur_seq(j), + "requested to flush journal seq %llu, but currently at %llu", + seq, journal_cur_seq(j))) + goto out; /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 53aad1d0..5c8304e0 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1259,14 +1259,15 @@ static void journal_write_done(struct closure *cl) if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; - j->seq_ondisk = seq; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; + if (!err) { + j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - } + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index a93f5b18..ca482c67 100644 --- 
a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -646,6 +646,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (fifo_free(&j->pin) <= 32) min_nr = 1; + if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + min_nr = 1; + trace_journal_reclaim_start(c, min_nr, j->prereserved.reserved, diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 0152fbcd..64e39c10 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -160,7 +160,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct extent_ptr_decoded p; struct bpos next_pos; bool did_work = false; - bool extending = false, should_check_enospc; + bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; bch2_trans_begin(&trans); @@ -226,7 +226,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.data_replicas); ret = bch2_sum_sector_overwrites(&trans, &iter, insert, - &extending, &should_check_enospc, &i_sectors_delta, &disk_sectors_delta); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index ff99c6d2..a955ef20 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -31,17 +31,32 @@ const char * const bch2_btree_ids[] = { NULL }; +const char * const bch2_csum_types[] = { + BCH_CSUM_TYPES() + NULL +}; + const char * const bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; +const char * const bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL +}; + const char * const bch2_compression_opts[] = { BCH_COMPRESSION_OPTS() NULL }; const char * const bch2_str_hash_types[] = { + BCH_STR_HASH_TYPES() + NULL +}; + +const char * const bch2_str_hash_opts[] = { BCH_STR_HASH_OPTS() NULL }; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index b60bdfca..5d9c00af 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -12,9 +12,12 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * 
const bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; @@ -140,7 +143,7 @@ enum opt_type { NULL, NULL) \ x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_types), \ + OPT_STR(bch2_str_hash_opts), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index d8e511a0..c3b4d116 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -518,57 +518,38 @@ static void replay_now_at(struct journal *j, u64 seq) } static int __bch2_journal_replay_key(struct btree_trans *trans, - enum btree_id id, unsigned level, - struct bkey_i *k) + struct journal_key *k) { struct btree_iter iter; + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; int ret; - bch2_trans_node_iter_init(trans, &iter, id, k->k.p, - BTREE_MAX_DEPTH, level, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); + if (!k->level && k->btree_id == BTREE_ID_alloc) + iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - unsigned commit_flags = BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW; + unsigned commit_flags = + 
BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED; if (!k->allocated) commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); -} - -static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_alloc_replay_key(&trans, k)); + __bch2_journal_replay_key(&trans, k)); } static int journal_sort_seq_cmp(const void *_l, const void *_r) @@ -606,7 +587,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (!i->level && i->btree_id == BTREE_ID_alloc) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_alloc_replay_key(c, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -1050,6 +1031,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + else + bch_info(c, "recovering from unclean shutdown"); if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -1068,7 +1051,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ret = -EINVAL; goto err; - } if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { @@ -1498,7 +1480,7 @@ int 
bch2_fs_initialize(struct bch_fs *c) } err = "error writing first journal entry"; - ret = bch2_journal_meta(&c->journal); + ret = bch2_journal_flush(&c->journal); if (ret) goto err; diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index d003f408..8dcac781 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -44,7 +44,10 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); + pr_buf(out, "idx %llu front_pad %u back_pad %u", + le64_to_cpu(p.v->idx), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); } bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) @@ -347,7 +350,8 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, 0); + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); } bch2_trans_iter_exit(&trans, &inode_iter); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 789dde7c..57d63674 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -20,13 +20,13 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { switch (opt) { case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_CRC32C; + return BCH_STR_HASH_crc32c; case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_CRC64; + return BCH_STR_HASH_crc64; case BCH_STR_HASH_OPT_siphash: return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? BCH_STR_HASH_SIPHASH - : BCH_STR_HASH_SIPHASH_OLD; + ? 
BCH_STR_HASH_siphash + : BCH_STR_HASH_siphash_old; default: BUG(); } @@ -51,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) .siphash_key = { .k0 = bi->bi_hash_seed } }; - if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; @@ -77,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Init(&ctx->siphash, &info->siphash_key); break; default: @@ -99,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, const void *data, size_t len) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(ctx->crc32c, data, len); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(ctx->crc64, data, len); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Update(&ctx->siphash, data, len); break; default: @@ -118,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: return ctx->crc32c; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: return ctx->crc64 >> 1; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: return 
SipHash24_End(&ctx->siphash) >> 1; default: BUG(); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 33d832bc..80297633 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -441,8 +441,16 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) set_bit(BCH_FS_ERROR, &c->flags); + else + clear_bit(BCH_FS_ERROR, &c->flags); + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + else + clear_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) + set_bit(BCH_FS_INITIALIZED, &c->flags); ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret)