From 76a549d82d1383c02e4aa6f7d9eda2df9f2196b3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jul 2018 03:58:54 -0400 Subject: [PATCH] Update bcachefs sources to eab3b355cf bcachefs: trace transaction restarts --- .bcachefs_revision | 2 +- cmd_migrate.c | 5 +- libbcachefs/acl.c | 149 +++-- libbcachefs/acl.h | 27 +- libbcachefs/bcachefs.h | 7 +- libbcachefs/bcachefs_format.h | 7 +- libbcachefs/bkey.h | 8 +- libbcachefs/bset.c | 2 +- libbcachefs/btree_cache.c | 1 + libbcachefs/btree_io.c | 5 +- libbcachefs/btree_iter.c | 283 ++++++++- libbcachefs/btree_iter.h | 64 +++ libbcachefs/btree_types.h | 34 ++ libbcachefs/btree_update.h | 38 +- libbcachefs/btree_update_leaf.c | 99 +++- libbcachefs/dirent.c | 284 +++++----- libbcachefs/dirent.h | 12 +- libbcachefs/error.c | 5 +- libbcachefs/error.h | 6 + libbcachefs/fs-io.c | 93 +-- libbcachefs/fs-ioctl.c | 25 +- libbcachefs/fs.c | 976 +++++++++++++++++++------------- libbcachefs/fs.h | 10 +- libbcachefs/fsck.c | 526 +++++++++++------ libbcachefs/fsck.h | 2 +- libbcachefs/inode.c | 111 ++-- libbcachefs/inode.h | 5 + libbcachefs/recovery.c | 65 ++- libbcachefs/str_hash.h | 323 ++++------- libbcachefs/util.c | 2 +- libbcachefs/xattr.c | 94 +-- libbcachefs/xattr.h | 9 +- 32 files changed, 2105 insertions(+), 1174 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index f1807172..dddb0443 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -940d6ca657ea70758f3f43323bfd531019a40d3c +eab3b355cf6fcabbf07d7a9032c68e95cab37ad0 diff --git a/cmd_migrate.c b/cmd_migrate.c index 61866534..44283c3c 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -239,8 +239,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, const struct xattr_handler *h = xattr_resolve_name(&attr); - int ret = bch2_xattr_set(c, dst->bi_inum, &hash_info, attr, - val, val_size, 0, h->flags, NULL); + int ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr, + val, val_size, h->flags, 0)); if (ret < 0) die("error creating xattr: %s", strerror(-ret)); } diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index a8735bc0..534ea94e 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -132,7 +132,8 @@ invalid: * Convert from in-memory to filesystem representation. */ static struct bkey_i_xattr * -bch2_acl_to_xattr(const struct posix_acl *acl, +bch2_acl_to_xattr(struct btree_trans *trans, + const struct posix_acl *acl, int type) { struct bkey_i_xattr *xattr; @@ -164,7 +165,7 @@ bch2_acl_to_xattr(const struct posix_acl *acl, if (u64s > U8_MAX) return ERR_PTR(-E2BIG); - xattr = kmalloc(u64s * sizeof(u64), GFP_KERNEL); + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); if (IS_ERR(xattr)) return xattr; @@ -214,20 +215,29 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c_xattr xattr; - struct bkey_s_c k; struct posix_acl *acl = NULL; - int name_index = acl_to_xattr_type(type); - k = bch2_xattr_get_iter(c, &iter, inode, "", name_index); - if (IS_ERR(k.k)) { - if (PTR_ERR(k.k) != -ENOENT) - acl = ERR_CAST(k.k); + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); + if (IS_ERR(iter)) { + if (PTR_ERR(iter) == -EINTR) + goto retry; + + if (PTR_ERR(iter) != -ENOENT) + acl = ERR_CAST(iter); goto out; } - xattr = bkey_s_c_to_xattr(k); + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); @@ -235,49 +245,59 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type) if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return acl; } -int __bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) +int bch2_set_acl_trans(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + struct posix_acl *acl, int type) { - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret; if (type == ACL_TYPE_DEFAULT && - !S_ISDIR(inode->v.i_mode)) + !S_ISDIR(inode_u->bi_mode)) return acl ? -EACCES : 0; if (acl) { struct bkey_i_xattr *xattr = - bch2_acl_to_xattr(acl, type); + bch2_acl_to_xattr(trans, acl, type); if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = bch2_hash_set(bch2_xattr_hash_desc, &inode->ei_str_hash, - c, inode->v.i_ino, &inode->ei_journal_seq, - &xattr->k_i, 0); - kfree(xattr); + ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inode_u->bi_inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - ret = bch2_hash_delete(bch2_xattr_hash_desc, &inode->ei_str_hash, - c, inode->v.i_ino, &inode->ei_journal_seq, - &search); + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, + inode_u->bi_inum, &search); } - if (!ret) - set_cached_acl(&inode->v, type, acl); + return ret == -ENOENT ? 0 : ret; +} - return ret; +static int inode_update_for_set_acl_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct timespec now = current_time(&inode->v); + umode_t mode = (unsigned long) p; + + bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mode = mode; + return 0; } int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct bch_inode_unpacked inode_u; umode_t mode = inode->v.i_mode; int ret; @@ -287,20 +307,77 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) return ret; } - ret = __bch2_set_acl(vinode, acl, type); - if (ret) - return ret; + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); - if (mode != inode->v.i_mode) { - mutex_lock(&inode->ei_update_lock); - inode->v.i_mode = mode; - inode->v.i_ctime = current_time(&inode->v); + ret = bch2_set_acl_trans(&trans, + &inode->ei_inode, + &inode->ei_str_hash, + acl, type) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_set_acl_fn, + (void *)(unsigned long) mode) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err; - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); - } + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_CTIME|ATTR_MODE); + + set_cached_acl(&inode->v, type, acl); +err: + bch2_trans_exit(&trans); return ret; } +int bch2_acl_chmod(struct btree_trans *trans, + struct bch_inode_info *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + struct btree_iter *iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl; + int ret = 0; + + iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0; + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) + goto err; + + new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto err; + } + + bch2_trans_update(trans, iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; +err: + kfree(acl); + return ret; +} + #endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index 0be31ee9..e0672430 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -1,6 +1,11 @@ #ifndef _BCACHEFS_ACL_H #define _BCACHEFS_ACL_H +struct bch_inode_unpacked; +struct bch_hash_info; +struct bch_inode_info; +struct posix_acl; + #ifdef CONFIG_BCACHEFS_POSIX_ACL #define BCH_ACL_VERSION 0x0001 @@ -20,20 +25,30 @@ typedef struct { __le32 a_version; } bch_acl_header; -struct posix_acl; +struct posix_acl *bch2_get_acl(struct inode *, int); -extern struct posix_acl *bch2_get_acl(struct inode *, int); -extern int __bch2_set_acl(struct inode *, struct posix_acl *, int); -extern int bch2_set_acl(struct inode *, struct posix_acl *, int); +int bch2_set_acl_trans(struct btree_trans *, + struct bch_inode_unpacked *, + const struct bch_hash_info *, + struct posix_acl *, int); +int bch2_set_acl(struct inode *, struct posix_acl *, int); +int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, + umode_t, struct posix_acl **); #else -static inline int __bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static inline int bch2_set_acl_trans(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + struct posix_acl *acl, int type) { return 0; } -static inline int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static inline int bch2_acl_chmod(struct btree_trans *trans, + struct bch_inode_info *inode, + umode_t mode, + struct posix_acl **new_acl) { return 0; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 1482b80a..bd5ea6fc 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -262,7 +262,11 @@ do { \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(inject_invalid_keys, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -465,6 +469,7 @@ enum { /* misc: */ BCH_FS_BDEV_MOUNTED, BCH_FS_FSCK_FIXED_ERRORS, + BCH_FS_FSCK_UNFIXED_ERRORS, BCH_FS_FIXED_GENS, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index b6e7b983..e300738d 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -722,9 +722,7 @@ enum { __BCH_INODE_I_SIZE_DIRTY= 5, __BCH_INODE_I_SECTORS_DIRTY= 6, - - /* not implemented yet: */ - __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */ + __BCH_INODE_UNLINKED = 7, /* bits 20+ reserved for packed fields below: */ }; @@ -736,7 +734,7 @@ enum { #define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS) +#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); @@ -1222,6 +1220,7 @@ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, BCH_FEATURE_GZIP = 1, BCH_FEATURE_ZSTD = 2, + BCH_FEATURE_ATOMIC_NLINK = 3, }; /* options: */ diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 2f62bd8e..bd1d21b0 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -206,14 +206,12 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); static __always_inline int bversion_cmp(struct bversion l, struct bversion r) { - if (l.hi != r.hi) - return l.hi < r.hi ? -1 : 1; - if (l.lo != r.lo) - return l.lo < r.lo ? -1 : 1; - return 0; + return (l.hi > r.hi) - (l.hi < r.hi) ?: + (l.lo > r.lo) - (l.lo < r.lo); } #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) +#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) static __always_inline int bversion_zero(struct bversion v) { diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 5c777872..8c77fc50 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -1449,7 +1449,7 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) m = bkey_next(m); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index f15a415e..db3712a8 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -730,6 +730,7 @@ retry: if (bch2_btree_node_relock(iter, level + 1)) goto retry; + trans_restart(); return ERR_PTR(-EINTR); } } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 847dfd68..94f56dbb 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1298,7 +1298,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct bkey_s_c u = bkey_disassemble(b, k, &tmp); const char *invalid = bch2_bkey_val_invalid(c, type, u); - if (invalid) { + if (invalid || + (inject_invalid_keys(c) && + !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); @@ -1310,6 +1312,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); continue; } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 097b68e0..a52ec12e 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -262,6 +262,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (ret) __btree_node_lock_type(c, b, type); + else + trans_restart(); + return ret; } @@ -1555,6 +1558,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter) for_each_linked_btree_iter(iter, linked) if (linked->next == iter) { linked->next = iter->next; + iter->next = iter; return; } @@ -1571,8 +1575,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { unsigned nr_iters = 0; - for_each_btree_iter(iter, new) - nr_iters++; + for_each_btree_iter(new, iter) + if (iter->btree_id == new->btree_id) + nr_iters++; BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE); } @@ -1580,8 +1585,278 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) { + unsigned i; + __bch2_btree_iter_unlock(dst); memcpy(dst, src, offsetof(struct btree_iter, next)); - dst->nodes_locked = dst->nodes_intent_locked = 0; - dst->uptodate = BTREE_ITER_NEED_RELOCK; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->lock, + __btree_lock_want(dst, i)); +} + +/* new transactional stuff: */ + +static void btree_trans_verify(struct btree_trans *trans) +{ + unsigned i; + + for (i = 0; i < trans->nr_iters; i++) { + struct btree_iter *iter = &trans->iters[i]; + + BUG_ON(btree_iter_linked(iter) != + ((trans->iters_linked & (1 << i)) && + !is_power_of_2(trans->iters_linked))); + } +} + +void bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) +{ + unsigned idx; + + for (idx = 0; idx < trans->nr_iters; idx++) + if (&trans->iters[idx] == iter) + goto found; + BUG(); +found: + BUG_ON(!(trans->iters_linked & (1U << idx))); + + trans->iters_live &= ~(1U << idx); + trans->iters_linked &= ~(1U << idx); + bch2_btree_iter_unlink(iter); +} + +static int btree_trans_realloc_iters(struct btree_trans *trans) +{ + struct btree_iter *new_iters; + unsigned i; + + bch2_trans_unlock(trans); + + new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX, + GFP_NOFS); + if (!new_iters) + return -ENOMEM; + + memcpy(new_iters, trans->iters, + sizeof(struct btree_iter) * trans->nr_iters); + trans->iters = new_iters; + + for (i = 0; i < trans->nr_iters; i++) + trans->iters[i].next = &trans->iters[i]; + + if (trans->iters_linked) { + unsigned first_linked = __ffs(trans->iters_linked); + + for (i = first_linked + 1; i < trans->nr_iters; i++) + if (trans->iters_linked & (1 << i)) + bch2_btree_iter_link(&trans->iters[first_linked], + &trans->iters[i]); + } + + btree_trans_verify(trans); + + if (trans->iters_live) { + trans_restart(); + return -EINTR; + } + + return 0; +} + +int bch2_trans_preload_iters(struct btree_trans *trans) +{ + if (trans->iters != trans->iters_onstack) + return 0; + + return btree_trans_realloc_iters(trans); +} + +static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + unsigned btree_id, + unsigned flags, u64 iter_id) +{ + struct btree_iter *iter; + int idx; + + BUG_ON(trans->nr_iters > BTREE_ITER_MAX); + + for (idx = 0; idx < trans->nr_iters; idx++) + if (trans->iter_ids[idx] == iter_id) + goto found; + idx = -1; +found: + if (idx < 0) { + idx = ffz(trans->iters_linked); + if (idx < trans->nr_iters) + goto got_slot; + + BUG_ON(trans->nr_iters == BTREE_ITER_MAX); + + if (trans->iters == trans->iters_onstack && + trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) { + int ret = btree_trans_realloc_iters(trans); + if (ret) + return ERR_PTR(ret); + } + + idx = trans->nr_iters++; +got_slot: + trans->iter_ids[idx] = iter_id; + iter = &trans->iters[idx]; + + bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags); + } else { + iter = &trans->iters[idx]; + + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & + (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS)); + + iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + } + + BUG_ON(trans->iters_live & (1 << idx)); + trans->iters_live |= 1 << idx; + + if (trans->iters_linked && + !(trans->iters_linked & (1 << idx))) + bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)], + iter); + + trans->iters_linked |= 1 << idx; + + btree_trans_verify(trans); + + return iter; +} + +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags, + u64 iter_id) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, flags, iter_id); + + if (!IS_ERR(iter)) + bch2_btree_iter_set_pos(iter, pos); + return iter; +} + +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *src, + u64 iter_id) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, src->btree_id, + src->flags, iter_id); + + if (!IS_ERR(iter)) + bch2_btree_iter_copy(iter, src); + return iter; +} + +void *bch2_trans_kmalloc(struct btree_trans *trans, + size_t size) +{ + void *ret; + + if (trans->mem_top + size > trans->mem_bytes) { + size_t old_bytes = trans->mem_bytes; + size_t new_bytes = roundup_pow_of_two(trans->mem_top + size); + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + + if (!new_mem) + return ERR_PTR(-ENOMEM); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + if (old_bytes) { + trans_restart(); + return ERR_PTR(-EINTR); + } + } + + ret = trans->mem + trans->mem_top; + trans->mem_top += size; + return ret; +} + +int bch2_trans_unlock(struct btree_trans *trans) +{ + unsigned iters = trans->iters_linked; + int ret = 0; + + while (iters) { + unsigned idx = __ffs(iters); + struct btree_iter *iter = &trans->iters[idx]; + + if (iter->flags & BTREE_ITER_ERROR) + ret = -EIO; + + __bch2_btree_iter_unlock(iter); + iters ^= 1 << idx; + } + + return ret; +} + +void __bch2_trans_begin(struct btree_trans *trans) +{ + unsigned idx; + + btree_trans_verify(trans); + + /* + * On transaction restart, the transaction isn't required to allocate + * all the same iterators it on the last iteration: + * + * Unlink any iterators it didn't use this iteration, assuming it got + * further (allocated an iter with a higher idx) than where the iter + * was originally allocated: + */ + while (trans->iters_linked && + trans->iters_live && + (idx = __fls(trans->iters_linked)) > + __fls(trans->iters_live)) { + trans->iters_linked ^= 1 << idx; + bch2_btree_iter_unlink(&trans->iters[idx]); + } + + trans->iters_live = 0; + trans->nr_updates = 0; + trans->mem_top = 0; + + btree_trans_verify(trans); +} + +void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) +{ + trans->c = c; + trans->nr_restarts = 0; + trans->nr_iters = 0; + trans->iters_live = 0; + trans->iters_linked = 0; + trans->nr_updates = 0; + trans->mem_top = 0; + trans->mem_bytes = 0; + trans->mem = NULL; + trans->iters = trans->iters_onstack; +} + +int bch2_trans_exit(struct btree_trans *trans) +{ + int ret = bch2_trans_unlock(trans); + + kfree(trans->mem); + if (trans->iters != trans->iters_onstack) + kfree(trans->iters); + trans->mem = (void *) 0x1; + trans->iters = (void *) 0x1; + return ret; } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 5db1cc58..d046ad71 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -269,4 +269,68 @@ static inline int btree_iter_err(struct bkey_s_c k) return PTR_ERR_OR_ZERO(k.k); } +/* new multiple iterator interface: */ + +int bch2_trans_preload_iters(struct btree_trans *); +void bch2_trans_iter_free(struct btree_trans *, + struct btree_iter *); + +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned, u64); +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, + struct btree_iter *, u64); + +static __always_inline u64 __btree_iter_id(void) +{ + u64 ret = 0; + + ret <<= 32; + ret |= _RET_IP_ & U32_MAX; + ret <<= 32; + ret |= _THIS_IP_ & U32_MAX; + return ret; +} + +static __always_inline struct btree_iter * +bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned flags) +{ + return __bch2_trans_get_iter(trans, btree_id, pos, flags, + __btree_iter_id()); +} + +static __always_inline struct btree_iter * +bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) +{ + + return __bch2_trans_copy_iter(trans, src, __btree_iter_id()); +} + +void __bch2_trans_begin(struct btree_trans *); + +void *bch2_trans_kmalloc(struct btree_trans *, size_t); +int bch2_trans_unlock(struct btree_trans *); +void bch2_trans_init(struct btree_trans *, struct bch_fs *); +int bch2_trans_exit(struct btree_trans *); + +#ifdef TRACE_TRANSACTION_RESTARTS +#define bch2_trans_begin(_trans) \ +do { \ + if (is_power_of_2((_trans)->nr_restarts) && \ + (_trans)->nr_restarts >= 8) \ + pr_info("nr restarts: %zu", (_trans)->nr_restarts); \ + \ + (_trans)->nr_restarts++; \ + __bch2_trans_begin(_trans); \ +} while (0) +#else +#define bch2_trans_begin(_trans) __bch2_trans_begin(_trans) +#endif + +#ifdef TRACE_TRANSACTION_RESTARTS_ALL +#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__) +#else +#define trans_restart(...) no_printk("transaction restart" __VA_ARGS__) +#endif + #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index daa648c6..39e2db75 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -253,6 +253,40 @@ struct btree_iter { struct btree_iter *next; }; +#define BTREE_ITER_MAX 8 + +struct btree_insert_entry { + struct btree_iter *iter; + struct bkey_i *k; + unsigned extra_res; + /* + * true if entire key was inserted - can only be false for + * extents + */ + bool done; +}; + +struct btree_trans { + struct bch_fs *c; + size_t nr_restarts; + + u8 nr_iters; + u8 iters_live; + u8 iters_linked; + u8 nr_updates; + + unsigned mem_top; + unsigned mem_bytes; + void *mem; + + struct btree_iter *iters; + u64 iter_ids[BTREE_ITER_MAX]; + + struct btree_insert_entry updates[BTREE_ITER_MAX]; + + struct btree_iter iters_onstack[2]; +}; + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index aac97958..5e47d4cd 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -27,16 +27,7 @@ struct btree_insert { bool did_work; unsigned short nr; - struct btree_insert_entry { - struct btree_iter *iter; - struct bkey_i *k; - unsigned extra_res; - /* - * true if entire key was inserted - can only be false for - * extents - */ - bool done; - } *entries; + struct btree_insert_entry *entries; }; int __bch2_btree_insert_at(struct btree_insert *); @@ -149,4 +140,31 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_extent *); +/* new transactional interface: */ + +void bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, unsigned); +int bch2_trans_commit(struct btree_trans *, + struct disk_reservation *, + struct extent_insert_hook *, + u64 *, unsigned); + +#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c)); \ + \ + do { \ + bch2_trans_begin(&trans); \ + \ + _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \ + (_journal_seq), (_flags)); \ + } while (_ret == -EINTR); \ + \ + bch2_trans_exit(&trans); \ + _ret; \ +}) + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 588a1997..a481b0d6 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -309,8 +309,10 @@ static inline int do_btree_insert_at(struct btree_insert *trans, unsigned u64s; int ret; - trans_for_each_entry(trans, i) + trans_for_each_entry(trans, i) { BUG_ON(i->done); + BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + } u64s = 0; trans_for_each_entry(trans, i) @@ -330,6 +332,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, if (race_fault()) { ret = -EINTR; + trans_restart(" (race)"); goto out; } @@ -354,10 +357,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans, } } - if (journal_seq_verify(c) && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - trans_for_each_entry(trans, i) - i->k->k.version.lo = trans->journal_res.seq; + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (journal_seq_verify(c)) + trans_for_each_entry(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (inject_invalid_keys(c)) + trans_for_each_entry(trans, i) + i->k->k.version = MAX_VERSION; + } trans_for_each_entry(trans, i) { switch (btree_insert_key_leaf(trans, i)) { @@ -398,6 +405,17 @@ out: return ret; } +static inline void btree_insert_entry_checks(struct bch_fs *c, + struct btree_insert_entry *i) +{ + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + BUG_ON(debug_check_bkeys(c) && + !bkey_deleted(&i->k->k) && + bch2_bkey_invalid(c, i->iter->btree_id, + bkey_i_to_s_c(i->k))); +} + /** * __bch_btree_insert_at - insert keys at given iterator positions * @@ -418,20 +436,16 @@ int __bch2_btree_insert_at(struct btree_insert *trans) unsigned flags; int ret; + BUG_ON(!trans->nr); + for_each_btree_iter(trans->entries[0].iter, linked) bch2_btree_iter_verify_locks(linked); /* for the sake of sanity: */ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - trans_for_each_entry(trans, i) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, i->iter->btree_id, - bkey_i_to_s_c(i->k))); - } + trans_for_each_entry(trans, i) + btree_insert_entry_checks(c, i); bubble_sort(trans->entries, trans->nr, btree_trans_cmp); @@ -442,7 +456,12 @@ retry: cycle_gc_lock = false; trans_for_each_entry(trans, i) { + unsigned old_locks_want = i->iter->locks_want; + unsigned old_uptodate = i->iter->uptodate; + if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { + trans_restart(" (failed upgrade, locks_want %u uptodate %u)", + old_locks_want, old_uptodate); ret = -EINTR; goto err; } @@ -515,8 +534,10 @@ err: * don't care if we got ENOSPC because we told split it * couldn't block: */ - if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) + if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) { + trans_restart(" (split)"); ret = -EINTR; + } } if (cycle_gc_lock) { @@ -531,13 +552,16 @@ err: } if (ret == -EINTR) { - if (flags & BTREE_INSERT_NOUNLOCK) + if (flags & BTREE_INSERT_NOUNLOCK) { + trans_restart(" (can't unlock)"); goto out; + } trans_for_each_entry(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { ret = ret2; + trans_restart(" (traverse)"); goto out; } @@ -550,11 +574,56 @@ err: */ if (!(flags & BTREE_INSERT_ATOMIC)) goto retry; + + trans_restart(" (atomic)"); } goto out; } +void bch2_trans_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k, + unsigned extra_journal_res) +{ + struct btree_insert_entry *i; + + BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); + + i = &trans->updates[trans->nr_updates++]; + + *i = (struct btree_insert_entry) { + .iter = iter, + .k = k, + .extra_res = extra_journal_res, + }; + + btree_insert_entry_checks(trans->c, i); +} + +int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq, + unsigned flags) +{ + struct btree_insert insert = { + .c = trans->c, + .disk_res = disk_res, + .journal_seq = journal_seq, + .flags = flags, + .nr = trans->nr_updates, + .entries = trans->updates, + }; + + if (!trans->nr_updates) + return 0; + + trans->nr_updates = 0; + + return __bch2_btree_insert_at(&insert); +} + int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) { struct bkey_i k; diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index d3dd3eb7..d979ae0e 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -141,8 +141,8 @@ void bch2_dirent_to_text(struct bch_fs *c, char *buf, } } -static struct bkey_i_dirent *dirent_create_key(u8 type, - const struct qstr *name, u64 dst) +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + u8 type, const struct qstr *name, u64 dst) { struct bkey_i_dirent *dirent; unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); @@ -152,9 +152,9 @@ static struct bkey_i_dirent *dirent_create_key(u8 type, BUG_ON(u64s > U8_MAX); - dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (!dirent) - return ERR_PTR(-ENOMEM); + dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; bkey_dirent_init(&dirent->k_i); dirent->k.u64s = u64s; @@ -172,23 +172,31 @@ static struct bkey_i_dirent *dirent_create_key(u8 type, return dirent; } +int __bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + int flags) +{ + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); +} + int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *journal_seq, int flags) { - struct bkey_i_dirent *dirent; - int ret; - - dirent = dirent_create_key(type, name, dst_inum); - if (IS_ERR(dirent)) - return PTR_ERR(dirent); - - ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum, - journal_seq, &dirent->k_i, flags); - kfree(dirent); - - return ret; + return bch2_trans_do(c, journal_seq, flags, + __bch2_dirent_create(&trans, dir_inum, hash_info, + type, name, dst_inum, flags)); } static void dirent_copy_target(struct bkey_i_dirent *dst, @@ -204,151 +212,117 @@ static struct bpos bch2_dirent_pos(struct bch_inode_info *inode, return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name)); } -int bch2_dirent_rename(struct bch_fs *c, +int bch2_dirent_rename(struct btree_trans *trans, struct bch_inode_info *src_dir, const struct qstr *src_name, struct bch_inode_info *dst_dir, const struct qstr *dst_name, - u64 *journal_seq, enum bch_rename_mode mode) + enum bch_rename_mode mode) { - struct btree_iter src_iter, dst_iter, whiteout_iter; + struct btree_iter *src_iter, *dst_iter; struct bkey_s_c old_src, old_dst; - struct bkey delete; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos src_pos = bch2_dirent_pos(src_dir, src_name); struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); - bool need_whiteout; int ret; - bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_link(&src_iter, &dst_iter); - - bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, - BTREE_ITER_SLOTS); - bch2_btree_iter_link(&src_iter, &whiteout_iter); - - if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(0, src_name, 0); - if (IS_ERR(new_src)) { - ret = PTR_ERR(new_src); - goto err; - } - } else { - new_src = (void *) &delete; - } - - new_dst = dirent_create_key(0, dst_name, 0); - if (IS_ERR(new_dst)) { - ret = PTR_ERR(new_dst); - goto err; - } -retry: - /* - * Note that on -EINTR/dropped locks we're not restarting the lookup - * from the original hashed position (like we do when creating dirents, - * in bch_hash_set) - we never move existing dirents to different slot: - */ - old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - &src_iter, src_name); - if ((ret = btree_iter_err(old_src))) - goto err; - - ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - &whiteout_iter, &src_iter); - if (ret < 0) - goto err; - need_whiteout = ret; - /* + * Lookup dst: + * * Note that in BCH_RENAME mode, we're _not_ checking if * the target already exists - we're relying on the VFS * to do that check for us for correctness: */ - old_dst = mode == BCH_RENAME - ? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter) - : bch2_hash_lookup_at(bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - &dst_iter, dst_name); - if ((ret = btree_iter_err(old_dst))) - goto err; + dst_iter = mode == BCH_RENAME + ? bch2_hash_hole(trans, bch2_dirent_hash_desc, + &dst_dir->ei_str_hash, + dst_dir->v.i_ino, dst_name) + : bch2_hash_lookup(trans, bch2_dirent_hash_desc, + &dst_dir->ei_str_hash, + dst_dir->v.i_ino, dst_name, + BTREE_ITER_INTENT); + if (IS_ERR(dst_iter)) + return PTR_ERR(dst_iter); + old_dst = bch2_btree_iter_peek_slot(dst_iter); - switch (mode) { - case BCH_RENAME: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + /* Lookup src: */ + src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, + &src_dir->ei_str_hash, + src_dir->v.i_ino, src_name, + BTREE_ITER_INTENT); + if (IS_ERR(src_iter)) + return PTR_ERR(src_iter); + old_src = bch2_btree_iter_peek_slot(src_iter); - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * If we couldn't insert new_dst at its hashed - * position (dst_pos) due to a hash collision, - * and we're going to be deleting in - * between the hashed position and first empty - * slot we found - just overwrite the pos we - * were going to delete: - * - * Note: this is a correctness issue, in this - * situation bch2_hash_needs_whiteout() could - * return false when the whiteout would have - * been needed if we inserted at the pos - * __dirent_find_hole() found - */ - new_dst->k.p = src_iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, - journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, - &new_dst->k_i)); - goto err; - } + /* Create new dst key: */ + new_dst = dirent_create_key(trans, 0, dst_name, 0); + if (IS_ERR(new_dst)) + return PTR_ERR(new_dst); - if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_OVERWRITE: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + new_dst->k.p = dst_iter->pos; + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(trans, 0, src_name, 0); + if (IS_ERR(new_src)) + return PTR_ERR(new_src); - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * Same case described above - - * bch_hash_needs_whiteout could spuriously - * return false, but we have to insert at - * dst_iter.pos because we're overwriting - * another dirent: - */ - new_src->k.type = BCH_DIRENT_WHITEOUT; - } else if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_EXCHANGE: dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - break; + new_src->k.p = src_iter->pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (IS_ERR(new_src)) + return PTR_ERR(new_src); + bkey_init(&new_src->k); + new_src->k.p = src_iter->pos; + + if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && + bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { + /* + * We have a hash collision for the new dst key, + * and new_src - the key we're deleting - is between + * new_dst's hashed slot and the slot we're going to be + * inserting it into - oops. This will break the hash + * table if we don't deal with it: + */ + if (mode == BCH_RENAME) { + /* + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ + new_dst->k.p = src_iter->pos; + bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); + return 0; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to + * overwrite old_dst - just make sure to use a + * whiteout when deleting src: + */ + new_src->k.type = BCH_DIRENT_WHITEOUT; + } + } else { + /* Check if we need a whiteout to delete src: */ + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, + &src_dir->ei_str_hash, + src_iter); + if (ret < 0) + return ret; + + if (ret) + new_src->k.type = BCH_DIRENT_WHITEOUT; + } } - new_src->k.p = src_iter.pos; - new_dst->k.p = dst_iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i), - BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i)); -err: - if (ret == -EINTR) - goto retry; + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + return 0; +} - bch2_btree_iter_unlock(&whiteout_iter); - bch2_btree_iter_unlock(&dst_iter); - bch2_btree_iter_unlock(&src_iter); - - if (new_src != (void *) &delete) - kfree(new_src); - kfree(new_dst); - return ret; +int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, name); } int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, @@ -356,28 +330,34 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, const struct qstr *name, u64 *journal_seq) { - return bch2_hash_delete(bch2_dirent_hash_desc, hash_info, - c, dir_inum, journal_seq, name); + return bch2_trans_do(c, journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + __bch2_dirent_delete(&trans, dir_inum, hash_info, name)); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, const struct bch_hash_info *hash_info, const struct qstr *name) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - u64 inum; + u64 inum = 0; - k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c, - dir_inum, &iter, name); - if (IS_ERR(k.k)) { - bch2_btree_iter_unlock(&iter); - return 0; + bch2_trans_init(&trans, c); + + iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, + hash_info, dir_inum, name, 0); + if (IS_ERR(iter)) { + BUG_ON(PTR_ERR(iter) == -EINTR); + goto out; } + k = bch2_btree_iter_peek_slot(iter); inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - bch2_btree_iter_unlock(&iter); - +out: + bch2_trans_exit(&trans); return inum; } diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 5d066af1..4d92ffba 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -21,8 +21,16 @@ struct bch_hash_info; struct bch_inode_info; unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); + +int __bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, + const struct qstr *, u64, int); int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); + +int __bch2_dirent_delete(struct btree_trans *, u64, + const struct bch_hash_info *, + const struct qstr *); int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *, u64 *); @@ -32,10 +40,10 @@ enum bch_rename_mode { BCH_RENAME_EXCHANGE, }; -int bch2_dirent_rename(struct bch_fs *, +int bch2_dirent_rename(struct btree_trans *, struct bch_inode_info *, const struct qstr *, struct bch_inode_info *, const struct qstr *, - u64 *, enum bch_rename_mode); + enum bch_rename_mode); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 2a357fc3..9505b6e6 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -131,8 +131,9 @@ print: mutex_unlock(&c->fsck_error_lock); - if (fix) - set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags); + set_bit(fix + ? BCH_FS_FSCK_FIXED_ERRORS + : BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags); return fix ? FSCK_ERR_FIX : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE diff --git a/libbcachefs/error.h b/libbcachefs/error.h index f65ef132..588e763f 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -147,12 +147,18 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define need_fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +#define need_fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) + #define mustfix_fsck_err(c, ...) \ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) #define mustfix_fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) +#define fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) + #define fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 9e78798a..e4d2b39e 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -193,7 +193,7 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c, struct bch_inode_info *inode, loff_t new_size) { - return __bch2_write_inode(c, inode, inode_set_size, &new_size); + return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, @@ -259,7 +259,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) mutex_lock(&h->inode->ei_update_lock); i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h); + ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); if (!ret && h->new_i_size != U64_MAX) i_size_write(&h->inode->v, h->new_i_size); @@ -289,7 +289,7 @@ static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) int ret; mutex_lock(&h->inode->ei_update_lock); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h); + ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); mutex_unlock(&h->inode->ei_update_lock); return ret; @@ -390,7 +390,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); struct keylist *keys = &op->op.insert_keys; - struct btree_iter extent_iter, inode_iter; + struct btree_trans trans; + struct btree_iter *extent_iter, *inode_iter = NULL; struct bchfs_extent_trans_hook hook; struct bkey_i *k = bch2_keylist_front(keys); s64 orig_sectors_added = op->sectors_added; @@ -398,12 +399,13 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != op->inode->v.i_ino); - bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); - bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES, - POS(extent_iter.pos.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, wop->c); + + extent_iter = bch2_trans_get_iter(&trans, + BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + BUG_ON(IS_ERR(extent_iter)); hook.op = op; hook.hook.fn = bchfs_extent_update_hook; @@ -416,19 +418,29 @@ static int bchfs_write_index_update(struct bch_write_op *wop) op->inode->ei_inode.bi_size) hook.need_inode_update = true; + /* optimization for fewer transaction restarts: */ + ret = bch2_btree_iter_traverse(extent_iter); + if (ret) + goto err; + if (hook.need_inode_update) { struct bkey_s_c inode; - if (!btree_iter_linked(&inode_iter)) - bch2_btree_iter_link(&extent_iter, &inode_iter); + if (!inode_iter) { + inode_iter = bch2_trans_get_iter(&trans, + BTREE_ID_INODES, + POS(extent_iter->pos.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR(inode_iter)); + } - inode = bch2_btree_iter_peek_slot(&inode_iter); + inode = bch2_btree_iter_peek_slot(inode_iter); if ((ret = btree_iter_err(inode))) goto err; if (WARN_ONCE(inode.k->type != BCH_INODE_FS, "inode %llu not found when updating", - extent_iter.pos.inode)) { + extent_iter->pos.inode)) { ret = -ENOENT; break; } @@ -436,7 +448,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) if (WARN_ONCE(bkey_bytes(inode.k) > sizeof(hook.inode_p), "inode %llu too big (%zu bytes, buf %zu)", - extent_iter.pos.inode, + extent_iter->pos.inode, bkey_bytes(inode.k), sizeof(hook.inode_p))) { ret = -ENOENT; @@ -448,7 +460,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) &hook.inode_u); if (WARN_ONCE(ret, "error %i unpacking inode %llu", - ret, extent_iter.pos.inode)) { + ret, extent_iter->pos.inode)) { ret = -ENOENT; break; } @@ -458,8 +470,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&extent_iter, k), - BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, + BTREE_INSERT_ENTRY(extent_iter, k), + BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter, &hook.inode_p.inode.k_i, 2)); } else { ret = bch2_btree_insert_at(wop->c, &wop->res, @@ -467,10 +479,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL| BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&extent_iter, k)); + BTREE_INSERT_ENTRY(extent_iter, k)); } - BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); + BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k))); if (WARN_ONCE(!ret != !k->k.size, "ret %i k->size %u", ret, k->k.size)) @@ -481,12 +493,11 @@ err: if (ret) break; - BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0); + BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0); bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); - bch2_btree_iter_unlock(&extent_iter); - bch2_btree_iter_unlock(&inode_iter); + bch2_trans_exit(&trans); if (op->is_dio) { struct dio_write *dio = container_of(op, struct dio_write, iop); @@ -2338,8 +2349,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct btree_iter src; - struct btree_iter dst; + struct btree_trans trans; + struct btree_iter *src, *dst; BKEY_PADDED(k) copy; struct bkey_s_c k; struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); @@ -2349,13 +2360,17 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS, + bch2_trans_init(&trans, c); + + dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR(dst)); + /* position will be set from dst iter's position: */ - bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, + src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); - bch2_btree_iter_link(&src, &dst); + BUG_ON(IS_ERR(src)); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2384,24 +2399,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; - while (bkey_cmp(dst.pos, + while (bkey_cmp(dst->pos, POS(inode->v.i_ino, round_up(new_size, PAGE_SIZE) >> 9)) < 0) { struct disk_reservation disk_res; - bch2_btree_iter_set_pos(&src, - POS(dst.pos.inode, dst.pos.offset + (len >> 9))); + bch2_btree_iter_set_pos(src, + POS(dst->pos.inode, dst->pos.offset + (len >> 9))); - k = bch2_btree_iter_peek_slot(&src); + k = bch2_btree_iter_peek_slot(src); if ((ret = btree_iter_err(k))) goto btree_iter_err; bkey_reassemble(©.k, k); - bch2_cut_front(src.pos, ©.k); + bch2_cut_front(src->pos, ©.k); copy.k.k.p.offset -= len >> 9; - BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); + BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), @@ -2412,14 +2427,13 @@ static long bch2_fcollapse(struct bch_inode_info *inode, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&dst, ©.k)); + BTREE_INSERT_ENTRY(dst, ©.k)); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret == -EINTR) ret = 0; if (ret) { - bch2_btree_iter_unlock(&src); - bch2_btree_iter_unlock(&dst); + bch2_trans_exit(&trans); goto err_put_sectors_dirty; } /* @@ -2427,11 +2441,10 @@ btree_iter_err: * pointers... which isn't a _super_ serious problem... */ - bch2_btree_iter_cond_resched(&src); + bch2_btree_iter_cond_resched(src); } - bch2_btree_iter_unlock(&src); - bch2_btree_iter_unlock(&dst); + bch2_trans_exit(&trans); ret = bch2_inode_truncate(c, inode->v.i_ino, round_up(new_size, block_bytes(c)) >> 9, diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 2c1ecf77..336dbd4b 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -87,6 +87,8 @@ void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) struct flags_set { unsigned mask; unsigned flags; + + unsigned projid; }; static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -150,7 +152,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s); + ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0); if (!ret) bch2_inode_flags_to_vfs(inode); @@ -185,9 +187,9 @@ static int bch2_set_projid(struct bch_fs *c, qid.q[QTYP_PRJ] = projid; - ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); + return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved); if (ret) return ret; @@ -195,6 +197,17 @@ static int bch2_set_projid(struct bch_fs *c, return 0; } +static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct flags_set *s = p; + + bi->bi_project = s->projid; + + return bch2_inode_flags_set(inode, bi, p); +} + static int bch2_ioc_fssetxattr(struct bch_fs *c, struct file *file, struct bch_inode_info *inode, @@ -211,6 +224,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_xflags) return -EOPNOTSUPP; + s.projid = fa.fsx_projid; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -226,7 +241,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (ret) goto err_unlock; - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s); + ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0); if (!ret) bch2_inode_flags_to_vfs(inode); err_unlock: diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 3b7f78e7..c51a65da 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -34,6 +34,19 @@ static void bch2_vfs_inode_init(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *); +static void journal_seq_copy(struct bch_inode_info *dst, + u64 journal_seq) +{ + u64 old, v = READ_ONCE(dst->ei_journal_seq); + + do { + old = v; + + if (old >= journal_seq) + break; + } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); +} + /* * I_SIZE_DIRTY requires special handling: * @@ -62,127 +75,113 @@ static void bch2_vfs_inode_init(struct bch_fs *, * be set explicitly. */ -int __must_check __bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p) +void bch2_inode_update_after_write(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + unsigned fields) { - struct btree_iter iter; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - u64 inum = inode->v.i_ino; - unsigned i_nlink = READ_ONCE(inode->v.i_nlink); - int ret; + set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED + ? 0 + : bi->bi_nlink + nlink_bias(inode->v.i_mode)); + i_uid_write(&inode->v, bi->bi_uid); + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; - /* - * We can't write an inode with i_nlink == 0 because it's stored biased; - * however, we don't need to because if i_nlink is 0 the inode is - * getting deleted when it's evicted. - */ - if (!i_nlink) - return 0; + if (fields & ATTR_ATIME) + inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + if (fields & ATTR_MTIME) + inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + if (fields & ATTR_CTIME) + inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); + + inode->ei_inode = *bi; + inode->ei_qid = bch_qid(bi); +} + +int __must_check bch2_write_inode_trans(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + inode_set_fn set, + void *p) +{ + struct btree_iter *iter; + struct bkey_inode_buf *inode_p; + struct bkey_s_c k; + u64 inum = inode->v.i_ino; + int ret; lockdep_assert_held(&inode->ei_update_lock); - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); - do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(iter); + if ((ret = btree_iter_err(k))) + return ret; - if ((ret = btree_iter_err(k))) - goto out; + if (WARN_ONCE(k.k->type != BCH_INODE_FS, + "inode %llu not found when updating", inum)) + return -ENOENT; - if (WARN_ONCE(k.k->type != BCH_INODE_FS, - "inode %llu not found when updating", inum)) { - bch2_btree_iter_unlock(&iter); - return -ENOENT; - } + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", ret, inum)) + return -ENOENT; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", ret, inum)) { - ret = -ENOENT; - break; - } + BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size); - BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size); + BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size && + !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) && + inode_u->bi_size > i_size_read(&inode->v)); - if (set) { - ret = set(inode, &inode_u, p); - if (ret) - goto out; - } - - BUG_ON(i_nlink < nlink_bias(inode->v.i_mode)); - - BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size && - !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - inode_u.bi_size > i_size_read(&inode->v)); - - inode_u.bi_mode = inode->v.i_mode; - inode_u.bi_uid = i_uid_read(&inode->v); - inode_u.bi_gid = i_gid_read(&inode->v); - inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ]; - inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode); - inode_u.bi_dev = inode->v.i_rdev; - inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime); - inode_u.bi_mtime= timespec_to_bch2_time(c, inode->v.i_mtime); - inode_u.bi_ctime= timespec_to_bch2_time(c, inode->v.i_ctime); - - bch2_inode_pack(&inode_p, &inode_u); - - ret = bch2_btree_insert_at(c, NULL, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); - } while (ret == -EINTR); - - if (!ret) { - /* - * the btree node lock protects inode->ei_inode, not - * ei_update_lock; this is important for inode updates via - * bchfs_write_index_update - */ - inode->ei_inode = inode_u; - inode->ei_qid = bch_qid(&inode_u); + if (set) { + ret = set(inode, inode_u, p); + if (ret) + return ret; } -out: - bch2_btree_iter_unlock(&iter); - return ret < 0 ? ret : 0; + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; } -int __must_check bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode) -{ - return __bch2_write_inode(c, inode, NULL, NULL); -} - -static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode) +int __must_check __bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) { + struct btree_trans trans; + struct bch_inode_unpacked inode_u; int ret; - mutex_lock(&inode->ei_update_lock); - inc_nlink(&inode->v); - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); - return ret; -} + ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; -static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode) -{ - int ret = 0; + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; + * this is important for inode updates via bchfs_write_index_update + */ + if (!ret) + bch2_inode_update_after_write(c, inode, &inode_u, fields); - mutex_lock(&inode->ei_update_lock); - drop_nlink(&inode->v); - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); - - return ret; + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; } static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) @@ -212,125 +211,173 @@ static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } -static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, - struct bch_inode_info *dir, - umode_t mode, dev_t rdev) +static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u, + const struct inode *dir, umode_t mode) { - struct posix_acl *default_acl = NULL, *acl = NULL; - struct bch_inode_info *inode; + kuid_t uid = current_fsuid(); + kgid_t gid; + + if (dir && dir->i_mode & S_ISGID) { + gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + gid = current_fsgid(); + + inode_u->bi_uid = from_kuid(dir->i_sb->s_user_ns, uid); + inode_u->bi_gid = from_kgid(dir->i_sb->s_user_ns, gid); + inode_u->bi_mode = mode; +} + +static int inode_update_for_create_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_unpacked *new_inode = p; + struct timespec now = current_time(&inode->v); + + bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + + if (S_ISDIR(new_inode->bi_mode)) + bi->bi_nlink++; + + return 0; +} + +static struct bch_inode_info * +__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, bool tmpfile) +{ + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans trans; + struct bch_inode_unpacked dir_u; + struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; + struct posix_acl *default_acl = NULL, *acl = NULL; int ret; - inode = to_bch_ei(new_inode(c->vfs_sb)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - - inode_init_owner(&inode->v, &dir->v, mode); - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl); - if (ret) - goto err_make_bad; -#endif - - bch2_inode_init(c, &inode_u, - i_uid_read(&inode->v), - i_gid_read(&inode->v), - inode->v.i_mode, rdev, - &dir->ei_inode); + bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); + bch2_inode_init_owner(&inode_u, &dir->v, mode); inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; + hash_info = bch2_hash_info_init(c, &inode_u); + + if (tmpfile) + inode_u.bi_flags |= BCH_INODE_UNLINKED; + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC); if (ret) - goto err_make_bad; + return ERR_PTR(ret); - ret = bch2_inode_create(c, &inode_u, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); +#ifdef CONFIG_BCACHEFS_POSIX_ACL + ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl); + if (ret) + goto err; +#endif + + /* + * preallocate vfs inode before btree transaction, so that nothing can + * fail after the transaction succeeds: + */ + inode = to_bch_ei(new_inode(c->vfs_sb)); + if (unlikely(!inode)) { + ret = -ENOMEM; + goto err; + } + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_inode_create(&trans, &inode_u, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint) ?: + (default_acl + ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, + default_acl, ACL_TYPE_DEFAULT) + : 0) ?: + (acl + ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, + acl, ACL_TYPE_ACCESS) + : 0) ?: + (!tmpfile + ? __bch2_dirent_create(&trans, dir->v.i_ino, + &dir->ei_str_hash, + mode_to_type(mode), + &dentry->d_name, + inode_u.bi_inum, + BCH_HASH_SET_MUST_CREATE) + : 0) ?: + (!tmpfile + ? bch2_write_inode_trans(&trans, dir, &dir_u, + inode_update_for_create_fn, + &inode_u) + : 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; if (unlikely(ret)) - goto err_acct_quota; + goto err_trans; - bch2_vfs_inode_init(c, inode, &inode_u); atomic_long_inc(&c->nr_inodes); - if (default_acl) { - ret = __bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT); - if (unlikely(ret)) - goto err; + if (!tmpfile) { + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(dir, inode->ei_journal_seq); } - if (acl) { - ret = __bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS); - if (unlikely(ret)) - goto err; + bch2_vfs_inode_init(c, inode, &inode_u); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); + + /* + * we must insert the new inode into the inode cache before calling + * bch2_trans_exit() and dropping locks, else we could race with another + * thread pulling the inode in and modifying it: + */ + + old = to_bch_ei(insert_inode_locked2(&inode->v)); + if (unlikely(old)) { + /* + * We raced, another process pulled the new inode into cache + * before us: + */ + old->ei_journal_seq = inode->ei_journal_seq; + make_bad_inode(&inode->v); + iput(&inode->v); + + inode = old; + } else { + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); } - insert_inode_hash(&inode->v); + bch2_trans_exit(&trans); out: posix_acl_release(default_acl); posix_acl_release(acl); return inode; -err_acct_quota: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); -err_make_bad: - /* - * indicate to bch_evict_inode that the inode was never actually - * created: - */ +err_trans: + bch2_trans_exit(&trans); make_bad_inode(&inode->v); -err: - clear_nlink(&inode->v); iput(&inode->v); +err: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); inode = ERR_PTR(ret); goto out; } -static int bch2_vfs_dirent_create(struct bch_fs *c, - struct bch_inode_info *dir, - u8 type, const struct qstr *name, - u64 dst) -{ - int ret; - - ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash, - type, name, dst, - &dir->ei_journal_seq, - BCH_HASH_SET_MUST_CREATE); - if (unlikely(ret)) - return ret; - - dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v); - mark_inode_dirty_sync(&dir->v); - return 0; -} - -static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct bch_inode_info *inode; - int ret; - - inode = bch2_vfs_inode_create(c, dir, mode, rdev); - if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); - - ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode), - &dentry->d_name, inode->v.i_ino); - if (unlikely(ret)) { - clear_nlink(&inode->v); - iput(&inode->v); - return ret; - } - - if (dir->ei_journal_seq > inode->ei_journal_seq) - inode->ei_journal_seq = dir->ei_journal_seq; - - d_instantiate(dentry, &inode->v); - return 0; -} - /* methods */ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, @@ -354,7 +401,70 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, static int bch2_create(struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { - return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0); + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; +} + +static int inode_update_for_link_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct timespec now = current_time(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, now); + + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else + bi->bi_nlink++; + + return 0; +} + +static int __bch2_link(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_info *dir, + struct dentry *dentry) +{ + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_create(&trans, dir->v.i_ino, + &dir->ei_str_hash, + mode_to_type(inode->v.i_mode), + &dentry->d_name, + inode->v.i_ino, + BCH_HASH_SET_MUST_CREATE) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_link_fn, + NULL) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + + if (ret == -EINTR) + goto retry; + + if (likely(!ret)) + bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + + bch2_trans_exit(&trans); + return ret; } static int bch2_link(struct dentry *old_dentry, struct inode *vdir, @@ -365,55 +475,89 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); int ret; - lockdep_assert_held(&inode->v.i_rwsem); - - inode->v.i_ctime = current_time(&dir->v); - - ret = bch2_inc_nlink(c, inode); - if (ret) + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) return ret; ihold(&inode->v); - - ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode), - &dentry->d_name, inode->v.i_ino); - if (unlikely(ret)) { - bch2_dec_nlink(c, inode); - iput(&inode->v); - return ret; - } - d_instantiate(dentry, &inode->v); return 0; } +static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_info *unlink_inode = p; + struct timespec now = current_time(&inode->v); + + bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + + bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); + + return 0; +} + +static int inode_update_for_unlink_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct timespec now = current_time(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, now); + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; + + return 0; +} + static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_unpacked dir_u, inode_u; + struct btree_trans trans; int ret; - lockdep_assert_held(&inode->v.i_rwsem); + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); - ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash, - &dentry->d_name, &dir->ei_journal_seq); + ret = __bch2_dirent_delete(&trans, dir->v.i_ino, + &dir->ei_str_hash, + &dentry->d_name) ?: + bch2_write_inode_trans(&trans, dir, &dir_u, + inode_update_dir_for_unlink_fn, + inode) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_unlink_fn, + NULL) ?: + bch2_trans_commit(&trans, NULL, NULL, + &dir->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; if (ret) - return ret; + goto err; if (dir->ei_journal_seq > inode->ei_journal_seq) inode->ei_journal_seq = dir->ei_journal_seq; - inode->v.i_ctime = dir->v.i_ctime; + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_MTIME); +err: + bch2_trans_exit(&trans); - if (S_ISDIR(inode->v.i_mode)) { - bch2_dec_nlink(c, dir); - drop_nlink(&inode->v); - } - - bch2_dec_nlink(c, inode); - - return 0; + return ret; } static int bch2_symlink(struct inode *vdir, struct dentry *dentry, @@ -423,7 +567,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0); + inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -438,37 +582,28 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry, if (unlikely(ret)) goto err; - /* XXX: racy */ - if (dir->ei_journal_seq < inode->ei_journal_seq) - dir->ei_journal_seq = inode->ei_journal_seq; + journal_seq_copy(dir, inode->ei_journal_seq); - ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, - inode->v.i_ino); + ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; d_instantiate(dentry, &inode->v); return 0; err: - clear_nlink(&inode->v); iput(&inode->v); return ret; } static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) { - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - int ret; + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false); - lockdep_assert_held(&dir->v.i_rwsem); - - ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0); - if (unlikely(ret)) - return ret; - - bch2_inc_nlink(c, dir); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, &inode->v); return 0; } @@ -485,151 +620,197 @@ static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) static int bch2_mknod(struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { - return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev); + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; } -static int bch2_rename(struct bch_fs *c, - struct bch_inode_info *old_dir, - struct dentry *old_dentry, - struct bch_inode_info *new_dir, - struct dentry *new_dentry) +struct rename_info { + u64 now; + struct bch_inode_info *src_dir; + struct bch_inode_info *dst_dir; + struct bch_inode_info *src_inode; + struct bch_inode_info *dst_inode; + enum bch_rename_mode mode; +}; + +static int inode_update_for_rename_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); - struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); - struct timespec now = current_time(&old_dir->v); - int ret; + struct rename_info *info = p; - lockdep_assert_held(&old_dir->v.i_rwsem); - lockdep_assert_held(&new_dir->v.i_rwsem); - - if (new_inode) - filemap_write_and_wait_range(old_inode->v.i_mapping, - 0, LLONG_MAX); - - if (new_inode && S_ISDIR(old_inode->v.i_mode)) { - lockdep_assert_held(&new_inode->v.i_rwsem); - - if (!S_ISDIR(new_inode->v.i_mode)) - return -ENOTDIR; - - if (bch2_empty_dir(c, new_inode->v.i_ino)) - return -ENOTEMPTY; - - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE); - if (unlikely(ret)) - return ret; - - clear_nlink(&new_inode->v); - bch2_dec_nlink(c, old_dir); - } else if (new_inode) { - lockdep_assert_held(&new_inode->v.i_rwsem); - - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE); - if (unlikely(ret)) - return ret; - - new_inode->v.i_ctime = now; - bch2_dec_nlink(c, new_inode); - } else if (S_ISDIR(old_inode->v.i_mode)) { - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME); - if (unlikely(ret)) - return ret; - - bch2_inc_nlink(c, new_dir); - bch2_dec_nlink(c, old_dir); - } else { - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME); - if (unlikely(ret)) - return ret; + if (inode == info->src_dir) { + bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); + bi->bi_nlink += info->dst_inode && + S_ISDIR(info->dst_inode->v.i_mode) && + info->mode == BCH_RENAME_EXCHANGE; } - old_dir->v.i_ctime = old_dir->v.i_mtime = now; - new_dir->v.i_ctime = new_dir->v.i_mtime = now; - mark_inode_dirty_sync(&old_dir->v); - mark_inode_dirty_sync(&new_dir->v); + if (inode == info->dst_dir) { + bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode); + bi->bi_nlink -= info->dst_inode && + S_ISDIR(info->dst_inode->v.i_mode); + } - old_inode->v.i_ctime = now; - mark_inode_dirty_sync(&old_inode->v); + if (inode == info->dst_inode && + info->mode == BCH_RENAME_OVERWRITE) { + BUG_ON(bi->bi_nlink && + S_ISDIR(info->dst_inode->v.i_mode)); + + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; + } + + if (inode == info->src_dir || + inode == info->dst_dir) + bi->bi_mtime = info->now; + bi->bi_ctime = info->now; return 0; } -static int bch2_rename_exchange(struct bch_fs *c, - struct bch_inode_info *old_dir, - struct dentry *old_dentry, - struct bch_inode_info *new_dir, - struct dentry *new_dentry) -{ - struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); - struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); - struct timespec now = current_time(&old_dir->v); - int ret; - - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE); - if (unlikely(ret)) - return ret; - - if (S_ISDIR(old_inode->v.i_mode) != - S_ISDIR(new_inode->v.i_mode)) { - if (S_ISDIR(old_inode->v.i_mode)) { - bch2_inc_nlink(c, new_dir); - bch2_dec_nlink(c, old_dir); - } else { - bch2_dec_nlink(c, new_dir); - bch2_inc_nlink(c, old_dir); - } - } - - old_dir->v.i_ctime = old_dir->v.i_mtime = now; - new_dir->v.i_ctime = new_dir->v.i_mtime = now; - mark_inode_dirty_sync(&old_dir->v); - mark_inode_dirty_sync(&new_dir->v); - - old_inode->v.i_ctime = now; - new_inode->v.i_ctime = now; - mark_inode_dirty_sync(&old_inode->v); - mark_inode_dirty_sync(&new_inode->v); - - return 0; -} - -static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry, - struct inode *new_vdir, struct dentry *new_dentry, +static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, + struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { - struct bch_fs *c = old_vdir->i_sb->s_fs_info; - struct bch_inode_info *old_dir = to_bch_ei(old_vdir); - struct bch_inode_info *new_dir = to_bch_ei(new_vdir); + struct bch_fs *c = src_vdir->i_sb->s_fs_info; + struct rename_info i = { + .now = timespec_to_bch2_time(c, + current_time(src_vdir)), + .src_dir = to_bch_ei(src_vdir), + .dst_dir = to_bch_ei(dst_vdir), + .src_inode = to_bch_ei(src_dentry->d_inode), + .dst_inode = to_bch_ei(dst_dentry->d_inode), + .mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? BCH_RENAME_OVERWRITE : BCH_RENAME, + }; + struct btree_trans trans; + struct bch_inode_unpacked dst_dir_u, src_dir_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u; + u64 journal_seq = 0; + int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) return -EINVAL; - if (flags & RENAME_EXCHANGE) - return bch2_rename_exchange(c, old_dir, old_dentry, - new_dir, new_dentry); + if (i.mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(i.src_inode->v.i_mode) != + S_ISDIR(i.dst_inode->v.i_mode)) + return -ENOTDIR; - return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry); + if (S_ISDIR(i.src_inode->v.i_mode) && + bch2_empty_dir(c, i.dst_inode->v.i_ino)) + return -ENOTEMPTY; + + ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping, + 0, LLONG_MAX); + if (ret) + return ret; + } + + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + i.now = timespec_to_bch2_time(c, current_time(src_vdir)), + + ret = bch2_dirent_rename(&trans, + i.src_dir, &src_dentry->d_name, + i.dst_dir, &dst_dentry->d_name, + i.mode) ?: + bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u, + inode_update_for_rename_fn, &i) ?: + (i.src_dir != i.dst_dir + ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u, + inode_update_for_rename_fn, &i) + : 0 ) ?: + bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u, + inode_update_for_rename_fn, &i) ?: + (i.dst_inode + ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, + inode_update_for_rename_fn, &i) + : 0 ) ?: + bch2_trans_commit(&trans, NULL, NULL, + &journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(c, i.src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(i.src_dir, journal_seq); + + if (i.src_dir != i.dst_dir) { + bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(i.dst_dir, journal_seq); + } + + bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, + ATTR_CTIME); + if (i.dst_inode) + bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u, + ATTR_CTIME); +err: + bch2_trans_exit(&trans); + + return ret; +} + +static int inode_update_for_setattr_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct iattr *attr = p; + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid); + if (ia_valid & ATTR_GID) + bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid); + + if (ia_valid & ATTR_ATIME) + bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); + + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + kgid_t gid = ia_valid & ATTR_GID + ? attr->ia_gid + : inode->v.i_gid; + + if (!in_group_p(gid) && + !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) + mode &= ~S_ISGID; + bi->bi_mode = mode; + } + + return 0; } static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid = inode->ei_qid; + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; unsigned qtypes = 0; int ret; @@ -654,19 +835,39 @@ static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iatt inode->v.i_blocks + inode->ei_quota_reserved); if (ret) - goto out_unlock; + goto err; } - setattr_copy(&inode->v, iattr); + bch2_trans_init(&trans, c); +retry: + bch2_trans_begin(&trans); + kfree(acl); + acl = NULL; - ret = bch2_write_inode(c, inode); -out_unlock: + ret = bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_setattr_fn, iattr) ?: + (iattr->ia_valid & ATTR_MODE + ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) + : 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err_trans; + + bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid); + + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +err_trans: + bch2_trans_exit(&trans); +err: mutex_unlock(&inode->ei_update_lock); - if (!ret && - iattr->ia_valid & ATTR_MODE) - ret = posix_acl_chmod(&inode->v, inode->v.i_mode); - return ret; } @@ -723,16 +924,14 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) { - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode; + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); - /* XXX: i_nlink should be 0? */ - inode = bch2_vfs_inode_create(c, dir, mode, 0); - if (unlikely(IS_ERR(inode))) + if (IS_ERR(inode)) return PTR_ERR(inode); - d_tmpfile(dentry, &inode->v); + d_mark_tmpfile(dentry, &inode->v); + d_instantiate(dentry, &inode->v); return 0; } @@ -987,24 +1186,17 @@ static void bch2_vfs_inode_init(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { - inode->v.i_mode = bi->bi_mode; - i_uid_write(&inode->v, bi->bi_uid); - i_gid_write(&inode->v, bi->bi_gid); + bch2_inode_update_after_write(c, inode, bi, ~0); + inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; - set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode)); inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; - inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); - inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); - inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; - inode->ei_qid = bch_qid(bi); inode->ei_str_hash = bch2_hash_info_init(c, bi); - inode->ei_inode = *bi; bch2_inode_flags_to_vfs(inode); @@ -1059,6 +1251,19 @@ static void bch2_destroy_inode(struct inode *vinode) call_rcu(&vinode->i_rcu, bch2_i_callback); } +static int inode_update_times_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); + bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); + bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); + + return 0; +} + static int bch2_vfs_write_inode(struct inode *vinode, struct writeback_control *wbc) { @@ -1067,7 +1272,8 @@ static int bch2_vfs_write_inode(struct inode *vinode, int ret; mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode); + ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); if (c->opts.journal_flush_disabled) @@ -1096,7 +1302,9 @@ static void bch2_evict_inode(struct inode *vinode) bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, BCH_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); - atomic_long_dec(&c->nr_inodes); + + WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0, + "nr_inodes < 0"); } } diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index fbbc7a3a..e2fc2706 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -51,8 +51,16 @@ struct bch_inode_unpacked; typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); +void bch2_inode_update_after_write(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + unsigned); +int __must_check bch2_write_inode_trans(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + inode_set_fn, void *); int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *); + inode_set_fn, void *, unsigned); int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index edf714f7..f6035cc7 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -126,16 +126,22 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) struct hash_check { struct bch_hash_info info; - struct btree_iter chain; - struct btree_iter iter; + struct btree_trans *trans; + + /* start of current chain of hash collisions: */ + struct btree_iter *chain; + + /* next offset in current chain of hash collisions: */ u64 next; }; static void hash_check_init(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c) + struct btree_trans *trans, + struct hash_check *h) { - bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0); - bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0); + h->trans = trans; + h->chain = bch2_trans_get_iter(trans, desc.btree_id, POS_MIN, 0); + h->next = -1; } static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, @@ -173,6 +179,75 @@ err: return ret; } +/* fsck hasn't been converted to new transactions yet: */ +static int fsck_hash_delete_at(const struct bch_hash_desc desc, + struct bch_hash_info *info, + struct btree_iter *orig_iter) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_btree_iter_unlock(orig_iter); + + bch2_trans_init(&trans, orig_iter->c); +retry: + bch2_trans_begin(&trans); + + iter = bch2_trans_copy_iter(&trans, orig_iter); + if (IS_ERR(iter)) { + ret = PTR_ERR(iter); + goto err; + } + + ret = bch2_hash_delete_at(&trans, desc, info, iter) ?: + bch2_trans_commit(&trans, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_exit(&trans); + return ret; +} + +static int hash_check_duplicates(const struct bch_hash_desc desc, + struct hash_check *h, struct bch_fs *c, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + struct btree_iter *iter; + struct bkey_s_c k2; + char buf[200]; + int ret = 0; + + if (!bkey_cmp(h->chain->pos, k_iter->pos)) + return 0; + + iter = bch2_trans_copy_iter(h->trans, h->chain); + BUG_ON(IS_ERR(iter)); + + for_each_btree_key_continue(iter, 0, k2) { + if (bkey_cmp(k2.k->p, k.k->p) >= 0) + break; + + if (fsck_err_on(k2.k->type == desc.key_type && + !desc.cmp_bkey(k, k2), c, + "duplicate hash table keys:\n%s", + (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), + buf, sizeof(buf), k), buf))) { + ret = fsck_hash_delete_at(desc, &h->info, k_iter); + if (ret) + return ret; + ret = 1; + break; + } + } +fsck_err: + bch2_trans_iter_free(h->trans, iter); + return ret; +} + static int hash_check_key(const struct bch_hash_desc desc, struct hash_check *h, struct bch_fs *c, struct btree_iter *k_iter, struct bkey_s_c k) @@ -185,13 +260,8 @@ static int hash_check_key(const struct bch_hash_desc desc, k.k->type != desc.key_type) return 0; - if (k.k->p.offset != h->next) { - if (!btree_iter_linked(&h->chain)) { - bch2_btree_iter_link(k_iter, &h->chain); - bch2_btree_iter_link(k_iter, &h->iter); - } - bch2_btree_iter_copy(&h->chain, k_iter); - } + if (k.k->p.offset != h->next) + bch2_btree_iter_copy(h->chain, k_iter); h->next = k.k->p.offset + 1; if (k.k->type != desc.key_type) @@ -199,11 +269,11 @@ static int hash_check_key(const struct bch_hash_desc desc, hashed = desc.hash_bkey(&h->info, k); - if (fsck_err_on(hashed < h->chain.pos.offset || + if (fsck_err_on(hashed < h->chain->pos.offset || hashed > k.k->p.offset, c, "hash table key at wrong offset: %llu, " "hashed to %llu chain starts at %llu\n%s", - k.k->p.offset, hashed, h->chain.pos.offset, + k.k->p.offset, hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), buf, sizeof(buf), k), buf))) { ret = hash_redo_key(desc, h, c, k_iter, k, hashed); @@ -214,25 +284,7 @@ static int hash_check_key(const struct bch_hash_desc desc, return 1; } - if (!bkey_cmp(h->chain.pos, k_iter->pos)) - return 0; - - bch2_btree_iter_copy(&h->iter, &h->chain); - while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) { - struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter); - - if (fsck_err_on(k2.k->type == desc.key_type && - !desc.cmp_bkey(k, k2), c, - "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), - buf, sizeof(buf), k), buf))) { - ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL); - if (ret) - return ret; - return 1; - } - bch2_btree_iter_next(&h->iter); - } + ret = hash_check_duplicates(desc, h, c, k_iter, k); fsck_err: return ret; } @@ -250,6 +302,8 @@ static int check_extents(struct bch_fs *c) u64 i_sectors; int ret = 0; + bch_verbose(c, "checking extents"); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(BCACHEFS_ROOT_INO, 0), 0, k) { ret = walk_inode(c, &w, k.k->p.inode); @@ -332,16 +386,25 @@ static int check_dirents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct hash_check h; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; unsigned name_len; char buf[200]; int ret = 0; - hash_check_init(bch2_dirent_hash_desc, &h, c); + bch_verbose(c, "checking dirents"); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { + bch2_trans_init(&trans, c); + + BUG_ON(bch2_trans_preload_iters(&trans)); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); + + hash_check_init(bch2_dirent_hash_desc, &trans, &h); + + for_each_btree_key_continue(iter, 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -360,7 +423,7 @@ static int check_dirents(struct bch_fs *c) mode_to_type(w.inode.bi_mode), (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, buf, sizeof(buf), k), buf))) { - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(iter, 0); if (ret) goto err; continue; @@ -369,7 +432,7 @@ static int check_dirents(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k); + ret = hash_check_key(bch2_dirent_hash_desc, &h, c, iter, k); if (ret > 0) { ret = 0; continue; @@ -393,7 +456,7 @@ static int check_dirents(struct bch_fs *c) fsck_err_on(name_len == 2 && !memcmp(d.v->d_name, "..", 2), c, ".. dirent")) { - ret = remove_dirent(c, &iter, d); + ret = remove_dirent(c, iter, d); if (ret) goto err; continue; @@ -403,7 +466,7 @@ static int check_dirents(struct bch_fs *c) "dirent points to own directory:\n%s", (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, buf, sizeof(buf), k), buf))) { - ret = remove_dirent(c, &iter, d); + ret = remove_dirent(c, iter, d); if (ret) goto err; continue; @@ -420,7 +483,7 @@ static int check_dirents(struct bch_fs *c) "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, buf, sizeof(buf), k), buf))) { - ret = remove_dirent(c, &iter, d); + ret = remove_dirent(c, iter, d); if (ret) goto err; continue; @@ -446,7 +509,7 @@ static int check_dirents(struct bch_fs *c) ret = bch2_btree_insert_at(c, NULL, NULL, NULL, BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &n->k_i)); + BTREE_INSERT_ENTRY(iter, &n->k_i)); kfree(n); if (ret) goto err; @@ -455,9 +518,7 @@ static int check_dirents(struct bch_fs *c) } err: fsck_err: - bch2_btree_iter_unlock(&h.chain); - bch2_btree_iter_unlock(&h.iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* @@ -468,14 +529,23 @@ static int check_xattrs(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct hash_check h; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - hash_check_init(bch2_xattr_hash_desc, &h, c); + bch_verbose(c, "checking xattrs"); - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { + bch2_trans_init(&trans, c); + + BUG_ON(bch2_trans_preload_iters(&trans)); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + POS(BCACHEFS_ROOT_INO, 0), 0); + + hash_check_init(bch2_xattr_hash_desc, &trans, &h); + + for_each_btree_key_continue(iter, 0, k) { ret = walk_inode(c, &w, k.k->p.inode); if (ret) break; @@ -483,7 +553,7 @@ static int check_xattrs(struct bch_fs *c) if (fsck_err_on(!w.have_inode, c, "xattr for missing inode %llu", k.k->p.inode)) { - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(iter, 0); if (ret) goto err; continue; @@ -492,15 +562,13 @@ static int check_xattrs(struct bch_fs *c) if (w.first_this_inode && w.have_inode) hash_check_set_inode(&h, c, &w.inode); - ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k); + ret = hash_check_key(bch2_xattr_hash_desc, &h, c, iter, k); if (ret) goto fsck_err; } err: fsck_err: - bch2_btree_iter_unlock(&h.chain); - bch2_btree_iter_unlock(&h.iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* Get root directory, create if it doesn't exist: */ @@ -509,6 +577,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) struct bkey_inode_buf packed; int ret; + bch_verbose(c, "checking root directory"); + ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); if (ret && ret != -ENOENT) return ret; @@ -546,6 +616,8 @@ static int check_lostfound(struct bch_fs *c, u64 inum; int ret; + bch_verbose(c, "checking lost+found"); + inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, &lostfound); if (!inum) { @@ -672,6 +744,8 @@ static int check_directory_structure(struct bch_fs *c, u64 d_inum; int ret = 0; + bch_verbose(c, "checking directory structure"); + /* DFS: */ restart_dfs: had_unreachable = false; @@ -872,15 +946,116 @@ s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum) return bch2_btree_iter_unlock(&iter) ?: sectors; } -static int bch2_gc_do_inode(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - struct btree_iter *iter, - struct bkey_s_c_inode inode, struct nlink link) +static int check_inode_nlink(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + struct bch_inode_unpacked *u, + struct nlink *link, + bool *do_update) +{ + u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED + ? 0 + : u->bi_nlink + nlink_bias(u->bi_mode); + u32 real_i_nlink = + link->count * nlink_bias(u->bi_mode) + + link->dir_count; + int ret = 0; + + /* + * These should have been caught/fixed by earlier passes, we don't + * repair them here: + */ + if (S_ISDIR(u->bi_mode) && link->count > 1) { + need_fsck_err(c, "directory %llu with multiple hardlinks: %u", + u->bi_inum, link->count); + return 0; + } + + if (S_ISDIR(u->bi_mode) && !link->count) { + need_fsck_err(c, "unreachable directory found (inum %llu)", + u->bi_inum); + return 0; + } + + if (!S_ISDIR(u->bi_mode) && link->dir_count) { + need_fsck_err(c, "non directory with subdirectories", + u->bi_inum); + return 0; + } + + if (!link->count && + !(u->bi_flags & BCH_INODE_UNLINKED) && + (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", + u->bi_inum, mode_to_type(u->bi_mode)) == + FSCK_ERR_IGNORE) + return 0; + + ret = reattach_inode(c, lostfound_inode, u->bi_inum); + if (ret) + return ret; + + link->count = 1; + real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; + goto set_i_nlink; + } + + if (i_nlink < link->count) { + if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", + u->bi_inum, i_nlink, link->count, + mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (i_nlink != real_i_nlink && + c->sb.clean) { + if (fsck_err(c, "filesystem marked clean, " + "but inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (i_nlink != real_i_nlink && + (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (fsck_err(c, "inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (real_i_nlink && i_nlink != real_i_nlink) + bch_verbose(c, "setting inode %llu nlink from %u to %u", + u->bi_inum, i_nlink, real_i_nlink); +set_i_nlink: + if (i_nlink != real_i_nlink) { + if (real_i_nlink) { + u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode); + u->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + u->bi_nlink = 0; + u->bi_flags |= BCH_INODE_UNLINKED; + } + + *do_update = true; + } +fsck_err: + return ret; +} + +static int check_inode(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + struct btree_iter *iter, + struct bkey_s_c_inode inode, + struct nlink *link) { struct bch_inode_unpacked u; - int ret = 0; - u32 i_nlink, real_i_nlink; bool do_update = false; + int ret = 0; ret = bch2_inode_unpack(inode, &u); if (bch2_fs_inconsistent_on(ret, c, @@ -888,48 +1063,17 @@ static int bch2_gc_do_inode(struct bch_fs *c, inode.k->p.inode)) return ret; - i_nlink = u.bi_nlink + nlink_bias(u.bi_mode); - - fsck_err_on(i_nlink < link.count, c, - "inode %llu i_link too small (%u < %u, type %i)", - inode.k->p.inode, i_nlink, - link.count, mode_to_type(u.bi_mode)); - - /* These should have been caught/fixed by earlier passes: */ - if (S_ISDIR(u.bi_mode)) { - need_fsck_err_on(link.count > 1, c, - "directory %llu with multiple hardlinks: %u", - inode.k->p.inode, link.count); - - real_i_nlink = link.count * 2 + link.dir_count; - } else { - need_fsck_err_on(link.dir_count, c, - "found dirents for non directory %llu", - inode.k->p.inode); - - real_i_nlink = link.count + link.dir_count; + if (link) { + ret = check_inode_nlink(c, lostfound_inode, &u, link, + &do_update); + if (ret) + return ret; } - if (!link.count) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but found orphaned inode %llu", - inode.k->p.inode); + if (u.bi_flags & BCH_INODE_UNLINKED) { + bch_verbose(c, "deleting inode %llu", u.bi_inum); - if (fsck_err_on(S_ISDIR(u.bi_mode) && - bch2_empty_dir(c, inode.k->p.inode), c, - "non empty directory with link count 0, " - "inode nlink %u, dir links found %u", - i_nlink, link.dir_count)) { - ret = reattach_inode(c, lostfound_inode, - inode.k->p.inode); - if (ret) - return ret; - } - - bch_verbose(c, "deleting inode %llu", inode.k->p.inode); - - ret = bch2_inode_rm(c, inode.k->p.inode); + ret = bch2_inode_rm(c, u.bi_inum); if (ret) bch_err(c, "error in fs gc: error %i " "while deleting inode", ret); @@ -940,16 +1084,16 @@ static int bch2_gc_do_inode(struct bch_fs *c, fsck_err_on(c->sb.clean, c, "filesystem marked clean, " "but inode %llu has i_size dirty", - inode.k->p.inode); + u.bi_inum); - bch_verbose(c, "truncating inode %llu", inode.k->p.inode); + bch_verbose(c, "truncating inode %llu", u.bi_inum); /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away */ - ret = bch2_inode_truncate(c, inode.k->p.inode, + ret = bch2_inode_truncate(c, u.bi_inum, round_up(u.bi_size, PAGE_SIZE) >> 9, NULL, NULL); if (ret) { @@ -974,12 +1118,12 @@ static int bch2_gc_do_inode(struct bch_fs *c, fsck_err_on(c->sb.clean, c, "filesystem marked clean, " "but inode %llu has i_sectors dirty", - inode.k->p.inode); + u.bi_inum); bch_verbose(c, "recounting sectors for inode %llu", - inode.k->p.inode); + u.bi_inum); - sectors = bch2_count_inode_sectors(c, inode.k->p.inode); + sectors = bch2_count_inode_sectors(c, u.bi_inum); if (sectors < 0) { bch_err(c, "error in fs gc: error %i " "recounting inode sectors", @@ -992,20 +1136,6 @@ static int bch2_gc_do_inode(struct bch_fs *c, do_update = true; } - if (i_nlink != real_i_nlink) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has wrong i_nlink " - "(type %u i_nlink %u, should be %u)", - inode.k->p.inode, mode_to_type(u.bi_mode), - i_nlink, real_i_nlink); - - bch_verbose(c, "setting inode %llu nlinks from %u to %u", - inode.k->p.inode, i_nlink, real_i_nlink); - u.bi_nlink = real_i_nlink - nlink_bias(u.bi_mode); - do_update = true; - } - if (do_update) { struct bkey_inode_buf p; @@ -1024,9 +1154,9 @@ fsck_err: noinline_for_stack static int bch2_gc_walk_inodes(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - nlink_table *links, - u64 range_start, u64 range_end) + struct bch_inode_unpacked *lostfound_inode, + nlink_table *links, + u64 range_start, u64 range_end) { struct btree_iter iter; struct bkey_s_c k; @@ -1065,10 +1195,9 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); */ bch2_btree_iter_unlock(&iter); - ret = bch2_gc_do_inode(c, lostfound_inode, &iter, - bkey_s_c_to_inode(k), *link); - if (ret == -EINTR) - continue; + ret = check_inode(c, lostfound_inode, &iter, + bkey_s_c_to_inode(k), link); + BUG_ON(ret == -EINTR); if (ret) break; @@ -1103,6 +1232,8 @@ static int check_inode_nlinks(struct bch_fs *c, u64 this_iter_range_start, next_iter_range_start = 0; int ret = 0; + bch_verbose(c, "checking inode nlinks"); + genradix_init(&links); do { @@ -1129,68 +1260,103 @@ static int check_inode_nlinks(struct bch_fs *c, return ret; } +noinline_for_stack +static int check_inodes_fast(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + unsigned long nr_inodes = 0; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { + if (k.k->type != BCH_INODE_FS) + continue; + + inode = bkey_s_c_to_inode(k); + + if (!(inode.v->bi_flags & BCH_INODE_UNLINKED)) + nr_inodes++; + + if (inode.v->bi_flags & + (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED)) { + fsck_err_on(c->sb.clean, c, + "filesystem marked clean but found inode %llu with flags %x", + inode.k->p.inode, inode.v->bi_flags); + ret = check_inode(c, NULL, &iter, inode, NULL); + BUG_ON(ret == -EINTR); + if (ret) + break; + } + } + atomic_long_set(&c->nr_inodes, nr_inodes); +fsck_err: + return bch2_btree_iter_unlock(&iter) ?: ret; +} + /* * Checks for inconsistencies that shouldn't happen, unless we have a bug. * Doesn't fix them yet, mainly because they haven't yet been observed: */ -int bch2_fsck(struct bch_fs *c, bool full_fsck) +static int bch2_fsck_full(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; int ret; - if (full_fsck) { - bch_verbose(c, "checking extents"); - ret = check_extents(c); - if (ret) - return ret; - - bch_verbose(c, "checking dirents"); - ret = check_dirents(c); - if (ret) - return ret; - - bch_verbose(c, "checking xattrs"); - ret = check_xattrs(c); - if (ret) - return ret; - - bch_verbose(c, "checking root directory"); - ret = check_root(c, &root_inode); - if (ret) - return ret; - - bch_verbose(c, "checking lost+found"); - ret = check_lostfound(c, &root_inode, &lostfound_inode); - if (ret) - return ret; - - bch_verbose(c, "checking directory structure"); - ret = check_directory_structure(c, &lostfound_inode); - if (ret) - return ret; - - bch_verbose(c, "checking inode nlinks"); - ret = check_inode_nlinks(c, &lostfound_inode); - if (ret) - return ret; - } else { - bch_verbose(c, "checking root directory"); - ret = check_root(c, &root_inode); - if (ret) - return ret; - - bch_verbose(c, "checking lost+found"); - ret = check_lostfound(c, &root_inode, &lostfound_inode); - if (ret) - return ret; - - bch_verbose(c, "checking inode nlinks"); - ret = check_inode_nlinks(c, &lostfound_inode); - if (ret) - return ret; - } + bch_verbose(c, "starting fsck:"); + ret = check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: + check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_directory_structure(c, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); bch2_flush_fsck_errs(c); + bch_verbose(c, "fsck done"); - return 0; + return ret; +} + +static int bch2_fsck_inode_nlink(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + int ret; + + bch_verbose(c, "checking inode link counts:"); + ret = check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); + + bch2_flush_fsck_errs(c); + bch_verbose(c, "done"); + + return ret; +} + +static int bch2_fsck_walk_inodes_only(struct bch_fs *c) +{ + int ret; + + bch_verbose(c, "walking inodes:"); + ret = check_inodes_fast(c); + + bch2_flush_fsck_errs(c); + bch_verbose(c, "done"); + + return ret; +} + +int bch2_fsck(struct bch_fs *c) +{ + if (!c->opts.nofsck) + return bch2_fsck_full(c); + + if (!c->sb.clean && + !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) + return bch2_fsck_inode_nlink(c); + + return bch2_fsck_walk_inodes_only(c); } diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h index f9af1305..bc9caaf2 100644 --- a/libbcachefs/fsck.h +++ b/libbcachefs/fsck.h @@ -2,6 +2,6 @@ #define _BCACHEFS_FSCK_H s64 bch2_count_inode_sectors(struct bch_fs *, u64); -int bch2_fsck(struct bch_fs *, bool); +int bch2_fsck(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 3ae5ac97..d4139faa 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -203,6 +203,10 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) return "invalid data checksum type"; + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; + return NULL; } case BCH_INODE_BLOCKDEV: @@ -276,12 +280,27 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, } } -int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +static inline u32 bkey_generation(struct bkey_s_c k) { - struct bkey_inode_buf inode_p; - struct btree_iter iter; - bool searched_from_start = false; + switch (k.k->type) { + case BCH_INODE_BLOCKDEV: + case BCH_INODE_FS: + BUG(); + case BCH_INODE_GENERATION: + return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + default: + return 0; + } +} + +int __bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) +{ + struct bch_fs *c = trans->c; + struct bkey_inode_buf *inode_p; + struct btree_iter *iter; + u64 start; int ret; if (!max) @@ -290,82 +309,66 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, if (c->opts.inodes_32bit) max = min_t(u64, max, U32_MAX); - if (*hint >= max || *hint < min) - *hint = min; + start = READ_ONCE(*hint); - if (*hint == min) - searched_from_start = true; + if (start >= max || start < min) + start = min; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, POS(start, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); again: - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - u32 bi_generation = 0; + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = btree_iter_err(k); - if (ret) { - bch2_btree_iter_unlock(&iter); + if (ret) return ret; - } switch (k.k->type) { case BCH_INODE_BLOCKDEV: case BCH_INODE_FS: /* slot used */ - if (iter.pos.inode == max) + if (iter->pos.inode >= max) goto out; - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); break; - case BCH_INODE_GENERATION: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - /* fallthrough: */ - } default: - inode_u->bi_generation = bi_generation; - - bch2_inode_pack(&inode_p, inode_u); - inode_p.inode.k.p = k.k->p; - - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, - &inode_p.inode.k_i)); - - if (ret != -EINTR) { - bch2_btree_iter_unlock(&iter); - - if (!ret) { - inode_u->bi_inum = - inode_p.inode.k.p.inode; - *hint = inode_p.inode.k.p.inode + 1; - } - - return ret; - } - - if (ret == -EINTR) - continue; + *hint = k.k->p.inode; + inode_u->bi_inum = k.k->p.inode; + inode_u->bi_generation = bkey_generation(k); + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; } } out: - bch2_btree_iter_unlock(&iter); - - if (!searched_from_start) { + if (start != min) { /* Retry from start */ - *hint = min; - searched_from_start = true; + start = min; + bch2_btree_iter_set_pos(iter, POS(start, 0)); goto again; } return -ENOSPC; } +int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) +{ + return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + __bch2_inode_create(&trans, inode_u, min, max, hint)); +} + int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, struct extent_insert_hook *hook, u64 *journal_seq) { diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 26461063..a47194ab 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -38,8 +38,13 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); + +int __bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, + u64, u64, u64 *); int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, u64, u64, u64 *); + int bch2_inode_truncate(struct bch_fs *, u64, u64, struct extent_insert_hook *, u64 *); int bch2_inode_rm(struct bch_fs *, u64); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 58aee7ae..0af136d6 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -5,6 +5,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "dirent.h" #include "error.h" #include "fsck.h" #include "journal_io.h" @@ -14,6 +15,8 @@ #include +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + struct bkey_i *btree_root_find(struct bch_fs *c, struct bch_sb_field_clean *clean, struct jset *j, @@ -233,7 +236,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_fs_journal_start(&c->journal); err = "error starting allocator"; - if (bch2_fs_allocator_start(c)) + ret = bch2_fs_allocator_start(c); + if (ret) goto err; bch_verbose(c, "starting journal replay:"); @@ -246,12 +250,16 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.norecovery) goto out; - bch_verbose(c, "starting fsck:"); err = "error in fsck"; - ret = bch2_fsck(c, !c->opts.nofsck); + ret = bch2_fsck(c); if (ret) goto err; - bch_verbose(c, "fsck done"); + + if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + mutex_unlock(&c->sb_lock); + } if (enabled_qtypes(c)) { bch_verbose(c, "reading quotas:"); @@ -273,8 +281,10 @@ fsck_err: int bch2_fs_initialize(struct bch_fs *c) { - struct bch_inode_unpacked inode; + struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; + struct bch_hash_info root_hash_info; + struct qstr lostfound = QSTR("lost+found"); const char *err = "cannot allocate memory"; struct bch_dev *ca; LIST_HEAD(journal); @@ -307,21 +317,46 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_journal_set_replay_done(&c->journal); err = "error starting allocator"; - if (bch2_fs_allocator_start(c)) + ret = bch2_fs_allocator_start(c); + if (ret) goto err; - bch2_inode_init(c, &inode, 0, 0, + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - inode.bi_inum = BCACHEFS_ROOT_INO; - - bch2_inode_pack(&packed_inode, &inode); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_nlink++; /* lost+found */ + bch2_inode_pack(&packed_inode, &root_inode); err = "error creating root directory"; - if (bch2_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, NULL, 0)) + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, NULL, 0); + if (ret) goto err; + bch2_inode_init(c, &lostfound_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, + &root_inode); + lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1; + bch2_inode_pack(&packed_inode, &lostfound_inode); + + err = "error creating lost+found"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, NULL, 0); + if (ret) + goto err; + + root_hash_info = bch2_hash_info_init(c, &root_inode); + + ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, + &lostfound, lostfound_inode.bi_inum, NULL, + BTREE_INSERT_NOFAIL); + if (ret) + goto err; + + atomic_long_set(&c->nr_inodes, 2); + if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); if (ret) @@ -329,12 +364,14 @@ int bch2_fs_initialize(struct bch_fs *c) } err = "error writing first journal entry"; - if (bch2_journal_meta(&c->journal)) + ret = bch2_journal_meta(&c->journal); + if (ret) goto err; mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index c8051095..99f1fe87 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -125,21 +125,29 @@ struct bch_hash_desc { bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); }; -static inline struct bkey_s_c -bch2_hash_lookup_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, const void *search) +static inline struct btree_iter * +bch2_hash_lookup(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key, + unsigned flags) { - u64 inode = iter->pos.inode; + struct btree_iter *iter; struct bkey_s_c k; + iter = bch2_trans_get_iter(trans, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|flags); + if (IS_ERR(iter)) + return iter; + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { if (iter->pos.inode != inode) break; if (k.k->type == desc.key_type) { - if (!desc.cmp_key(k, search)) - return k; + if (!desc.cmp_key(k, key)) + return iter; } else if (k.k->type == desc.whiteout_type) { ; } else { @@ -147,97 +155,48 @@ bch2_hash_lookup_at(const struct bch_hash_desc desc, break; } } - return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); + + return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT); } -static inline struct bkey_s_c -bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, struct bkey_s_c search) +static inline struct btree_iter * +bch2_hash_hole(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) { - u64 inode = iter->pos.inode; + struct btree_iter *iter; struct bkey_s_c k; - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { - if (iter->pos.inode != inode) - break; - - if (k.k->type == desc.key_type) { - if (!desc.cmp_bkey(k, search)) - return k; - } else if (k.k->type == desc.whiteout_type) { - ; - } else { - /* hole, not found */ - break; - } - } - return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); -} - -static inline struct bkey_s_c -bch2_hash_lookup(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, const void *key) -{ - bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS); - - return bch2_hash_lookup_at(desc, info, iter, key); -} - -static inline struct bkey_s_c -bch2_hash_lookup_intent(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, const void *key) -{ - bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - return bch2_hash_lookup_at(desc, info, iter, key); -} - -static inline struct bkey_s_c -bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter) -{ - u64 inode = iter->pos.inode; - struct bkey_s_c k; + iter = bch2_trans_get_iter(trans, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return iter; for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { if (iter->pos.inode != inode) break; if (k.k->type != desc.key_type) - return k; + return iter; } - return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); + + return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC); } -static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, - const void *key) -{ - bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - return bch2_hash_hole_at(desc, iter); -} - -static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, +static inline int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, const struct bch_hash_info *info, - struct btree_iter *iter, struct btree_iter *start) { + struct btree_iter *iter; struct bkey_s_c k; - bch2_btree_iter_copy(iter, start); + iter = bch2_trans_copy_iter(trans, start); + if (IS_ERR(iter)) + return PTR_ERR(iter); + bch2_btree_iter_next_slot(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { @@ -252,142 +211,108 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, return btree_iter_err(k); } +static inline int __bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) +{ + struct btree_iter *iter, *slot = NULL; + struct bkey_s_c k; + + iter = bch2_trans_get_iter(trans, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; + + if (k.k->type == desc.key_type) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; + + /* hash collision: */ + continue; + } + + if (!slot && + !(flags & BCH_HASH_SET_MUST_REPLACE)) { + slot = bch2_trans_copy_iter(trans, iter); + if (IS_ERR(slot)) + return PTR_ERR(slot); + } + + if (k.k->type != desc.whiteout_type) + goto not_found; + } + + return btree_iter_err(k) ?: -ENOSPC; +not_found: + if (flags & BCH_HASH_SET_MUST_REPLACE) + return -ENOENT; + + insert->k.p = slot->pos; + bch2_trans_update(trans, slot, insert, 0); + return 0; +found: + if (flags & BCH_HASH_SET_MUST_CREATE) + return -EEXIST; + + insert->k.p = iter->pos; + bch2_trans_update(trans, iter, insert, 0); + return 0; +} + static inline int bch2_hash_set(const struct bch_hash_desc desc, const struct bch_hash_info *info, struct bch_fs *c, u64 inode, u64 *journal_seq, struct bkey_i *insert, int flags) { - struct btree_iter iter, hashed_slot; - struct bkey_s_c k; + return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC, + __bch2_hash_set(&trans, desc, info, + inode, insert, flags)); +} + +static inline int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter) +{ + struct bkey_i *delete; int ret; - bch2_btree_iter_init(&hashed_slot, c, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_link(&hashed_slot, &iter); -retry: - /* - * On hash collision, we have to keep the slot we hashed to locked while - * we do the insert - to avoid racing with another thread deleting - * whatever's in the slot we hashed to: - */ - ret = bch2_btree_iter_traverse(&hashed_slot); - if (ret) - goto err; - - /* - * On -EINTR/retry, we dropped locks - always restart from the slot we - * hashed to: - */ - bch2_btree_iter_copy(&iter, &hashed_slot); - - k = bch2_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert)); - - ret = btree_iter_err(k); - if (ret == -ENOENT) { - if (flags & BCH_HASH_SET_MUST_REPLACE) { - ret = -ENOENT; - goto err; - } - - /* - * Not found, so we're now looking for any open - * slot - we might have skipped over a whiteout - * that we could have used, so restart from the - * slot we hashed to: - */ - bch2_btree_iter_copy(&iter, &hashed_slot); - k = bch2_hash_hole_at(desc, &iter); - if ((ret = btree_iter_err(k))) - goto err; - } else if (!ret) { - if (flags & BCH_HASH_SET_MUST_CREATE) { - ret = -EEXIST; - goto err; - } - } else { - goto err; - } - - insert->k.p = iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC|flags, - BTREE_INSERT_ENTRY(&iter, insert)); -err: - if (ret == -EINTR) - goto retry; - - /* - * On successful insert, we don't want to clobber ret with error from - * iter: - */ - bch2_btree_iter_unlock(&iter); - bch2_btree_iter_unlock(&hashed_slot); - return ret; -} - -static inline int bch2_hash_delete_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - u64 *journal_seq) -{ - struct btree_iter whiteout_iter; - struct bkey_i delete; - int ret = -ENOENT; - - bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id, - iter->pos, BTREE_ITER_SLOTS); - bch2_btree_iter_link(iter, &whiteout_iter); - - ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter); + ret = bch2_hash_needs_whiteout(trans, desc, info, iter); if (ret < 0) - goto err; + return ret; - bkey_init(&delete.k); - delete.k.p = iter->pos; - delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED; + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + if (IS_ERR(delete)) + return PTR_ERR(delete); - ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(iter, &delete)); -err: - bch2_btree_iter_unlink(&whiteout_iter); - return ret; + bkey_init(&delete->k); + delete->k.p = iter->pos; + delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED; + + bch2_trans_update(trans, iter, delete, 0); + return 0; } -static inline int bch2_hash_delete(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - u64 *journal_seq, const void *key) +static inline int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) { - struct btree_iter iter, whiteout_iter; - struct bkey_s_c k; - int ret = -ENOENT; + struct btree_iter *iter; - bch2_btree_iter_init(&iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS); - bch2_btree_iter_link(&iter, &whiteout_iter); -retry: - k = bch2_hash_lookup_at(desc, info, &iter, key); - if ((ret = btree_iter_err(k))) - goto err; + iter = bch2_hash_lookup(trans, desc, info, inode, key, + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); - ret = bch2_hash_delete_at(desc, info, &iter, journal_seq); -err: - if (ret == -EINTR) - goto retry; - - bch2_btree_iter_unlock(&whiteout_iter); - bch2_btree_iter_unlock(&iter); - return ret; + return bch2_hash_delete_at(trans, desc, info, iter); } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 24c6cc56..1272ea7a 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -52,7 +52,7 @@ static int __bch2_strtoh(const char *cp, u64 *res, cp++; } while (isdigit(*cp)); - for (u = 1; u < ARRAY_SIZE(si_units); u++) + for (u = 1; u < strlen(si_units); u++) if (*cp == si_units[u]) { cp++; goto got_unit; diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index c6b5015a..7d0fee3a 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -74,7 +74,6 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr; - unsigned u64s; switch (k.k->type) { case BCH_XATTR: @@ -82,13 +81,15 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) return "value too small"; xattr = bkey_s_c_to_xattr(k); - u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len)); - if (bkey_val_u64s(k.k) < u64s) + if (bkey_val_u64s(k.k) < + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))) return "value too small"; - if (bkey_val_u64s(k.k) > u64s) + if (bkey_val_u64s(k.k) > + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)) return "value too big"; handler = bch2_xattr_type_to_handler(xattr.v->x_type); @@ -142,32 +143,28 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf, } } -struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *c, - struct btree_iter *iter, - struct bch_inode_info *inode, - const char *name, int type) -{ - return bch2_hash_lookup(bch2_xattr_hash_desc, - &inode->ei_str_hash, - c, inode->v.i_ino, iter, - &X_SEARCH(type, name, strlen(name))); -} - int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) + const char *name, void *buffer, size_t size, int type) { - struct btree_iter iter; - struct bkey_s_c k; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c_xattr xattr; int ret; - k = bch2_hash_lookup(bch2_xattr_hash_desc, &inode->ei_str_hash, c, - inode->v.i_ino, &iter, - &X_SEARCH(type, name, strlen(name))); - if (IS_ERR(k.k)) - return bch2_btree_iter_unlock(&iter) ?: -ENODATA; + bch2_trans_init(&trans, c); - xattr = bkey_s_c_to_xattr(k); + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); + if (IS_ERR(iter)) { + bch2_trans_exit(&trans); + BUG_ON(PTR_ERR(iter) == -EINTR); + + return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter); + } + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) @@ -176,47 +173,48 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } -int bch2_xattr_set(struct bch_fs *c, u64 inum, +int bch2_xattr_set(struct btree_trans *trans, u64 inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, - int flags, int type, u64 *journal_seq) + int type, int flags) { - struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); int ret; if (value) { struct bkey_i_xattr *xattr; + unsigned namelen = strlen(name); unsigned u64s = BKEY_U64s + - xattr_val_u64s(search.name.len, size); + xattr_val_u64s(namelen, size); if (u64s > U8_MAX) return -ERANGE; - xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (!xattr) - return -ENOMEM; + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); bkey_xattr_init(&xattr->k_i); xattr->k.u64s = u64s; xattr->v.x_type = type; - xattr->v.x_name_len = search.name.len; + xattr->v.x_name_len = namelen; xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, search.name.name, search.name.len); + memcpy(xattr->v.x_name, name, namelen); memcpy(xattr_val(&xattr->v), value, size); - ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c, - inum, journal_seq, - &xattr->k_i, - (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| - (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); - kfree(xattr); + ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inum, &xattr->k_i, + (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| + (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); } else { - ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info, - c, inum, journal_seq, &search); + struct xattr_search_key search = + X_SEARCH(type, name, strlen(name)); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, + hash_info, inum, &search); } if (ret == -ENOENT) @@ -308,9 +306,11 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - return bch2_xattr_set(c, inode->v.i_ino, &inode->ei_str_hash, - name, value, size, flags, handler->flags, - &inode->ei_journal_seq); + return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, + bch2_xattr_set(&trans, inode->v.i_ino, + &inode->ei_str_hash, + name, value, size, + handler->flags, flags)); } static const struct xattr_handler bch_xattr_user_handler = { @@ -433,7 +433,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s); + ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); mutex_unlock(&inode->ei_update_lock); if (value && diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 1365032d..0689d327 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -35,15 +35,12 @@ struct xattr_handler; struct bch_hash_info; struct bch_inode_info; -struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *, - struct btree_iter *, - struct bch_inode_info *, - const char *, int); int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *, - const char *, const void *, size_t, int, int, u64 *); +int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + ssize_t bch2_xattr_list(struct dentry *, char *, size_t); extern const struct xattr_handler *bch2_xattr_handlers[];