From 1f7098c22213bbe66896f390a529223468a3986e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 9 Nov 2019 22:49:03 -0500 Subject: [PATCH] Update bcachefs sources to b1a4dc53be bcachefs: Set lost+found mode to 0700 --- .bcachefs_revision | 2 +- libbcachefs/bkey.c | 26 ++- libbcachefs/bkey_sort.c | 8 +- libbcachefs/bset.c | 354 +++++++++++++++----------------------- libbcachefs/bset.h | 4 +- libbcachefs/btree_cache.c | 8 +- libbcachefs/btree_iter.c | 7 +- libbcachefs/buckets.c | 26 +-- libbcachefs/clock.c | 7 +- libbcachefs/clock.h | 13 +- libbcachefs/error.c | 13 +- libbcachefs/error.h | 1 + libbcachefs/extents.c | 35 ++-- libbcachefs/fs-io.c | 173 ++++++++++--------- libbcachefs/fs-io.h | 4 +- libbcachefs/fs.c | 59 ++++++- libbcachefs/fs.h | 37 +++- libbcachefs/fsck.c | 2 +- libbcachefs/io.c | 24 +-- libbcachefs/opts.h | 11 ++ libbcachefs/reflink.c | 4 +- 21 files changed, 426 insertions(+), 392 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 9676940a..e0172a41 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -9e76e8d98c52c128641b0f916a1990a37d60d22e +b1a4dc53be10a4c3132fccaaf604d73861a52d2d diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index ed7ca5b0..4d0c9129 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -1058,26 +1058,20 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l, const struct bkey_packed *r, const struct btree *b) { - int packed = bkey_lr_packed(l, r); + struct bkey unpacked; - if (likely(packed == BKEY_PACKED_BOTH)) + if (likely(bkey_packed(l) && bkey_packed(r))) return __bch2_bkey_cmp_packed_format_checked(l, r, b); - switch (packed) { - case BKEY_PACKED_NONE: - return bkey_cmp(((struct bkey *) l)->p, - ((struct bkey *) r)->p); - case BKEY_PACKED_LEFT: - return __bch2_bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) l, - &((struct bkey *) r)->p); - case BKEY_PACKED_RIGHT: - return -__bch2_bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) r, - &((struct bkey *) l)->p); - default: - unreachable(); + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void*) &unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void*) &unpacked; } + + return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index e32fad5a..2cac269b 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -418,7 +418,7 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_packed *prev = NULL, *k_packed; struct bkey_s k; struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; + struct bkey unpacked; memset(&nr, 0, sizeof(nr)); @@ -426,11 +426,7 @@ bch2_sort_repack_merge(struct bch_fs *c, if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) > - BKEY_EXTENT_VAL_U64s_MAX); - - bch2_bkey_unpack(src, &tmp.k, k_packed); - k = bkey_i_to_s(&tmp.k); + k = __bkey_disassemble(src, k_packed, &unpacked); if (filter_whiteouts && bch2_bkey_normalize(c, k)) diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 19f13b7e..b7618e2b 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -294,38 +294,23 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, /* Auxiliary search trees */ -#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) -#define BFLOAT_FAILED_PREV (U8_MAX - 1) -#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) -#define BFLOAT_FAILED (U8_MAX - 2) - -#define 
KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) +#define BFLOAT_FAILED_UNPACKED U8_MAX +#define BFLOAT_FAILED U8_MAX struct bkey_float { u8 exponent; u8 key_offset; - union { - u32 mantissa32; - struct { - u16 mantissa16; - u16 _pad; - }; - }; -} __packed; - -#define BFLOAT_32BIT_NR 32U + u16 mantissa; +}; +#define BKEY_MANTISSA_BITS 16 static unsigned bkey_float_byte_offset(unsigned idx) { - int d = (idx - BFLOAT_32BIT_NR) << 1; - - d &= ~(d >> 31); - - return idx * 6 - d; + return idx * sizeof(struct bkey_float); } struct ro_aux_tree { - struct bkey_float _d[0]; + struct bkey_float f[0]; }; struct rw_aux_tree { @@ -380,8 +365,8 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) return t->aux_data_offset; case BSET_RO_AUX_TREE: return t->aux_data_offset + - DIV_ROUND_UP(bkey_float_byte_offset(t->size) + - sizeof(u8) * t->size, 8); + DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + + t->size * sizeof(u8), 8); case BSET_RW_AUX_TREE: return t->aux_data_offset + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); @@ -420,17 +405,11 @@ static u8 *ro_aux_tree_prev(const struct btree *b, return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); } -static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, - unsigned idx) -{ - return (void *) b + bkey_float_byte_offset(idx); -} - static struct bkey_float *bkey_float(const struct btree *b, const struct bset_tree *t, unsigned idx) { - return bkey_float_get(ro_aux_tree_base(b, t), idx); + return ro_aux_tree_base(b, t)->f + idx; } static void bset_aux_tree_verify(struct btree *b) @@ -669,21 +648,6 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, return idx; } -static inline unsigned bfloat_mantissa(const struct bkey_float *f, - unsigned idx) -{ - return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; -} - -static inline void bfloat_mantissa_set(struct bkey_float *f, - unsigned idx, unsigned mantissa) -{ - if (idx < BFLOAT_32BIT_NR) - f->mantissa32 = mantissa; - else - f->mantissa16 = mantissa; -} - static inline unsigned bkey_mantissa(const struct bkey_packed *k, const struct bkey_float *f, unsigned idx) @@ -703,9 +667,9 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ v >>= f->exponent & 7; #else - v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); + v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; #endif - return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; + return (u16) v; } static void make_bfloat(struct btree *b, struct bset_tree *t, @@ -715,14 +679,10 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *p = tree_to_prev_bkey(b, t, j); struct bkey_packed *l, *r; - unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; unsigned mantissa; int shift, exponent, high_bit; - EBUG_ON(bkey_next(p) != m); - if (is_power_of_2(j)) { l = min_key; @@ -764,8 +724,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * the original key. 
*/ - if (!bkey_packed(l) || !bkey_packed(r) || - !bkey_packed(p) || !bkey_packed(m) || + if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || !b->nr_key_bits) { f->exponent = BFLOAT_FAILED_UNPACKED; return; @@ -782,8 +741,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * of the key: we handle this later: */ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), - min_t(unsigned, bits, b->nr_key_bits) - 1); - exponent = high_bit - (bits - 1); + min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); + exponent = high_bit - (BKEY_MANTISSA_BITS - 1); /* * Then we calculate the actual shift value, from the start of the key @@ -792,12 +751,12 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - EBUG_ON(shift + bits > b->format.key_u64s * 64); + EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); #else shift = high_bit_offset + b->nr_key_bits - exponent - - bits; + BKEY_MANTISSA_BITS; EBUG_ON(shift < KEY_PACKED_BITS_START); #endif @@ -813,37 +772,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (exponent < 0) mantissa |= ~(~0U << -exponent); - bfloat_mantissa_set(f, j, mantissa); - - /* - * The bfloat must be able to tell its key apart from the previous key - - * if its key and the previous key don't differ in the required bits, - * flag as failed - unless the keys are actually equal, in which case - * we aren't required to return a specific one: - */ - if (exponent > 0 && - bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && - bkey_cmp_packed(b, p, m)) { - f->exponent = BFLOAT_FAILED_PREV; - return; - } - - /* - * f->mantissa must compare >= the original key - for transitivity with - * the comparison in bset_search_tree. If we're dropping set bits, - * increment it: - */ - if (exponent > (int) bch2_bkey_ffs(b, m)) { - if (j < BFLOAT_32BIT_NR - ? 
f->mantissa32 == U32_MAX - : f->mantissa16 == U16_MAX) - f->exponent = BFLOAT_FAILED_OVERFLOW; - - if (j < BFLOAT_32BIT_NR) - f->mantissa32++; - else - f->mantissa16++; - } + f->mantissa = mantissa; } /* bytes remaining - only valid for last bset: */ @@ -856,14 +785,8 @@ static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) { - unsigned bytes = __bset_tree_capacity(b, t); - - if (bytes < 7 * BFLOAT_32BIT_NR) - return bytes / 7; - - bytes -= 7 * BFLOAT_32BIT_NR; - - return BFLOAT_32BIT_NR + bytes / 5; + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); } static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) @@ -1333,14 +1256,38 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, return rw_aux_to_bkey(b, t, l); } -noinline -static int bset_search_tree_slowpath(const struct btree *b, - struct bset_tree *t, struct bpos *search, - const struct bkey_packed *packed_search, - unsigned n) +static inline void prefetch_four_cachelines(void *p) { - return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), - packed_search, search) < 0; +#ifdef CONFIG_X86_64 + asm(".intel_syntax noprefix;" + "prefetcht0 [%0 - 127 + 64 * 0];" + "prefetcht0 [%0 - 127 + 64 * 1];" + "prefetcht0 [%0 - 127 + 64 * 2];" + "prefetcht0 [%0 - 127 + 64 * 3];" + ".att_syntax prefix;" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif +} + +static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + const struct bkey_float *f, + unsigned idx) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; + + return f->exponent > key_bits_start; +#else + unsigned key_bits_end = high_bit_offset + b->nr_key_bits; + + return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; +#endif } __flatten @@ -1350,44 +1297,37 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f = bkey_float_get(base, 1); - void *p; - unsigned inorder, n = 1; + struct bkey_float *f; + struct bkey_packed *k; + unsigned inorder, n = 1, l, r; + int cmp; - while (1) { - if (likely(n << 4 < t->size)) { - p = bkey_float_get(base, n << 4); - prefetch(p); - } else if (n << 3 < t->size) { - inorder = __eytzinger1_to_inorder(n, t->size, t->extra); - p = bset_cacheline(b, t, inorder); -#ifdef CONFIG_X86_64 - asm(".intel_syntax noprefix;" - "prefetcht0 [%0 - 127 + 64 * 0];" - "prefetcht0 [%0 - 127 + 64 * 1];" - "prefetcht0 [%0 - 127 + 64 * 2];" - "prefetcht0 [%0 - 127 + 64 * 3];" - ".att_syntax prefix;" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif - } else if (n >= t->size) - break; + do { + if (likely(n << 4 < t->size)) + prefetch(&base->f[n << 4]); - f = bkey_float_get(base, n); + f = &base->f[n]; - if (packed_search && - likely(f->exponent < BFLOAT_FAILED)) - n = n * 2 + (bfloat_mantissa(f, n) < - bkey_mantissa(packed_search, f, n)); - else - n = n * 2 + bset_search_tree_slowpath(b, t, - search, packed_search, n); + if (!unlikely(packed_search)) + goto slowpath; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + + l = f->mantissa; + r = 
bkey_mantissa(packed_search, f, n); + + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + goto slowpath; + + n = n * 2 + (l < r); + continue; +slowpath: + k = tree_to_bkey(b, t, n); + cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); + if (!cmp) + return k; + + n = n * 2 + (cmp < 0); } while (n < t->size); inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); @@ -1396,29 +1336,23 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, * n would have been the node we recursed to - the low bit tells us if * we recursed left or recursed right. */ - if (n & 1) { - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else { - if (--inorder) { - n = eytzinger1_prev(n >> 1, t->size); - f = bkey_float_get(base, n); - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else + if (likely(!(n & 1))) { + --inorder; + if (unlikely(!inorder)) return btree_bkey_first(b, t); + + f = &base->f[eytzinger1_prev(n >> 1, t->size)]; } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); } -/* - * Returns the first key greater than or equal to @search - */ -__always_inline __flatten -static struct bkey_packed *bch2_bset_search(struct btree *b, +static __always_inline __flatten +struct bkey_packed *__bch2_bset_search(struct btree *b, struct bset_tree *t, struct bpos *search, - struct bkey_packed *packed_search, const struct bkey_packed *lossy_packed_search) { - struct bkey_packed *m; /* * First, we search for a cacheline, then lastly we do a linear search @@ -1437,11 +1371,9 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, switch (bset_aux_tree_type(t)) { case BSET_NO_AUX_TREE: - m = btree_bkey_first(b, t); - break; + return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - m = bset_search_write_set(b, t, search, lossy_packed_search); - break; + return bset_search_write_set(b, t, search, lossy_packed_search); case BSET_RO_AUX_TREE: /* * Each node in the auxiliary search tree covers a certain range @@ -1453,10 +1385,20 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, if (bkey_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); - m = bset_search_tree(b, t, search, lossy_packed_search); - break; + return bset_search_tree(b, t, search, lossy_packed_search); + default: + unreachable(); } +} +static __always_inline __flatten +struct bkey_packed *bch2_bset_search_linear(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + struct bkey_packed *m) +{ if (lossy_packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, @@ -1479,6 +1421,23 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, return m; } +/* + * Returns the first key greater than or equal to @search + */ +static __always_inline __flatten +struct bkey_packed *bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search) +{ + struct bkey_packed *m = __bch2_bset_search(b, t, search, + lossy_packed_search); + + return bch2_bset_search_linear(b, t, search, + packed_search, lossy_packed_search, m); +} + /* Btree node iterator */ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, @@ -1569,9 +1528,10 @@ __flatten void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; struct 
btree_node_iter_set *pos = iter->data; + struct bkey_packed *k[MAX_BSETS]; + unsigned i; EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); @@ -1590,14 +1550,20 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, return; } - for_each_bset(b, t) { - struct bkey_packed *k = bch2_bset_search(b, t, search, - packed_search, &p); + for (i = 0; i < b->nsets; i++) { + k[i] = __bch2_bset_search(b, b->set + i, search, &p); + prefetch_four_cachelines(k[i]); + } + + for (i = 0; i < b->nsets; i++) { + struct bset_tree *t = b->set + i; struct bkey_packed *end = btree_bkey_last(b, t); - if (k != end) + k[i] = bch2_bset_search_linear(b, t, search, + packed_search, &p, k[i]); + if (k[i] != end) *pos++ = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, k[i]), __btree_node_key_to_offset(b, end) }; } @@ -1794,17 +1760,9 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) stats->floats += t->size - 1; for (j = 1; j < t->size; j++) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - stats->failed_unpacked++; - break; - case BFLOAT_FAILED_PREV: - stats->failed_prev++; - break; - case BFLOAT_FAILED_OVERFLOW: - stats->failed_overflow++; - break; - } + stats->failed += + bkey_float(b, t, j)->exponent == + BFLOAT_FAILED; } } } @@ -1813,9 +1771,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, struct bkey_packed *k) { struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey_packed *l, *r, *p; - struct bkey uk, up; - char buf1[200], buf2[200]; + struct bkey uk; unsigned j, inorder; if (out->pos != out->end) @@ -1833,7 +1789,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, return; switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: + case BFLOAT_FAILED: uk = bkey_unpack_key(b, k); pr_buf(out, " failed unpacked at depth %u\n" @@ -1841,41 +1797,5 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ilog2(j), uk.p.inode, uk.p.offset); break; - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? 
bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - pr_buf(out, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - break; - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - pr_buf(out, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - break; } } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 643bd9e8..ccc0866d 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -582,9 +582,7 @@ struct bset_stats { } sets[BSET_TREE_NR_TYPES]; size_t floats; - size_t failed_unpacked; - size_t failed_prev; - size_t failed_overflow; + size_t failed; }; void bch2_btree_keys_stats(struct btree *, struct bset_stats *); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 41694951..5d3acba5 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -909,9 +909,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" - " failed unpacked %zu\n" - " failed prev %zu\n" - " failed overflow %zu\n", + " failed unpacked %zu\n", f->key_u64s, f->bits_per_field[0], f->bits_per_field[1], @@ -928,7 +926,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, - stats.failed_unpacked, - stats.failed_prev, - stats.failed_overflow); + stats.failed); } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 5d4a2cb8..a4180124 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1096,7 +1096,12 @@ static int btree_iter_traverse_one(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + /* + * if we need interior nodes locked, call btree_iter_relock() to make + * sure we walk back up enough that we lock them: + */ + if (iter->uptodate == BTREE_ITER_NEED_RELOCK || + iter->locks_want > 1) bch2_btree_iter_relock(iter, false); if (iter->uptodate < BTREE_ITER_NEED_RELOCK) diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index c4183982..8d223aa2 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1464,7 +1464,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; - unsigned old; + u16 *dst_sectors; bool overflow; int ret; @@ -1519,22 +1519,24 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; } - if (!p.ptr.cached) { - old = u.dirty_sectors; - overflow = checked_add(u.dirty_sectors, sectors); - } else { - old = u.cached_sectors; - overflow = checked_add(u.cached_sectors, sectors); + dst_sectors = !p.ptr.cached + ? 
&u.dirty_sectors + : &u.cached_sectors; + + overflow = checked_add(*dst_sectors, sectors); + + if (overflow) { + bch2_fs_inconsistent(c, + "bucket sector count overflow: %u + %lli > U16_MAX", + *dst_sectors, sectors); + /* return an error indicating that we need full fsck */ + ret = -EIO; + goto out; } u.data_type = u.dirty_sectors || u.cached_sectors ? data_type : 0; - bch2_fs_inconsistent_on(overflow, c, - "bucket sector count overflow: %u + %lli > U16_MAX", - old, sectors); - BUG_ON(overflow); - a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); if (ret) diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 8ac6990c..f1826633 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -135,17 +135,16 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, return ret; } -void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) +void __bch2_increment_clock(struct io_clock *clock) { - struct io_clock *clock = &c->io_clock[rw]; struct io_timer *timer; unsigned long now; + unsigned sectors; /* Buffer up one megabyte worth of IO in the percpu counter */ preempt_disable(); - if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < - IO_CLOCK_PCPU_SECTORS)) { + if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { preempt_enable(); return; } diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h index 5cb043c5..bfbbca8a 100644 --- a/libbcachefs/clock.h +++ b/libbcachefs/clock.h @@ -6,7 +6,18 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, unsigned long); -void bch2_increment_clock(struct bch_fs *, unsigned, int); + +void __bch2_increment_clock(struct io_clock *); + +static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + + if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= + IO_CLOCK_PCPU_SECTORS)) + __bch2_increment_clock(clock); +} void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 304ff925..5a5cfee6 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -64,7 +64,7 @@ void bch2_io_error(struct bch_dev *ca) enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
{ - struct fsck_err_state *s; + struct fsck_err_state *s = NULL; va_list args; bool fix = false, print = true, suppressing = false; char _buf[sizeof(s->buf)], *buf = _buf; @@ -99,8 +99,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, found: list_move(&s->list, &c->fsck_errors); s->nr++; - suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; - print = s->nr <= FSCK_ERR_RATELIMIT_NR; + if (c->opts.ratelimit_errors && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } buf = s->buf; print: va_start(args, fmt); @@ -156,7 +161,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_lock(&c->fsck_error_lock); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->nr > FSCK_ERR_RATELIMIT_NR) + if (s->ratelimited) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); list_del(&s->list); diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 2591e123..7dcb0f65 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -114,6 +114,7 @@ struct fsck_err_state { struct list_head list; const char *fmt; u64 nr; + bool ratelimited; char buf[512]; }; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 4cc2a4b1..b9c69792 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1218,7 +1218,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, struct bkey_i whiteout = *insert; struct bkey_packed *_k; struct bkey unpacked; - BKEY_PADDED(k) tmp; EBUG_ON(iter->level); EBUG_ON(!insert->k.size); @@ -1292,25 +1291,23 @@ next: bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); if (update_btree) { - bkey_copy(&tmp.k, insert); - if (deleting) - tmp.k.k.type = KEY_TYPE_discard; + insert->k.type = KEY_TYPE_discard; - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - extent_bset_insert(c, iter, &tmp.k); + extent_bset_insert(c, iter, insert); } if (update_journal) { - bkey_copy(&tmp.k, !deleting ? insert : &whiteout); + struct bkey_i *k = !deleting ? 
insert : &whiteout; if (deleting) - tmp.k.k.type = KEY_TYPE_discard; + k->k.type = KEY_TYPE_discard; - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); + EBUG_ON(bkey_deleted(&k->k) || !k->k.size); - bch2_btree_journal_key(trans, iter, &tmp.k); + bch2_btree_journal_key(trans, iter, k); } bch2_cut_front(insert->k.p, insert); @@ -1390,16 +1387,18 @@ static unsigned bch2_crc_field_size_max[] = { }; static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src) + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { #define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ _dst.csum_type = _src.csum_type, \ _dst.compression_type = _src.compression_type, \ _dst._compressed_size = _src.compressed_size - 1, \ _dst._uncompressed_size = _src.uncompressed_size - 1, \ _dst.offset = _src.offset - switch (extent_entry_type(to_entry(dst))) { + switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); dst->crc32.csum = *((__le32 *) &src.csum.lo); @@ -1426,23 +1425,24 @@ void bch2_extent_crc_append(struct bkey_i *k, { struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; if (bch_crc_bytes[new.csum_type] <= 4 && new.uncompressed_size - 1 <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc32; + type = BCH_EXTENT_ENTRY_crc32; else if (bch_crc_bytes[new.csum_type] <= 10 && new.uncompressed_size - 1 <= CRC64_SIZE_MAX && new.nonce <= CRC64_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc64; + type = BCH_EXTENT_ENTRY_crc64; else if (bch_crc_bytes[new.csum_type] <= 16 && new.uncompressed_size - 1 <= CRC128_SIZE_MAX && new.nonce <= CRC128_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc128; + type = BCH_EXTENT_ENTRY_crc128; else BUG(); - bch2_extent_crc_pack(crc, new); + bch2_extent_crc_pack(crc, new, type); k->k.u64s += extent_entry_u64s(ptrs.end); @@ -1645,7 +1645,8 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, crc_l.uncompressed_size += crc_r.uncompressed_size; crc_l.compressed_size += crc_r.compressed_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); } bch2_key_resize(l.k, l.k->size + r.k->size); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 90a9bfa4..fd6eb00e 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -507,12 +507,25 @@ static void bch2_set_page_dirty(struct bch_fs *c, __set_page_dirty_nobuffers(page); } +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + int ret; + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + ret = filemap_fault(vmf); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + return ret; +} + vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct file *file = vmf->vma->vm_file; struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = inode->v.i_mapping; + struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_page_reservation res; unsigned len; @@ -530,8 +543,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) * a write_invalidate_inode_pages_range() that works without dropping * page lock before invalidating page */ - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_get(&mapping->add_lock); + 
bch2_pagecache_add_get(&inode->ei_pagecache_lock); lock_page(page); isize = i_size_read(&inode->v); @@ -551,14 +563,13 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) } bch2_set_page_dirty(c, inode, page, &res, 0, len); + bch2_page_reservation_put(c, inode, &res); + wait_for_stable_page(page); out: - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); - bch2_page_reservation_put(c, inode, &res); - return ret; } @@ -888,8 +899,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); while ((page = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; @@ -912,8 +922,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, &readpages_iter); } - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); bch2_trans_exit(&trans); kfree(readpages_iter.pages); @@ -1294,8 +1303,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_page_reservation_init(c, inode, res); *fsdata = res; - /* Not strictly necessary - same reason as mkwrite(): */ - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -1347,7 +1355,7 @@ err: put_page(page); *pagep = NULL; err_unlock: - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); kfree(res); *fsdata = NULL; return ret; @@ -1391,7 +1399,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); bch2_page_reservation_put(c, inode, res); kfree(res); @@ -1549,7 +1557,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t written = 0; int ret = 0; - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); do { unsigned offset = pos & (PAGE_SIZE - 1); @@ -1606,7 +1614,7 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); return written ? 
written : ret; } @@ -1730,6 +1738,43 @@ start: } } +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + } + + return ret; +} + /* O_DIRECT writes */ static long bch2_dio_write_loop(struct dio_write *dio) @@ -1744,34 +1789,23 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio_vec *bv; unsigned unaligned; u64 new_i_size; - loff_t offset; bool sync; long ret; if (dio->loop) goto loop; - /* Write and invalidate pagecache range that we're writing to: */ - offset = req->ki_pos + (dio->op.written << 9); - ret = write_invalidate_inode_pages_range(mapping, - offset, - offset + iov_iter_count(&dio->iter) - 1); - if (unlikely(ret)) - goto err; - while (1) { - offset = req->ki_pos + (dio->op.written << 9); - - BUG_ON(current->pagecache_lock); - current->pagecache_lock = &mapping->add_lock; if (kthread) use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); + current->faults_disabled_mapping = NULL; if (kthread) unuse_mm(dio->mm); - current->pagecache_lock = NULL; if (unlikely(ret < 0)) goto err; @@ -1791,14 +1825,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto err; } - /* gup might have faulted pages back in: */ - ret = write_invalidate_inode_pages_range(mapping, - offset, - offset + bio->bi_iter.bi_size - 1); - if (unlikely(ret)) - goto err; - - dio->op.pos = POS(inode->v.i_ino, offset >> 9); + dio->op.pos = POS(inode->v.i_ino, + (req->ki_pos >> 9) + dio->op.written); task_io_account_write(bio->bi_iter.bi_size); @@ -1850,7 +1878,7 @@ loop: ret = dio->op.error ?: ((long) dio->op.written << 9); err: - __pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); @@ -1916,7 +1944,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) goto err; inode_dio_begin(&inode->v); - __pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); extending = req->ki_pos + iter->count > inode->v.i_size; if (!extending) { @@ -1964,6 +1992,12 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->op.opts.data_replicas)) goto err_put_bio; + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + ret = bch2_dio_write_loop(dio); err: if (locked) @@ -1972,7 +2006,7 @@ err: req->ki_pos += ret; return ret; err_put_bio: - __pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); @@ -1980,21 +2014,6 @@ 
err_put_bio: goto err; } -ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) -{ - struct blk_plug plug; - ssize_t ret; - - if (iov_iter_rw(iter) == WRITE) - return -EINVAL; - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(req, iter); - blk_finish_plug(&plug); - - return ret; -} - ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -2236,7 +2255,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) int ret = 0; inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); /* * fetch current on disk i_size: inode is locked, i_size can only @@ -2307,7 +2326,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); err: - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); return ret; } @@ -2316,14 +2335,13 @@ err: static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; u64 discard_start = round_up(offset, block_bytes(c)) >> 9; u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; int ret = 0; inode_lock(&inode->v); inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); ret = __bch2_truncate_page(inode, offset >> PAGE_SHIFT, @@ -2352,7 +2370,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len i_sectors_acct(c, inode, NULL, i_sectors_delta); } err: - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; @@ -2383,7 +2401,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, */ inode_lock(&inode->v); inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); if (insert) { ret = -EFBIG; @@ -2570,7 +2588,7 @@ bkey_err: } err: bch2_trans_exit(&trans); - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; } @@ -2594,7 +2612,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, inode_lock(&inode->v); inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); @@ -2737,7 +2755,7 @@ bkey_err: } err: bch2_trans_exit(&trans); - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; } @@ -2813,8 +2831,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; s64 i_sectors_delta = 0; + u64 aligned_len; loff_t ret = 0; - loff_t aligned_len; if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; @@ -2830,26 +2848,23 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, abs(pos_src - pos_dst) < len) return -EINVAL; - bch2_lock_inodes(INODE_LOCK, src, dst); + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); file_update_time(file_dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); - __pagecache_block_get(&src->v.i_mapping->add_lock); - 
__pagecache_block_get(&dst->v.i_mapping->add_lock); - ret = generic_remap_file_range_prep(file_src, pos_src, file_dst, pos_dst, &len, remap_flags); if (ret < 0 || len == 0) goto err; - aligned_len = round_up(len, block_bytes(c)); + aligned_len = round_up((u64) len, block_bytes(c)); ret = write_invalidate_inode_pages_range(dst->v.i_mapping, - pos_dst, pos_dst + aligned_len); + pos_dst, pos_dst + len - 1); if (ret) goto err; @@ -2864,24 +2879,20 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret < 0) goto err; - ret <<= 9; /* * due to alignment, we might have remapped slightly more than requsted */ - ret = min(ret, len); + ret = min((u64) ret << 9, (u64) len); /* XXX get a quota reservation */ i_sectors_acct(c, dst, NULL, i_sectors_delta); spin_lock(&dst->v.i_lock); - if (pos_dst + len > dst->v.i_size) - i_size_write(&dst->v, pos_dst + len); + if (pos_dst + ret > dst->v.i_size) + i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); err: - __pagecache_block_put(&dst->v.i_mapping->add_lock); - __pagecache_block_put(&src->v.i_mapping->add_lock); - - bch2_unlock_inodes(INODE_LOCK, src, dst); + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); return ret; } diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index ae171a29..7063556d 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -27,8 +27,7 @@ int bch2_write_begin(struct file *, struct address_space *, loff_t, int bch2_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); -ssize_t bch2_direct_IO(struct kiocb *, struct iov_iter *); - +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); int bch2_fsync(struct file *, loff_t, loff_t, int); @@ -41,6 +40,7 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t bch2_llseek(struct file *, loff_t, int); +vm_fault_t bch2_page_fault(struct vm_fault *); vm_fault_t bch2_page_mkwrite(struct vm_fault *); void bch2_invalidatepage(struct page *, unsigned int, unsigned int); int bch2_releasepage(struct page *, gfp_t); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index f9b3650b..cd3540d0 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -49,6 +49,53 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } +static void __pagecache_lock_put(struct pagecache_lock *lock, long i) +{ + BUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) +{ + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? 
v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +static void __pagecache_lock_get(struct pagecache_lock *lock, long i) +{ + wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); +} + +void bch2_pagecache_add_put(struct pagecache_lock *lock) +{ + __pagecache_lock_put(lock, 1); +} + +void bch2_pagecache_add_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, 1); +} + +void bch2_pagecache_block_put(struct pagecache_lock *lock) +{ + __pagecache_lock_put(lock, -1); +} + +void bch2_pagecache_block_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, -1); +} + void bch2_inode_update_after_write(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -706,10 +753,15 @@ static int bch2_getattr(const struct path *path, struct kstat *stat, if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; + if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; + if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; return 0; } @@ -872,7 +924,7 @@ retry: } static const struct vm_operations_struct bch_vm_ops = { - .fault = filemap_fault, + .fault = bch2_page_fault, .map_pages = filemap_map_pages, .page_mkwrite = bch2_page_mkwrite, }; @@ -906,7 +958,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, - .read_iter = generic_file_read_iter, + .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, .open = generic_file_open, @@ -994,7 +1046,7 @@ static const struct address_space_operations bch_address_space_operations = { .write_end = bch2_write_end, .invalidatepage = bch2_invalidatepage, .releasepage = bch2_releasepage, - .direct_IO = bch2_direct_IO, + .direct_IO = noop_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = bch2_migrate_page, #endif @@ -1090,6 +1142,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); + pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); inode->ei_journal_seq = 0; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 40605666..eda903a4 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -10,6 +10,26 @@ #include #include +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +struct pagecache_lock { + atomic_long_t v; + wait_queue_head_t wait; +}; + +static inline void pagecache_lock_init(struct pagecache_lock *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +void bch2_pagecache_add_put(struct pagecache_lock *); +void bch2_pagecache_add_get(struct pagecache_lock *); +void bch2_pagecache_block_put(struct pagecache_lock *); +void bch2_pagecache_block_get(struct pagecache_lock *); + struct bch_inode_info { struct inode v; @@ -18,6 +38,8 @@ struct bch_inode_info { u64 ei_quota_reserved; unsigned long ei_last_dirtied; + struct pagecache_lock ei_pagecache_lock; + struct mutex ei_quota_lock; struct bch_qid ei_qid; @@ -37,7 +59,8 @@ static inline int ptrcmp(void *l, void *r) enum bch_inode_lock_op { INODE_LOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), 
+ INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), }; #define bch2_lock_inodes(_locks, ...) \ @@ -49,9 +72,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ } while (0) @@ -65,9 +90,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 3cced2b9..0f2308e5 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -797,7 +797,7 @@ create_lostfound: bch2_create_trans(&trans, BCACHEFS_ROOT_INO, root_inode, lostfound_inode, &lostfound, - 0, 0, S_IFDIR|0755, 0, NULL, NULL)); + 0, 0, S_IFDIR|0700, 0, NULL, NULL)); if (ret) bch_err(c, "error creating lost+found: %i", ret); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 836004b1..e3ef662e 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1270,7 +1270,6 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) closure_return_with_destructor(cl, promote_done); } -noinline static struct promote_op *__promote_alloc(struct bch_fs *c, enum btree_id btree_id, struct bpos pos, @@ -1344,7 +1343,8 @@ err: return NULL; } -static inline struct promote_op *promote_alloc(struct bch_fs *c, +noinline +static struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, @@ -1908,7 +1908,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (pick.crc.compression_type != BCH_COMPRESSION_NONE || (pick.crc.csum_type != BCH_CSUM_NONE && @@ -1920,8 +1920,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bounce = true; } - promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); + if (orig->opts.promote_target) + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { EBUG_ON(pick.crc.compression_type); @@ -1949,7 +1950,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * data in the write path, but we're not going to use it all * here: */ - BUG_ON(rbio->bio.bi_iter.bi_size < + EBUG_ON(rbio->bio.bi_iter.bi_size < pick.crc.compressed_size << 9); rbio->bio.bi_iter.bi_size = pick.crc.compressed_size << 9; @@ -1982,10 +1983,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, noclone: rbio = orig; rbio->bio.bi_iter = iter; - BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; rbio->submit_time = local_clock(); @@ -2001,6 +2002,7 @@ noclone: rbio->hole = 
0; rbio->retry = 0; rbio->context = 0; + /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->pos = pos; @@ -2017,11 +2019,11 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - percpu_down_read(&c->mark_lock); + rcu_read_lock(); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); - if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); trace_read_split(&orig->bio); } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index bd2058f1..0ec0999a 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -68,6 +68,12 @@ enum opt_type { * - helptext */ +#ifdef __KERNEL__ +#define RATELIMIT_ERRORS true +#else +#define RATELIMIT_ERRORS false +#endif + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FORMAT, \ @@ -227,6 +233,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, RATELIMIT_ERRORS, \ + NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 6d45ae24..6e71c5e8 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -290,10 +290,12 @@ err: ret2 = PTR_ERR_OR_ZERO(inode_iter); if (!ret2 && - inode_u.bi_size < new_i_size) + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, journal_seq, BTREE_INSERT_ATOMIC); + } } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret;
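
A few notes on the mechanics above, with minimal sketches in plain C; these
are illustrative reconstructions, not the kernel code.

The bset.c rework drops the variable 32/16-bit bkey_float mantissa, along
with the BFLOAT_FAILED_PREV and BFLOAT_FAILED_OVERFLOW states that were
precomputed at tree-build time: every node now carries a fixed 16-bit
mantissa, and the one remaining ambiguity - two mantissas comparing equal
after low bits were dropped - is detected at search time by
bkey_mantissa_bits_dropped() and punted to a full key comparison in the
slowpath. The descent runs over an eytzinger (breadth-first, 1-indexed)
array, which is what makes the n << 4 prefetch cheap: the sixteen
descendants four levels below a node are contiguous in memory. A sketch of
that layout and descent (names illustrative):

#include <stdint.h>

struct bfloat {
	uint8_t		exponent;
	uint8_t		key_offset;
	uint16_t	mantissa;
};

static unsigned eytzinger_descend(const struct bfloat *base, unsigned size,
				  uint16_t search)
{
	unsigned n = 1;

	do {
		/* children of n live at 2n and 2n + 1, so the sixteen
		 * descendants four levels down start at n << 4: */
		if (n << 4 < size)
			__builtin_prefetch(&base[n << 4]);

		/* go right (2n + 1) when this node's key < search: */
		n = n * 2 + (base[n].mantissa < search);
	} while (n < size);

	/* the low bit of n records whether the last step went right */
	return n;
}

Splitting bch2_bset_search() into __bch2_bset_search() plus
bch2_bset_search_linear() serves the same goal: bch2_btree_node_iter_init()
can now locate the candidate cacheline in every bset first, prefetch all of
them, and only then run the linear scans.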
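
In buckets.c, a bucket sector-count overflow stops being fatal: instead of
bch2_fs_inconsistent_on() followed by BUG_ON(), bch2_trans_mark_pointer()
now logs the inconsistency and returns -EIO so that repair is deferred to a
full fsck. Taking the destination counter by pointer up front (dst_sectors)
lets the add and the error path be written once for both the dirty and
cached cases. One plausible shape for the checked add itself - the real
checked_add() macro is defined elsewhere in the tree, so this is an
assumption:

#include <stdbool.h>
#include <stdint.h>

/* store the truncated sum, report whether it over- or underflowed */
static bool checked_add_u16(uint16_t *dst, int64_t sectors)
{
	int64_t v = (int64_t) *dst + sectors;

	*dst = (uint16_t) v;
	return v < 0 || v > UINT16_MAX;
}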
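
The clock.c/clock.h split turns bch2_increment_clock() into an inline fast
path plus an out-of-line slowpath: the percpu add and the threshold check
now happen at the call site, and __bch2_increment_clock() is only entered
once roughly a megabyte of IO has accumulated on that CPU. The shape of the
pattern, with a thread-local counter standing in for the percpu buffer
(illustrative):

#include <stdatomic.h>

#define IO_CLOCK_PCPU_SECTORS	2048	/* ~1MB in 512-byte sectors */

static _Thread_local unsigned pcpu_buf;
static atomic_ulong io_clock_now;

/* out of line: flush the local buffer, fire any timers that came due */
static void increment_clock_slowpath(void)
{
	atomic_fetch_add(&io_clock_now, pcpu_buf);
	pcpu_buf = 0;
}

/* inline fast path: a local add and one compare in the common case */
static inline void increment_clock(unsigned sectors)
{
	pcpu_buf += sectors;
	if (pcpu_buf >= IO_CLOCK_PCPU_SECTORS)
		increment_clock_slowpath();
}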
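
error.c gains opt-in ratelimiting of fsck errors, controlled by the new
ratelimit_errors mount option (defaulting to on in the kernel, off in the
userspace tools, per the RATELIMIT_ERRORS define in opts.h). The first
FSCK_ERR_RATELIMIT_NR occurrences of a given error print normally, the
occurrence that hits the limit also prints a suppression notice, and later
ones are only counted, with bch2_flush_fsck_errs() printing the totals for
every state flagged as ratelimited. Condensed, with an illustrative
threshold value:

#include <stdbool.h>

#define FSCK_ERR_RATELIMIT_NR	10	/* illustrative */

struct fsck_err_state { unsigned long long nr; bool ratelimited; };

/* returns true if this occurrence of the error should be printed */
static bool fsck_err_should_print(struct fsck_err_state *s, bool ratelimit)
{
	s->nr++;

	if (!ratelimit || s->nr < FSCK_ERR_RATELIMIT_NR)
		return true;

	if (s->nr == FSCK_ERR_RATELIMIT_NR) {
		s->ratelimited = true;	/* also emit "Suppressing..." once */
		return true;
	}

	return false;	/* counted; the summary prints at flush time */
}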
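
The largest structural change is replacing the old mapping->add_lock with a
filesystem-private struct pagecache_lock embedded in the inode (fs.h): a
two-state shared lock in which any number of "add" holders (page faults,
reads, buffered writes - anything that populates the page cache) or any
number of "block" holders (truncate, fpunch, fcollapse/finsert, fallocate,
dio writes, remap) may run concurrently, but the two states exclude each
other. The kernel version open-codes this on an atomic_long plus a
waitqueue; the same semantics in a self-contained userspace sketch:

#include <pthread.h>

struct two_state_lock {
	pthread_mutex_t	mtx;
	pthread_cond_t	wait;
	long		v;	/* > 0: held for "add", < 0: held for "block" */
};

#define TWO_STATE_LOCK_INIT \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

static void two_state_get(struct two_state_lock *l, long i)
{
	pthread_mutex_lock(&l->mtx);
	/* an "add" getter (i = 1) waits out "block" holders (v < 0), a
	 * "block" getter (i = -1) waits out "add" holders (v > 0);
	 * getters of the state already held just stack on top: */
	while (i > 0 ? l->v < 0 : l->v > 0)
		pthread_cond_wait(&l->wait, &l->mtx);
	l->v += i;
	pthread_mutex_unlock(&l->mtx);
}

static void two_state_put(struct two_state_lock *l, long i)
{
	pthread_mutex_lock(&l->mtx);
	l->v -= i;
	if (!l->v)	/* last holder of either state wakes the other side */
		pthread_cond_broadcast(&l->wait);
	pthread_mutex_unlock(&l->mtx);
}

bch2_pagecache_add_get()/put() correspond to i = 1 and
bch2_pagecache_block_get()/put() to i = -1. Because the dio write path now
holds the "block" side for the whole operation, the pagecache invalidation
that used to run every loop iteration moves to a single
write_invalidate_inode_pages_range() call up front in bch2_direct_write().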