Update bcachefs sources to 83338f5b2cb8 bcachefs: fix for building in userspace

Kent Overstreet 2024-03-16 19:29:22 -04:00
parent f1e87c66af
commit abfdc593a5
74 changed files with 1807 additions and 1273 deletions

View File

@ -1 +1 @@
26494335d114f7813a7fc499bbacb4a74d613b6f
83338f5b2cb8406cda8bf7be3f566ab97c696917

View File

@ -274,20 +274,6 @@ update-bcachefs-sources:
git add include/linux/kmemleak.h
cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
git add linux/int_sqrt.c
cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
git add linux/mean_and_variance.c
cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
git add include/linux/mean_and_variance.h
cp $(LINUX_DIR)/lib/time_stats.c linux/
git add linux/time_stats.c
cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/
git add include/linux/time_stats.h
cp $(LINUX_DIR)/include/linux/darray.h include/linux/
git add include/linux/darray.h
cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/
git add include/linux/darray_types.h
cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/
git add include/linux/eytzinger.h
cp $(LINUX_DIR)/scripts/Makefile.compiler ./
git add Makefile.compiler
$(RM) libbcachefs/*.mod.c

View File

@ -28,7 +28,7 @@
#include "libbcachefs/super-io.h"
#include "libbcachefs/util.h"
#include "linux/darray.h"
#include "libbcachefs/darray.h"
#define OPTS \
x(0, replicas, required_argument) \

View File

@ -15,7 +15,7 @@
#include "cmds.h"
#include "libbcachefs.h"
#include "linux/darray.h"
#include "libbcachefs/darray.h"
static void __dev_usage_type_to_text(struct printbuf *out,
enum bch_data_type type,

View File

@ -128,7 +128,7 @@ static void journal_entries_print(struct bch_fs *c, unsigned nr_entries,
if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq))
continue;
bool blacklisted = p->ignore ||
bool blacklisted = p->ignore_blacklisted ||
bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(p->j.seq), false);

View File

@ -20,7 +20,7 @@
#include <linux/uuid.h>
#include "libbcachefs/bcachefs.h"
#include "libbcachefs/bbpos.h"
#include "linux/darray.h"
#include "libbcachefs/darray.h"
#define noreturn __attribute__((noreturn))

View File

@ -1,22 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#ifndef _LINUX_DARRAY_TYPES_H
#define _LINUX_DARRAY_TYPES_H
#include <linux/types.h>
#define DARRAY_PREALLOCATED(_type, _nr) \
struct { \
size_t nr, size; \
_type *data; \
_type preallocated[_nr]; \
}
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
#endif /* _LINUX_DARRAY_TYPES_H */

View File

@ -5,7 +5,7 @@
* DOC: Generic radix trees/sparse arrays
*
* Very simple and minimalistic, supporting arbitrary size entries up to
* PAGE_SIZE.
* GENRADIX_NODE_SIZE.
*
* A genradix is defined with the type it will store, like so:
*
@ -45,12 +45,15 @@
struct genradix_root;
#define GENRADIX_NODE_SHIFT 9
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
struct __genradix {
struct genradix_root *root;
};
/*
* NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
#define __GENRADIX_INITIALIZER \
@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *);
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
if (__builtin_constant_p(obj_size))
BUILD_BUG_ON(obj_size > PAGE_SIZE);
BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE);
else
BUG_ON(obj_size > PAGE_SIZE);
BUG_ON(obj_size > GENRADIX_NODE_SIZE);
if (!is_power_of_2(obj_size)) {
size_t objs_per_page = PAGE_SIZE / obj_size;
size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size;
return (idx / objs_per_page) * PAGE_SIZE +
return (idx / objs_per_page) * GENRADIX_NODE_SIZE +
(idx % objs_per_page) * obj_size;
} else {
return idx * obj_size;
@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
#define __genradix_objs_per_page(_radix) \
(PAGE_SIZE / sizeof((_radix)->type[0]))
(GENRADIX_NODE_SIZE / sizeof((_radix)->type[0]))
#define __genradix_page_remainder(_radix) \
(PAGE_SIZE % sizeof((_radix)->type[0]))
(GENRADIX_NODE_SIZE % sizeof((_radix)->type[0]))
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
iter->offset += obj_size;
if (!is_power_of_2(obj_size) &&
(iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
iter->offset = round_up(iter->offset, PAGE_SIZE);
(iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE)
iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE);
iter->pos++;
}
@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter,
return;
}
if ((iter->offset & (PAGE_SIZE - 1)) == 0)
iter->offset -= PAGE_SIZE % obj_size;
if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0)
iter->offset -= GENRADIX_NODE_SIZE % obj_size;
iter->offset -= obj_size;
iter->pos--;
@ -263,7 +266,7 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter,
genradix_for_each_from(_radix, _iter, _p, 0)
#define genradix_last_pos(_radix) \
(SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
(SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1)
/**
* genradix_for_each_reverse - iterate over entry in a genradix, reverse order

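The hunk above replaces PAGE_SIZE with GENRADIX_NODE_SIZE in the offset arithmetic. A standalone sketch of that calculation for a non-power-of-2 object size, assuming GENRADIX_NODE_SIZE = 512 (1 << 9) as defined in this header; illustrative only, not the kernel code itself:

#include <stdio.h>
#include <stddef.h>

#define GENRADIX_NODE_SIZE	512u

/* Mirrors the non-power-of-2 branch of __idx_to_offset(): objects never
 * straddle a node, so the trailing bytes of each node are left unused. */
static size_t idx_to_offset(size_t idx, size_t obj_size)
{
	size_t objs_per_node = GENRADIX_NODE_SIZE / obj_size;

	return (idx / objs_per_node) * GENRADIX_NODE_SIZE +
	       (idx % objs_per_node) * obj_size;
}

int main(void)
{
	/* 24-byte objects: 21 per 512-byte node, 8 bytes of padding per node */
	printf("%zu\n", idx_to_offset(50, 24));	/* (50/21)*512 + (50%21)*24 = 1216 */
	return 0;
}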
View File

@ -1052,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
if (k.k->type != discard_key_type &&
(c->opts.reconstruct_alloc ||
fsck_err(c, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
if (fsck_err_on(k.k->type != discard_key_type,
c, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@ -1083,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
if (k.k->type != freespace_key_type &&
(c->opts.reconstruct_alloc ||
fsck_err(c, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
if (fsck_err_on(k.k->type != freespace_key_type,
c, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@ -1115,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
if (a->gen != alloc_gen(k, gens_offset) &&
(c->opts.reconstruct_alloc ||
fsck_err(c, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
c, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_bucket_gens *g =
bch2_trans_kmalloc(trans, sizeof(*g));
@ -1174,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
if (k.k->type != KEY_TYPE_set &&
(c->opts.reconstruct_alloc ||
fsck_err(c, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
end->offset))) {
if (fsck_err_on(k.k->type != KEY_TYPE_set,
c, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
end->offset)) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));

View File

@ -477,8 +477,7 @@ missing:
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
if (c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;

View File

@ -200,8 +200,6 @@
#include <linux/seqlock.h>
#include <linux/shrinker.h>
#include <linux/srcu.h>
#include <linux/thread_with_file_types.h>
#include <linux/time_stats.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
@ -214,6 +212,7 @@
#include "recovery_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "time_stats.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
@ -470,6 +469,7 @@ enum bch_time_stats {
#include "replicas_types.h"
#include "subvolume_types.h"
#include "super_types.h"
#include "thread_with_file_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@ -598,7 +598,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
struct time_stats_quantiles io_latency[2];
struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@ -645,8 +645,8 @@ struct btree_debug {
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
struct time_stats duration;
struct time_stats lock_hold_times;
struct bch2_time_stats duration;
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
unsigned journal_entries_size;
@ -1111,7 +1111,7 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
struct time_stats times[BCH_TIME_STAT_NR];
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];

View File

@ -4,7 +4,7 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
#include "bkey_types.h"
#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
const struct bkey_packed *);
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
const struct bch_val *v;
};
/* bkey with split value */
struct bkey_s {
union {
struct {
struct bkey *k;
struct bch_val *v;
};
struct bkey_s_c s_c;
};
};
#define bkey_p_next(_k) vstruct_next(_k)
static inline struct bkey_i *bkey_next(struct bkey_i *k)
{
return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
}
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
{
return bkey_val_u64s(k) * sizeof(u64);
}
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
unsigned u64s = BKEY_U64s + val_u64s;
BUG_ON(u64s > U8_MAX);
k->u64s = u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
}
#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
EBUG_ON(k->u64s < ret);
return ret;
return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
#define bkey_s_null ((struct bkey_s) { .k = NULL })
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
static inline struct bkey_s bkey_to_s(struct bkey *k)
{
return (struct bkey_s) { .k = k, .v = NULL };
}
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
{
return (struct bkey_s_c) { .k = k, .v = NULL };
}
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
{
return (struct bkey_s) { .k = &k->k, .v = &k->v };
}
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
{
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
}
/*
* For a given type of value (e.g. struct bch_extent), generates the types for
* bkey + bch_extent - inline, split, split const - and also all the conversion
* functions, which also check that the value is of the correct type.
*
* We use anonymous unions for upcasting - e.g. converting from e.g. a
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
* functions.
*/
#define x(name, ...) \
struct bkey_i_##name { \
union { \
struct bkey k; \
struct bkey_i k_i; \
}; \
struct bch_##name v; \
}; \
\
struct bkey_s_c_##name { \
union { \
struct { \
const struct bkey *k; \
const struct bch_##name *v; \
}; \
struct bkey_s_c s_c; \
}; \
}; \
\
struct bkey_s_##name { \
union { \
struct { \
struct bkey *k; \
struct bch_##name *v; \
}; \
struct bkey_s_c_##name c; \
struct bkey_s s; \
struct bkey_s_c s_c; \
}; \
}; \
\
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline const struct bkey_i_##name * \
bkey_i_to_##name##_c(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
{ \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_c_##name \
name##_i_to_s_c(const struct bkey_i_##name *k) \
{ \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name \
bkey_i_to_s_c_##name(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
{ \
struct bkey_i_##name *k = \
container_of(&_k->k, struct bkey_i_##name, k); \
\
bkey_init(&k->k); \
memset(&k->v, 0, sizeof(k->v)); \
k->k.type = KEY_TYPE_##name; \
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
\
return k; \
}
BCH_BKEY_TYPES();
#undef x
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

libbcachefs/bkey_types.h (new file, 213 lines)
View File

@ -0,0 +1,213 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_TYPES_H
#define _BCACHEFS_BKEY_TYPES_H
#include "bcachefs_format.h"
/*
* bkey_i - bkey with inline value
* bkey_s - bkey with split value
* bkey_s_c - bkey with split value, const
*/
#define bkey_p_next(_k) vstruct_next(_k)
static inline struct bkey_i *bkey_next(struct bkey_i *k)
{
return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
}
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
{
return bkey_val_u64s(k) * sizeof(u64);
}
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
unsigned u64s = BKEY_U64s + val_u64s;
BUG_ON(u64s > U8_MAX);
k->u64s = u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
}
#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
const struct bch_val *v;
};
/* bkey with split value */
struct bkey_s {
union {
struct {
struct bkey *k;
struct bch_val *v;
};
struct bkey_s_c s_c;
};
};
#define bkey_s_null ((struct bkey_s) { .k = NULL })
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
static inline struct bkey_s bkey_to_s(struct bkey *k)
{
return (struct bkey_s) { .k = k, .v = NULL };
}
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
{
return (struct bkey_s_c) { .k = k, .v = NULL };
}
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
{
return (struct bkey_s) { .k = &k->k, .v = &k->v };
}
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
{
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
}
/*
* For a given type of value (e.g. struct bch_extent), generates the types for
* bkey + bch_extent - inline, split, split const - and also all the conversion
* functions, which also check that the value is of the correct type.
*
* We use anonymous unions for upcasting - e.g. converting from e.g. a
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
* functions.
*/
#define x(name, ...) \
struct bkey_i_##name { \
union { \
struct bkey k; \
struct bkey_i k_i; \
}; \
struct bch_##name v; \
}; \
\
struct bkey_s_c_##name { \
union { \
struct { \
const struct bkey *k; \
const struct bch_##name *v; \
}; \
struct bkey_s_c s_c; \
}; \
}; \
\
struct bkey_s_##name { \
union { \
struct { \
struct bkey *k; \
struct bch_##name *v; \
}; \
struct bkey_s_c_##name c; \
struct bkey_s s; \
struct bkey_s_c s_c; \
}; \
}; \
\
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline const struct bkey_i_##name * \
bkey_i_to_##name##_c(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
{ \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_c_##name \
name##_i_to_s_c(const struct bkey_i_##name *k) \
{ \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name \
bkey_i_to_s_c_##name(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
{ \
struct bkey_i_##name *k = \
container_of(&_k->k, struct bkey_i_##name, k); \
\
bkey_init(&k->k); \
memset(&k->v, 0, sizeof(k->v)); \
k->k.type = KEY_TYPE_##name; \
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
\
return k; \
}
BCH_BKEY_TYPES();
#undef x
#endif /* _BCACHEFS_BKEY_TYPES_H */
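For a hypothetical x(example, ...) entry in BCH_BKEY_TYPES(), the x() macro above generates roughly the following (an illustrative sketch of the expansion; "example" and "bch_example" are made-up names, not a real key type):

struct bkey_i_example {
	union {
		struct bkey	k;
		struct bkey_i	k_i;
	};
	struct bch_example	v;
};

/* ...plus struct bkey_s_example / bkey_s_c_example with split key and value
 * pointers, checked conversions such as bkey_i_to_example() (EBUG_ON() if
 * k->k.type != KEY_TYPE_example), and bkey_example_init(), which zeroes the
 * value and sets the key's type and size. */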

View File

@ -9,12 +9,12 @@
#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "trace.h"
#include "util.h"
#include <asm/unaligned.h>
#include <linux/console.h>
#include <linux/eytzinger.h>
#include <linux/random.h>
#include <linux/prefetch.h>

View File

@ -661,7 +661,7 @@ out:
bch2_btree_keys_init(b);
set_btree_node_accessed(b);
time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
memalloc_nofs_restore(flags);

View File

@ -593,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
if (!g->gen_valid &&
(c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_alloc_key,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (fsck_err_on(!g->gen_valid,
c, ptr_to_missing_alloc_key,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@ -611,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
(c->opts.reconstruct_alloc ||
fsck_err(c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@ -633,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
(c->opts.reconstruct_alloc ||
fsck_err(c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
(c->opts.reconstruct_alloc ||
fsck_err(c, stale_dirty_ptr,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
c, stale_dirty_ptr,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
@ -1366,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
struct bucket gc, *b;
struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
struct bch_alloc_v4 old_convert, new;
const struct bch_alloc_v4 *old;
enum bch_data_type type;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
@ -1378,30 +1373,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
b = gc_bucket(ca, iter->pos.offset);
old_gc = *b;
if ((old->data_type == BCH_DATA_sb ||
old->data_type == BCH_DATA_journal) &&
!bch2_dev_is_online(ca)) {
b->data_type = old->data_type;
b->dirty_sectors = old->dirty_sectors;
}
/*
* b->data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
type = __alloc_data_type(b->dirty_sectors,
b->cached_sectors,
b->stripe,
*old,
b->data_type);
if (b->data_type != type) {
struct bch_dev_usage *u;
preempt_disable();
u = this_cpu_ptr(ca->usage_gc);
u->d[b->data_type].buckets--;
b->data_type = type;
u->d[b->data_type].buckets++;
preempt_enable();
}
b->data_type = __alloc_data_type(b->dirty_sectors,
b->cached_sectors,
b->stripe,
*old,
b->data_type);
gc = *b;
percpu_up_read(&c->mark_lock);
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors)
bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
if (metadata_only &&
gc.data_type != BCH_DATA_sb &&
gc.data_type != BCH_DATA_journal &&
@ -1411,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (gen_after(old->gen, gc.gen))
return 0;
if (c->opts.reconstruct_alloc ||
fsck_err_on(new.data_type != gc.data_type, c,
if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
@ -1423,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
if (c->opts.reconstruct_alloc || \
fsck_err_on(new._f != gc._f, c, _errtype, \
if (fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@ -1586,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
" should be %u",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@ -1596,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
new->k.type = KEY_TYPE_deleted;
else
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
ret = bch2_trans_update(trans, iter, new, 0);
}
fsck_err:
printbuf_exit(&buf);
@ -1818,10 +1812,10 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
ret = bch2_gc_stripes_done(c, metadata_only) ?:
bch2_gc_reflink_done(c, metadata_only) ?:
bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
ret = bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only) ?:
bch2_gc_stripes_done(c, metadata_only) ?:
bch2_gc_reflink_done(c, metadata_only);
bch2_journal_unblock(&c->journal);
}
@ -1971,7 +1965,7 @@ int bch2_gc_gens(struct bch_fs *c)
c->gc_count++;
time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
err:
for_each_member_device(c, ca) {

View File

@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
if (sorting_entire_node)
time_stats_update(&c->times[BCH_TIME_btree_node_sort],
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);
/* Make sure we preserve bset journal_seq: */
@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
&dst->format,
true);
time_stats_update(&c->times[BCH_TIME_btree_node_sort],
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);
set_btree_bset_end(dst, dst->set);
@ -839,6 +839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
if (k->u64s < bkeyp_key_u64s(&b->format, k))
return false;
struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@ -880,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
"invalid bkey format %u", k->format))
goto drop_this_key;
/* XXX: validate k->u64s */
if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_u64s,
"k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
goto drop_this_key;
if (!write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
@ -1250,7 +1259,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
out:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@ -1322,7 +1331,7 @@ start:
}
}
time_stats_update(&c->times[BCH_TIME_btree_node_read],
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);

View File

@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
btree_path_set_should_be_locked(trans->paths + iter->path);
struct btree_path *path = btree_iter_path(trans, iter);
if (btree_path_node(path, path->level))
btree_path_set_should_be_locked(path);
return 0;
}
@ -2905,7 +2907,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
time_after64(now, trans->last_begin_time + 10))
__time_stats_update(&btree_trans_stats(trans)->duration,
__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
trans->last_begin_time, now);
if (!trans->restarted &&
@ -3230,7 +3232,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
kfree(s->max_paths_text);
time_stats_exit(&s->lock_hold_times);
bch2_time_stats_exit(&s->lock_hold_times);
}
if (c->btree_trans_barrier_initialized)
@ -3246,8 +3248,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
time_stats_init(&s->duration);
time_stats_init(&s->lock_hold_times);
bch2_time_stats_init(&s->duration);
bch2_time_stats_init(&s->lock_hold_times);
mutex_init(&s->lock);
}

View File

@ -512,7 +512,7 @@ int bch2_journal_keys_sort(struct bch_fs *c)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
cond_resched();

View File

@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct bkey_i *new_k = NULL;
int ret;
k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
BTREE_ITER_KEY_CACHE_FILL|
BTREE_ITER_CACHED_NOFILL);
bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
BTREE_ITER_KEY_CACHE_FILL|
BTREE_ITER_CACHED_NOFILL);
iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;

View File

@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
__time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
__bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
path->l[level].lock_taken_time,
local_clock());
#endif

View File

@ -2,13 +2,13 @@
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H
#include <linux/darray_types.h>
#include <linux/list.h>
#include <linux/rhashtable.h>
#include "bbpos_types.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
#include "journal_types.h"
#include "replicas_types.h"

View File

@ -14,8 +14,6 @@
#include "snapshot.h"
#include "trace.h"
#include <linux/darray.h>
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
@ -454,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
* the key cache - but the key has to exist in the btree for that to
* work:
*/
if (path->cached && bkey_deleted(&i->old_k))
if (path->cached && !i->old_btree_u64s)
return flush_new_cached_update(trans, i, flags, ip);
return 0;

View File

@ -25,8 +25,7 @@
#include <linux/random.h>
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *,
struct keylist *, unsigned);
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@ -517,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
bch2_disk_reservation_put(c, &as->disk_res);
bch2_btree_reserve_put(as, trans);
time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);
mutex_lock(&c->btree_interior_update_lock);
@ -1039,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
start_time);
}
@ -1208,10 +1207,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
!btree_node_dying(btree_node_root(c, b))));
bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
@ -1477,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as,
static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path, struct btree *b,
struct keylist *keys, unsigned flags)
struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(trans->paths + path, b);
@ -1578,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err;
} else if (n3) {
@ -1630,7 +1625,7 @@ out:
bch2_trans_verify_locks(trans);
time_stats_update(&c->times[n2
bch2_time_stats_update(&c->times[n2
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact],
start_time);
@ -1673,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
* @flags: transaction commit flags
*
* Returns: 0 on success, typically transaction restart error on failure
*
@ -1683,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b,
struct keylist *keys, unsigned flags)
struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree_path *path = trans->paths + path_idx;
@ -1739,7 +1733,7 @@ split:
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
return btree_split(as, trans, path_idx, b, keys, flags);
return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@ -1747,7 +1741,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
unsigned flags)
{
/* btree_split & merge may both cause paths array to be reallocated */
struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
@ -1759,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
ret = btree_split(as, trans, path, b, NULL, flags);
ret = btree_split(as, trans, path, b, NULL);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
@ -1775,6 +1768,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return ret;
}
static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx)
{
struct bch_fs *c = as->c;
struct btree_path *path = trans->paths + path_idx;
struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
BUG_ON(!btree_node_locked(path, b->c.level));
n = __btree_root_alloc(as, trans, b->c.level + 1);
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
path->locks_want++;
BUG_ON(btree_node_locked(path, n->c.level));
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, path, n);
n->sib_u64s[0] = U16_MAX;
n->sib_u64s[1] = U16_MAX;
bch2_keylist_add(&as->parent_keys, &b->key);
btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
bch2_btree_set_root(as, trans, path, n);
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bch2_trans_node_add(trans, path, n);
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
list_add_tail(&b->list, &c->btree_cache.live);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
}
int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
struct btree_update *as =
bch2_btree_update_start(trans, trans->paths + path,
b->c.level, true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
__btree_increase_depth(as, trans, path);
bch2_btree_update_done(as, trans);
return 0;
}
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_path_idx_t path,
unsigned level,
@ -1915,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err_free_update;
@ -1935,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_done(as, trans);
time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
if (new_path)
@ -1986,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
ret = bch2_btree_insert_node(as, trans, iter->path,
parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
if (ret)
goto err;
} else {

View File

@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#include <linux/darray_types.h>
#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4

View File

@ -11,6 +11,7 @@
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "thread_with_file.h"
#include <linux/cdev.h>
#include <linux/device.h>
@ -19,15 +20,8 @@
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/thread_with_file.h>
#include <linux/uaccess.h>
__must_check
static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
{
return copy_to_user(to, from, n) ? -EFAULT : 0;
}
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@ -172,9 +166,9 @@ static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
bch2_fs_stop(c);
if (ret & 1)
stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
if (ret & 4)
stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
return ret;
}
@ -236,7 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
ret = run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
err:
if (ret < 0) {
if (thr)
@ -439,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
{
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
thread_with_file_exit(&ctx->thr);
bch2_thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@ -489,7 +483,7 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
ret = run_thread_with_file(&ctx->thr,
ret = bch2_run_thread_with_file(&ctx->thr,
&bcachefs_data_ops,
bch2_data_thread);
if (ret < 0)
@ -857,7 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
goto err;
}
ret = run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
if (ret < 0) {
bch_err_fn(c, ret);

View File

@ -1,13 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#include <linux/darray.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include "darray.h"
int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);

View File

@ -1,26 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#ifndef _LINUX_DARRAY_H
#define _LINUX_DARRAY_H
#ifndef _BCACHEFS_DARRAY_H
#define _BCACHEFS_DARRAY_H
/*
* Dynamic arrays
* Dynamic arrays:
*
* Inspired by CCAN's darray
*/
#include <linux/darray_types.h>
#include <linux/slab.h>
int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t);
#define DARRAY_PREALLOCATED(_type, _nr) \
struct { \
size_t nr, size; \
_type *data; \
_type preallocated[_nr]; \
}
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
static inline int __darray_resize(darray_char *d, size_t element_size,
size_t new_size, gfp_t gfp)
{
return unlikely(new_size > d->size)
? __darray_resize_slowpath(d, element_size, new_size, gfp)
? __bch2_darray_resize(d, element_size, new_size, gfp)
: 0;
}
@ -61,28 +69,6 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
#define darray_first(_d) ((_d).data[0])
#define darray_last(_d) ((_d).data[(_d).nr - 1])
/* Insert/remove items into the middle of a darray: */
#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)
#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
#define darray_insert_item(_d, pos, _item) \
({ \
size_t _pos = (pos); \
@ -93,15 +79,10 @@ do { \
_ret; \
})
#define darray_remove_items(_d, _pos, _nr_to_remove) \
array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove)
#define darray_remove_item(_d, _pos) \
darray_remove_items(_d, _pos, 1)
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
/* Iteration: */
#define __darray_for_each(_d, _i) \
#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each(_d, _i) \
@ -125,4 +106,4 @@ do { \
darray_init(_d); \
} while (0)
#endif /* _LINUX_DARRAY_H */
#endif /* _BCACHEFS_DARRAY_H */
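A usage sketch for the DARRAY() API above, assuming the darray_init()/darray_push()/darray_exit() helpers defined elsewhere in this header (darray_push() is not shown in the hunk):

#include "darray.h"

typedef DARRAY(int) darray_int;

static int collect_squares(darray_int *out, unsigned n)
{
	darray_init(out);

	for (unsigned i = 0; i < n; i++) {
		/* grows the backing array as needed; nonzero (-ENOMEM) on failure */
		int ret = darray_push(out, i * i);
		if (ret) {
			darray_exit(out);	/* frees any heap allocation */
			return ret;
		}
	}
	return 0;
}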

View File

@ -5,6 +5,10 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
x(EINVAL, mount_option) \
x(BCH_ERR_mount_option, option_name) \
x(BCH_ERR_mount_option, option_value) \
x(BCH_ERR_mount_option, option_not_bool) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@ -247,7 +251,8 @@
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem)
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, need_inode_lock)
enum bch_errcode {
BCH_ERR_START = 2048,

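Each x(class, err) entry above declares a bcachefs-private error code and the parent it chains to, so the new mount option errors resolve back to EINVAL through BCH_ERR_mount_option. A minimal sketch of how such an x-macro list can expand into an enum, simplified and not the real bcachefs errcode machinery:

#define EXAMPLE_ERRCODES()				\
	x(EINVAL,		mount_option)		\
	x(ERR_mount_option,	option_not_bool)

enum example_errcode {
	ERR_START = 2048,	/* past the range of ordinary errnos */
#define x(class, err)	ERR_##err,
	EXAMPLE_ERRCODES()
#undef x
	ERR_MAX
};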
View File

@ -3,7 +3,7 @@
#include "error.h"
#include "recovery.h"
#include "super.h"
#include <linux/thread_with_file.h>
#include "thread_with_file.h"
#define FSCK_ERR_RATELIMIT_NR 10
@ -111,7 +111,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
do {
bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
if (r < 0)
return YN_NO;
buf[r] = '\0';

View File

@ -43,6 +43,11 @@ enum bkey_invalid_flags;
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
#define extent_entry_next_safe(_entry, _end) \
(likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
? extent_entry_next(_entry) \
: _end)
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
(_entry) < (_end); \
(_entry) = extent_entry_next(_entry))
(_entry) = extent_entry_next_safe(_entry, _end))
#define __bkey_ptr_next(_ptr, _end) \
({ \
@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
(_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (extent_entry_type(_entry)) { \
switch (__extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
(_ptr).ptr = _entry->ptr; \
goto out; \
@ -344,7 +349,7 @@ out: \
for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
(_entry) = _start; \
__bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
(_entry) = extent_entry_next(_entry))
(_entry) = extent_entry_next_safe(_entry, _end))
#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \

View File

@ -1,37 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_EYTZINGER_H
#define _LINUX_EYTZINGER_H
#ifndef _EYTZINGER_H
#define _EYTZINGER_H
#include <linux/bitops.h>
#include <linux/log2.h>
#ifdef EYTZINGER_DEBUG
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
#endif
#include "util.h"
/*
* Traversal for trees in eytzinger layout - a full binary tree laid out in an
* array.
* array
*/
/*
* One based indexing version:
*
* Consider using an eytzinger tree any time you would otherwise be doing binary
* search over an array. Binary search is a worst case scenario for branch
* prediction and prefetching, but in an eytzinger tree every node's children
* are adjacent in memory, thus we can prefetch children before knowing the
* result of the comparison, assuming multiple nodes fit on a cacheline.
*
* Two variants are provided, for one based indexing and zero based indexing.
*
* Zero based indexing is more convenient, but one based indexing has better
* alignment and thus better performance because each new level of the tree
* starts at a power of two, and thus if element 0 was cacheline aligned, each
* new level will be as well.
* With one based indexing each level of the tree starts at a power of two -
* good for cacheline alignment:
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
EYTZINGER_BUG_ON(child > 1);
EBUG_ON(child > 1);
return (i << 1) + child;
}
@ -68,7 +58,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
EYTZINGER_BUG_ON(i > size);
EBUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@ -84,7 +74,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
EYTZINGER_BUG_ON(i > size);
EBUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@ -111,7 +101,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
EYTZINGER_BUG_ON(!i || i > size);
EBUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@ -136,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
EYTZINGER_BUG_ON(!i || i > size);
EBUG_ON(!i || i > size);
/*
* sign bit trick:
@ -174,7 +164,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
EYTZINGER_BUG_ON(child > 1);
EBUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@ -241,9 +231,11 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
eytzinger_cmp_fn cmp, const void *search)
{
unsigned i, n = 0;
@ -252,7 +244,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
} while (n < nr);
if (n & 1) {
@ -277,13 +269,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \
\
while (_i < _nr && \
(_res = _cmp(_search, _base + _i * _size))) \
(_res = _cmp(_search, _base + _i * _size, _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
void eytzinger0_sort_r(void *, size_t, size_t,
cmp_r_func_t, swap_r_func_t, const void *);
void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
#endif /* _LINUX_EYTZINGER_H */
#endif /* _EYTZINGER_H */
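A standalone sketch of the one-based indexing arithmetic this header is built around (illustrative only; the helpers mirror eytzinger1_child() and friends above):

#include <assert.h>

static unsigned eyt1_left(unsigned i)	{ return 2 * i; }
static unsigned eyt1_right(unsigned i)	{ return 2 * i + 1; }
static unsigned eyt1_parent(unsigned i)	{ return i / 2; }

int main(void)
{
	/* with one-based indexing, level k occupies indices [2^k, 2^(k+1)),
	 * so every level starts at a power of two - cacheline friendly */
	assert(eyt1_left(1) == 2 && eyt1_right(1) == 3);
	assert(eyt1_parent(4) == 2 && eyt1_parent(5) == 2);
	return 0;
}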

View File

@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
loff_t pos, unsigned len)
loff_t pos, unsigned len,
bool inode_locked)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
/*
* If we're not using the inode lock, we need to lock all the folios for
* atomicity of writes vs. other writes:
*/
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
ret = -BCH_ERR_need_inode_lock;
goto out;
}
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
if (end > inode->v.i_size)
if (end > inode->v.i_size) {
BUG_ON(!inode_locked);
i_size_write(&inode->v, end);
}
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
int ret = 0;
loff_t pos;
bool inode_locked = false;
ssize_t written = 0, written2 = 0, ret = 0;
/*
* We don't take the inode lock unless i_size will be changing. Folio
* locks provide exclusion with other writes, and the pagecache add lock
* provides exclusion with truncate and hole punching.
*
* There is one nasty corner case where atomicity would be broken
* without great care: when copying data from userspace to the page
* cache, we do that with faults disabled - a page fault would recurse
* back into the filesystem, taking filesystem locks again, and
* deadlock; so it's done with faults disabled, and we fault in the user
* buffer when we aren't holding locks.
*
* If we do part of the write but then race, and the pages backing the
* userspace buffer have been evicted and are no longer resident, we have to
* drop our folio locks to re-fault them in, breaking write atomicity.
*
* To fix this, we restart the write from the start, if we weren't
* holding the inode lock.
*
* There is another wrinkle after that; if we restart the write from the
* start, and then get an unrecoverable error, we _cannot_ claim to
* userspace that we did not write data we actually did - so we must
* track (written2) the most we ever wrote.
*/
if ((iocb->ki_flags & IOCB_APPEND) ||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
inode_lock(&inode->v);
inode_locked = true;
}
ret = generic_write_checks(iocb, iter);
if (ret <= 0)
goto unlock;
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
if (ret) {
if (!inode_locked) {
inode_lock(&inode->v);
inode_locked = true;
ret = file_remove_privs_flags(file, 0);
}
if (ret)
goto unlock;
}
ret = file_update_time(file);
if (ret)
goto unlock;
pos = iocb->ki_pos;
bch2_pagecache_add_get(inode);
if (!inode_locked &&
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
goto get_inode_lock;
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@ -1004,12 +1072,17 @@ again:
}
}
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
goto get_inode_lock;
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
if (ret == -BCH_ERR_need_inode_lock)
goto get_inode_lock;
if (unlikely(ret < 0))
break;
@ -1030,50 +1103,46 @@ again:
}
pos += ret;
written += ret;
written2 = max(written, written2);
if (ret != bytes && !inode_locked)
goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
if (0) {
get_inode_lock:
bch2_pagecache_add_put(inode);
inode_lock(&inode->v);
inode_locked = true;
bch2_pagecache_add_get(inode);
iov_iter_revert(iter, written);
pos -= written;
written = 0;
ret = 0;
}
} while (iov_iter_count(iter));
bch2_pagecache_add_put(inode);
return written ? written : ret;
}
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
ssize_t ret;
if (iocb->ki_flags & IOCB_DIRECT) {
ret = bch2_direct_write(iocb, from);
goto out;
}
inode_lock(&inode->v);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto unlock;
ret = file_remove_privs(file);
if (ret)
goto unlock;
ret = file_update_time(file);
if (ret)
goto unlock;
ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
unlock:
inode_unlock(&inode->v);
if (inode_locked)
inode_unlock(&inode->v);
iocb->ki_pos += written;
ret = max(written, written2) ?: ret;
if (ret > 0)
ret = generic_write_sync(iocb, ret);
out:
return ret;
}
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, iter)
: bch2_buffered_write(iocb, iter);
return bch2_err_class(ret);
}
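
A minimal userspace model of the restart-and-track pattern the comment above describes may help: copy_chunk(), the locked flag and the 64-byte chunk size are illustrative stand-ins, not bcachefs code, but the control flow (restart from the beginning when a short copy happens without the lock, and report the most ever written via written2) mirrors the new bch2_buffered_write().

/*
 * Toy model: a "fault" causes one short copy when the lock isn't held;
 * we then take the lock, revert, and restart, tracking written2.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#define CHUNK	64

/* Pretend copy: the first unlocked attempt "faults" partway through */
static size_t copy_chunk(char *dst, const char *src, size_t len, bool locked)
{
	static bool faulted_once;

	if (!locked && !faulted_once && len > 8) {
		faulted_once = true;
		len /= 2;		/* short copy, as after a page fault */
	}
	memcpy(dst, src, len);
	return len;
}

static ssize_t buffered_write(char *dst, const char *src, size_t len)
{
	bool locked = false;
	size_t written = 0, written2 = 0;	/* written2: most we ever wrote */
restart:
	while (written < len) {
		size_t want = len - written < CHUNK ? len - written : CHUNK;
		size_t got  = copy_chunk(dst + written, src + written, want, locked);

		written += got;
		written2 = written > written2 ? written : written2;

		if (got != want && !locked) {
			/* can't keep folio locks across a fault: restart with the lock */
			locked = true;
			written = 0;
			goto restart;
		}
	}
	return written2;
}

int main(void)
{
	char src[200], dst[200];

	memset(src, 'x', sizeof(src));
	printf("wrote %zd bytes\n", buffered_write(dst, src, sizeof(src)));
	return 0;
}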

View File

@ -51,13 +51,10 @@ enum bch_folio_sector_state {
struct bch_folio_sector {
/* Uncompressed, fully allocated replicas (or on disk reservation): */
unsigned nr_replicas:4;
u8 nr_replicas:4,
/* Owns PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
unsigned replicas_reserved:4;
/* i_sectors: */
enum bch_folio_sector_state state:8;
replicas_reserved:4;
u8 state;
};
struct bch_folio {

View File

@ -1870,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(NULL, &opts, data);
if (ret)
if (ret) {
ret = bch2_err_class(ret);
return ERR_PTR(ret);
}
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);

View File

@ -5,6 +5,7 @@
#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs-common.h"
@ -17,7 +18,6 @@
#include "xattr.h"
#include <linux/bsearch.h>
#include <linux/darray.h>
#include <linux/dcache.h> /* struct qstr */
/*
@ -849,12 +849,9 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
int ret = bkey_err(k);
if (ret)
return ret;
int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
bch2_trans_iter_exit(trans, &iter);
return k.k->type == KEY_TYPE_set;
return ret;
}
static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
@ -970,7 +967,7 @@ static int check_inode(struct btree_trans *trans,
if (ret < 0)
return ret;
fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
ret = 0;

View File

@ -1181,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bool need_another_pass;
int ret;
again:
/*
* if we ran check_inodes(), unlinked inodes will already have been
* cleaned up, but the write buffer will be out of sync; therefore we
* always need a write buffer flush
*/
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
need_another_pass = false;
/*
@ -1213,12 +1222,8 @@ again:
ret;
}));
if (!ret && need_another_pass) {
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
if (!ret && need_another_pass)
goto again;
}
err:
bch2_trans_put(trans);
return ret;

View File

@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;
time_stats_update(&c->times[BCH_TIME_data_promote],
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
promote_free(c, op);
}
@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
if (rbio->start_time)
time_stats_update(&rbio->c->times[BCH_TIME_data_read],
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
bio_endio(&rbio->bio);
}

View File

@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
bch2_congested_acct(ca, io_latency, now, rw);
__time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)
EBUG_ON(op->open_buckets.nr);
time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);
if (!(op->flags & BCH_WRITE_MOVE))

View File

@ -84,12 +84,8 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
prt_str(out, "separate_flush ");
if (buf->need_flush_to_write_buffer)
prt_str(out, "need_flush_to_write_buffer ");
if (buf->need_flush_to_write_buffer)
prt_str(out, "need_flush_to_write_buffer ");
if (buf->write_done)
prt_str(out, "write done ");
if (buf->write_started)
prt_str(out, "write started ");
prt_str(out, "write_started ");
if (buf->write_allocated)
prt_str(out, "write allocated ");
if (buf->write_done)
@ -715,7 +711,7 @@ recheck_need_open:
return ret;
seq = res.seq;
buf = j->buf + (seq & JOURNAL_BUF_MASK);
buf = journal_seq_to_buf(j, seq);
buf->must_flush = true;
if (!buf->flush_time) {
@ -733,8 +729,8 @@ recheck_need_open:
}
/*
* if write was kicked off without a flush, flush the next sequence
* number instead
* if write was kicked off without a flush, or if we promised it
* wouldn't be a flush, flush the next sequence number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
@ -768,7 +764,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
if (!ret)
time_stats_update(j->flush_seq_time, start_time);
bch2_time_stats_update(j->flush_seq_time, start_time);
return ret ?: ret2 < 0 ? ret2 : 0;
}
@ -812,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
/* journal write is already in flight, and was a flush write: */
if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
/* journal flush already in flight, or flush requested */
if (buf->must_flush)
goto out;
buf->noflush = true;
@ -1203,7 +1199,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
last_seq = le64_to_cpu(i->j.last_seq);
@ -1236,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);

View File

@ -86,9 +86,12 @@ static void __journal_replay_free(struct bch_fs *c,
kvfree(i);
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
i->ignore = true;
if (blacklisted)
i->ignore_blacklisted = true;
else
i->ignore_not_dirty = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(c, i);
@ -138,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
if (le64_to_cpu(i->j.seq) >= last_seq)
break;
journal_replay_free(c, i);
journal_replay_free(c, i, false);
}
}
@ -199,8 +203,9 @@ replace:
return -BCH_ERR_ENOMEM_journal_entry_add;
darray_init(&i->ptrs);
i->csum_good = entry_ptr.csum_good;
i->ignore = false;
i->csum_good = entry_ptr.csum_good;
i->ignore_blacklisted = false;
i->ignore_not_dirty = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
if (dup) {
@ -1255,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c,
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
i->ignore = true;
i->ignore_blacklisted = true;
continue;
}
if (!last_write_torn && !i->csum_good) {
last_write_torn = true;
i->ignore = true;
i->ignore_blacklisted = true;
continue;
}
@ -1307,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < *last_seq) {
journal_replay_free(c, i);
journal_replay_free(c, i, false);
continue;
}
@ -1320,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
i->ignore = true;
i->ignore_blacklisted = true;
}
}
@ -1329,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@ -1382,7 +1387,7 @@ int bch2_journal_read(struct bch_fs *c,
};
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
darray_for_each(i->ptrs, ptr) {
@ -1602,9 +1607,9 @@ static CLOSURE_CALLBACK(journal_write_done)
u64 v, seq = le64_to_cpu(w->data->seq);
int err = 0;
time_stats_update(!JSET_NO_FLUSH(w->data)
? j->flush_write_time
: j->noflush_write_time, j->write_start_time);
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
? j->flush_write_time
: j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
@ -1667,6 +1672,7 @@ static CLOSURE_CALLBACK(journal_write_done)
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
closure_wake_up(&w->wait);
completed = true;
}
@ -1676,7 +1682,6 @@ static CLOSURE_CALLBACK(journal_write_done)
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
closure_wake_up(&w->wait);
journal_wake(j);
}
@ -1930,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
j->nr_noflush_writes++;
} else {
w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
#include <linux/darray_types.h>
#include "darray.h"
struct journal_ptr {
bool csum_good;
@ -20,11 +20,17 @@ struct journal_replay {
DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
bool ignore;
bool ignore_blacklisted;
bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
static inline bool journal_replay_ignore(struct journal_replay *i)
{
return !i || i->ignore_blacklisted || i->ignore_not_dirty;
}
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{

View File

@ -2,8 +2,8 @@
#include "bcachefs.h"
#include "journal_sb.h"
#include "darray.h"
#include <linux/darray.h>
#include <linux/sort.h>
/* BCH_SB_FIELD_journal: */

View File

@ -2,11 +2,10 @@
#include "bcachefs.h"
#include "btree_iter.h"
#include "eytzinger.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <linux/eytzinger.h>
/*
* journal_seq_blacklist machinery:
*
@ -44,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
static struct bch_sb_field_journal_seq_blacklist *
blacklist_entry_try_merge(struct bch_fs *c,
struct bch_sb_field_journal_seq_blacklist *bl,
unsigned i)
{
unsigned nr = blacklist_nr_entries(bl);
if (le64_to_cpu(bl->start[i].end) >=
le64_to_cpu(bl->start[i + 1].start)) {
bl->start[i].end = bl->start[i + 1].end;
--nr;
memmove(&bl->start[i],
&bl->start[i + 1],
sizeof(bl->start[0]) * (nr - i));
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
sb_blacklist_u64s(nr));
BUG_ON(!bl);
}
return bl;
}
static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
u64 start, u64 end)
{
return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
}
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
unsigned i, nr;
unsigned i = 0, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
for (i = 0; i < nr; i++) {
while (i < nr) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
if (bl_entry_contig_or_overlaps(e, start, end)) {
e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
if (end < le64_to_cpu(e->start))
break;
if (i + 1 < nr)
bl = blacklist_entry_try_merge(c,
bl, i);
if (i)
bl = blacklist_entry_try_merge(c,
bl, i - 1);
goto out_write_sb;
if (start > le64_to_cpu(e->end)) {
i++;
continue;
}
/*
* Entry is contiguous or overlapping with new entry: merge it
* with new entry, and delete:
*/
start = min(start, le64_to_cpu(e->start));
end = max(end, le64_to_cpu(e->end));
array_remove_item(bl->start, nr, i);
}
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@ -108,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
goto out;
}
bl->start[nr].start = cpu_to_le64(start);
bl->start[nr].end = cpu_to_le64(end);
out_write_sb:
array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
.start = cpu_to_le64(start),
.end = cpu_to_le64(end),
}));
c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
@ -120,7 +95,8 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
static int journal_seq_blacklist_table_cmp(const void *_l,
const void *_r, size_t size)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
if (!bl)
return 0;
t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
GFP_KERNEL);
t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
return -BCH_ERR_ENOMEM_blacklist_table_init;
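
To see what the rewritten merge loop above does, here is a self-contained userspace sketch built on the array_insert_item()/array_remove_item() helpers this commit adds to util.h; the entry layout, the fixed 16-entry array and the plain u64 fields (no le64 conversion) are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* helpers copied from the new util.h additions */
#define __array_insert_item(_array, _nr, _pos)				\
	memmove(&(_array)[(_pos) + 1],					\
		&(_array)[(_pos)],					\
		sizeof((_array)[0]) * ((_nr) - (_pos)))

#define array_insert_item(_array, _nr, _pos, _new_item)			\
do {									\
	__array_insert_item(_array, _nr, _pos);				\
	(_nr)++;							\
	(_array)[(_pos)] = (_new_item);					\
} while (0)

#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
do {									\
	(_nr) -= (_nr_to_remove);					\
	memmove(&(_array)[(_pos)],					\
		&(_array)[(_pos) + (_nr_to_remove)],			\
		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
} while (0)

#define array_remove_item(_array, _nr, _pos)				\
	array_remove_items(_array, _nr, _pos, 1)

struct entry { uint64_t start, end; };

static struct entry	bl[16] = { { 1, 3 }, { 5, 7 }, { 10, 12 } };
static unsigned		nr = 3;

static void blacklist_add(uint64_t start, uint64_t end)
{
	unsigned i = 0;

	while (i < nr) {
		struct entry *e = bl + i;

		if (end < e->start)
			break;
		if (start > e->end) {
			i++;
			continue;
		}
		/* contiguous or overlapping: absorb the entry and delete it */
		start = start < e->start ? start : e->start;
		end   = end   > e->end   ? end   : e->end;
		array_remove_item(bl, nr, i);
	}

	array_insert_item(bl, nr, i, ((struct entry) { start, end }));
}

int main(void)
{
	blacklist_add(6, 11);	/* merges [5,7] and [10,12] into [5,12] */

	for (unsigned i = 0; i < nr; i++)
		printf("[%llu, %llu]\n",
		       (unsigned long long) bl[i].start,
		       (unsigned long long) bl[i].end);
	return 0;
}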

View File

@ -287,9 +287,9 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
struct time_stats *flush_write_time;
struct time_stats *noflush_write_time;
struct time_stats *flush_seq_time;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map res_map;

View File

@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}
if (c->opts.reconstruct_alloc ||
fsck_err(c, lru_entry_bad,
if (fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",

View File

@ -40,9 +40,10 @@
#include <linux/limits.h>
#include <linux/math.h>
#include <linux/math64.h>
#include <linux/mean_and_variance.h>
#include <linux/module.h>
#include "mean_and_variance.h"
u128_u u128_div(u128_u n, u64 d)
{
u128_u r;

View File

@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
u64 start_time = local_clock();
__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
}
}

View File

@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c,
if (ret < 0 || (*res != 0 && *res != 1)) {
if (err)
prt_printf(err, "%s: must be bool", opt->attr.name);
return ret;
return ret < 0 ? ret : -BCH_ERR_option_not_bool;
}
break;
case BCH_OPT_UINT:
@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts = kstrdup(options, GFP_KERNEL);
if (!copied_opts)
return -1;
return -ENOMEM;
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
bad_opt:
pr_err("Bad mount option %s", name);
ret = -1;
ret = -BCH_ERR_option_name;
goto out;
bad_val:
pr_err("Invalid mount option %s", err.buf);
ret = -1;
ret = -BCH_ERR_option_value;
goto out;
out:
kfree(copied_opts_start);

View File

@ -290,6 +290,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
x(no_splitbrain_check, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \

View File

@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id)
}
/* for -o reconstruct_alloc: */
static void drop_alloc_keys(struct journal_keys *keys)
static void do_reconstruct_alloc(struct bch_fs *c)
{
bch2_journal_log_msg(c, "dropping alloc info");
bch_info(c, "dropping and reconstructing all alloc info");
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
struct journal_keys *keys = &c->journal_keys;
size_t src, dst;
for (src = 0, dst = 0; src < keys->nr; src++)
if (!btree_id_is_alloc(keys->data[src].btree_id))
keys->data[dst++] = keys->data[src];
keys->nr = dst;
}
@ -122,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (ret)
goto out;
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
}
/* Must be checked with btree locked: */
if (k->overwritten)
goto out;
@ -355,7 +399,7 @@ static int journal_replay_early(struct bch_fs *c,
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
vstruct_for_each(&i->j, entry) {
@ -384,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
if (btree_id_is_alloc(i) &&
c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
}
if (r->error) {
__fsck_err(c,
@ -857,7 +898,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto out;
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i && !(*i)->ignore) {
if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
@ -882,7 +923,8 @@ int bch2_fs_recovery(struct bch_fs *c)
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i) {
last_journal_entry = &(*i)->j;
(*i)->ignore = false;
(*i)->ignore_blacklisted = false;
(*i)->ignore_not_dirty = false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
@ -918,10 +960,8 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
if (c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
drop_alloc_keys(&c->journal_keys);
}
if (c->opts.reconstruct_alloc)
do_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@ -945,7 +985,7 @@ use_clean:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
}
}
@ -956,9 +996,6 @@ use_clean:
if (ret)
goto err;
if (c->opts.reconstruct_alloc)
bch2_journal_log_msg(c, "dropping alloc info");
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:

View File

@ -6,15 +6,12 @@
#include "replicas.h"
#include "super-io.h"
#include <linux/sort.h>
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
static int bch2_memcmp(const void *l, const void *r, size_t size)
{
size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@ -42,8 +39,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@ -232,7 +228,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
verify_replicas_entry(search);
#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
@ -828,11 +824,10 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
sort_r(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
bch2_memcmp, NULL,
(void *)(size_t)cpu_r->entry_size);
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =

View File

@ -3,10 +3,9 @@
#define _BCACHEFS_REPLICAS_H
#include "bkey.h"
#include "eytzinger.h"
#include "replicas_types.h"
#include <linux/eytzinger.h>
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);

View File

@ -6,13 +6,12 @@
*/
#include "bcachefs.h"
#include "darray.h"
#include "recovery.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
#include <linux/darray.h>
#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
/*
@ -260,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
if (e < BCH_SB_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
__set_bit_le64(e, ext->errors_silent);
}
}
}

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
#define _BCACHEFS_SB_ERRORS_TYPES_H
#include <linux/darray_types.h>
#include "darray.h"
#define BCH_SB_ERRS() \
x(clean_but_journal_not_empty, 0) \
@ -264,7 +264,8 @@
x(subvol_children_not_set, 256) \
x(subvol_children_bad, 257) \
x(subvol_loop, 258) \
x(subvol_unreachable, 259)
x(subvol_unreachable, 259) \
x(btree_node_bkey_bad_u64s, 260)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
#include <linux/darray.h>
#include "darray.h"
extern char * const bch2_member_error_strs[];

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_SUBVOLUME_H
#define _BCACHEFS_SUBVOLUME_H
#include "darray.h"
#include "subvolume_types.h"
enum bkey_invalid_flags;

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
#include <linux/darray_types.h>
#include "darray.h"
typedef DARRAY(u32) snapshot_id_list;

View File

@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return ret;
}
if (rw == WRITE &&
bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
le64_to_cpu(sb->seq));
return -BCH_ERR_invalid_sb_members_missing;
}
return 0;
}

View File

@ -3,12 +3,12 @@
#define _BCACHEFS_SUPER_IO_H
#include "extents.h"
#include "eytzinger.h"
#include "super_types.h"
#include "super.h"
#include "sb-members.h"
#include <asm/byteorder.h>
#include <linux/eytzinger.h>
static inline bool bch2_version_compatible(u16 version)
{

View File

@ -56,6 +56,7 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"
#include <linux/backing-dev.h>
@ -67,7 +68,6 @@
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <linux/thread_with_file.h>
#include <crypto/hash.h>
MODULE_LICENSE("GPL");
@ -87,20 +87,27 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
#ifdef __KERNEL__
if (unlikely(stdio)) {
if (fmt[0] == KERN_SOH[0])
fmt += 2;
bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
return;
}
#endif
vprintk(fmt, args);
}
void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
{
struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
va_list args;
va_start(args, fmt);
if (likely(!stdio)) {
vprintk(fmt, args);
} else {
if (fmt[0] == KERN_SOH[0])
fmt += 2;
stdio_redirect_vprintf(stdio, true, fmt, args);
}
bch2_print_maybe_redirect(stdio, fmt, args);
va_end(args);
}
@ -110,14 +117,7 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)
va_list args;
va_start(args, fmt);
if (likely(!stdio)) {
vprintk(fmt, args);
} else {
if (fmt[0] == KERN_SOH[0])
fmt += 2;
stdio_redirect_vprintf(stdio, true, fmt, args);
}
bch2_print_maybe_redirect(stdio, fmt, args);
va_end(args);
}
@ -532,7 +532,7 @@ static void __bch2_fs_free(struct bch_fs *c)
unsigned i;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
time_stats_exit(&c->times[i]);
bch2_time_stats_exit(&c->times[i]);
bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
@ -765,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal_keys.initial_ref_held = true;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
time_stats_init(&c->times[i]);
bch2_time_stats_init(&c->times[i]);
bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
@ -830,13 +830,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
pr_uuid(&name, c->sb.user_uuid.b);
strscpy(c->name, name.buf, sizeof(c->name));
printbuf_exit(&name);
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
goto err;
strscpy(c->name, name.buf, sizeof(c->name));
printbuf_exit(&name);
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@ -1073,7 +1073,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
}
static int bch2_dev_in_fs(struct bch_sb_handle *fs,
struct bch_sb_handle *sb)
struct bch_sb_handle *sb,
struct bch_opts *opts)
{
if (fs == sb)
return 0;
@ -1114,11 +1115,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
prt_newline(&buf);
prt_printf(&buf, "Not using older sb");
if (!opts->no_splitbrain_check)
prt_printf(&buf, "Not using older sb");
pr_err("%s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_device_splitbrain;
if (!opts->no_splitbrain_check)
return -BCH_ERR_device_splitbrain;
}
struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@ -1141,12 +1145,17 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
prt_str(&buf, "Not using ");
prt_bdevname(&buf, sb->bdev);
if (!opts->no_splitbrain_check) {
prt_str(&buf, "Not using ");
prt_bdevname(&buf, sb->bdev);
}
pr_err("%s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_device_splitbrain;
if (!opts->no_splitbrain_check)
return -BCH_ERR_device_splitbrain;
}
return 0;
@ -1180,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
time_stats_quantiles_exit(&ca->io_latency[WRITE]);
time_stats_quantiles_exit(&ca->io_latency[READ]);
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@ -1272,8 +1281,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
time_stats_quantiles_init(&ca->io_latency[READ]);
time_stats_quantiles_init(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
@ -1847,7 +1856,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
ret = bch2_dev_in_fs(&c->disk_sb, &sb);
ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
@ -2035,7 +2044,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
ret = bch2_dev_in_fs(best, sb);
ret = bch2_dev_in_fs(best, sb, &opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {

View File

@ -0,0 +1,450 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/sched/sysctl.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
if (thr->task) {
kthread_stop(thr->task);
put_task_struct(thr->task);
}
}
int bch2_run_thread_with_file(struct thread_with_file *thr,
const struct file_operations *fops,
int (*fn)(void *))
{
struct file *file = NULL;
int ret, fd = -1;
unsigned fd_flags = O_CLOEXEC;
if (fops->read && fops->write)
fd_flags |= O_RDWR;
else if (fops->read)
fd_flags |= O_RDONLY;
else if (fops->write)
fd_flags |= O_WRONLY;
char name[TASK_COMM_LEN];
get_task_comm(name, current);
thr->ret = 0;
thr->task = kthread_create(fn, thr, "%s", name);
ret = PTR_ERR_OR_ZERO(thr->task);
if (ret)
return ret;
ret = get_unused_fd_flags(fd_flags);
if (ret < 0)
goto err;
fd = ret;
file = anon_inode_getfile(name, fops, thr, fd_flags);
ret = PTR_ERR_OR_ZERO(file);
if (ret)
goto err;
get_task_struct(thr->task);
wake_up_process(thr->task);
fd_install(fd, file);
return fd;
err:
if (fd >= 0)
put_unused_fd(fd);
if (thr->task)
kthread_stop(thr->task);
return ret;
}
/* stdio_redirect */
static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
return stdio->input.buf.nr || stdio->done;
}
static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
{
return stdio->output.buf.nr || stdio->done;
}
#define STDIO_REDIRECT_BUFSIZE 4096
static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
{
return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
}
static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
{
return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
}
static void stdio_buf_init(struct stdio_buf *buf)
{
spin_lock_init(&buf->lock);
init_waitqueue_head(&buf->wait);
darray_init(&buf->buf);
}
/* thread_with_stdio */
static void thread_with_stdio_done(struct thread_with_stdio *thr)
{
thr->thr.done = true;
thr->stdio.done = true;
wake_up(&thr->stdio.input.wait);
wake_up(&thr->stdio.output.wait);
}
static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
struct stdio_buf *buf = &thr->stdio.output;
size_t copied = 0, b;
int ret = 0;
if (!(file->f_flags & O_NONBLOCK)) {
ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
if (ret)
return ret;
} else if (!stdio_redirect_has_output(&thr->stdio))
return -EAGAIN;
while (len && buf->buf.nr) {
if (fault_in_writeable(ubuf, len) == len) {
ret = -EFAULT;
break;
}
spin_lock_irq(&buf->lock);
b = min_t(size_t, len, buf->buf.nr);
if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
ubuf += b;
len -= b;
copied += b;
buf->buf.nr -= b;
memmove(buf->buf.data,
buf->buf.data + b,
buf->buf.nr);
}
spin_unlock_irq(&buf->lock);
}
return copied ?: ret;
}
static int thread_with_stdio_release(struct inode *inode, struct file *file)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
thread_with_stdio_done(thr);
bch2_thread_with_file_exit(&thr->thr);
darray_exit(&thr->stdio.input.buf);
darray_exit(&thr->stdio.output.buf);
thr->ops->exit(thr);
return 0;
}
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
struct stdio_buf *buf = &thr->stdio.input;
size_t copied = 0;
ssize_t ret = 0;
while (len) {
if (thr->thr.done) {
ret = -EPIPE;
break;
}
size_t b = len - fault_in_readable(ubuf, len);
if (!b) {
ret = -EFAULT;
break;
}
spin_lock(&buf->lock);
if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
darray_make_room_gfp(&buf->buf,
min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
b = min(len, darray_room(buf->buf));
if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
buf->buf.nr += b;
ubuf += b;
len -= b;
copied += b;
}
spin_unlock(&buf->lock);
if (b) {
wake_up(&buf->wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
ret = wait_event_interruptible(buf->wait,
stdio_redirect_has_input_space(&thr->stdio));
if (ret)
break;
}
}
return copied ?: ret;
}
static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
poll_wait(file, &thr->stdio.output.wait, wait);
poll_wait(file, &thr->stdio.input.wait, wait);
__poll_t mask = 0;
if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
if (stdio_redirect_has_input_space(&thr->stdio))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
poll_wait(file, &thr->stdio.output.wait, wait);
__poll_t mask = 0;
if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
return thr->thr.ret;
}
static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
if (thr->ops->unlocked_ioctl)
return thr->ops->unlocked_ioctl(thr, cmd, p);
return -ENOTTY;
}
static const struct file_operations thread_with_stdio_fops = {
.llseek = no_llseek,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
.flush = thread_with_stdio_flush,
.release = thread_with_stdio_release,
.unlocked_ioctl = thread_with_stdio_ioctl,
};
static const struct file_operations thread_with_stdout_fops = {
.llseek = no_llseek,
.read = thread_with_stdio_read,
.poll = thread_with_stdout_poll,
.flush = thread_with_stdio_flush,
.release = thread_with_stdio_release,
.unlocked_ioctl = thread_with_stdio_ioctl,
};
static int thread_with_stdio_fn(void *arg)
{
struct thread_with_stdio *thr = arg;
thr->thr.ret = thr->ops->fn(thr);
thread_with_stdio_done(thr);
return 0;
}
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
}
int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
}
EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
struct stdio_buf *buf = &stdio->input;
/*
* we're waiting on user input (or for the file descriptor to be
* closed), don't want a hung task warning:
*/
do {
wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
sysctl_hung_task_timeout_secs * HZ / 2);
} while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
spin_lock(&buf->lock);
int ret = min(len, buf->buf.nr);
buf->buf.nr -= ret;
memcpy(ubuf, buf->buf.data, ret);
memmove(buf->buf.data,
buf->buf.data + ret,
buf->buf.nr);
spin_unlock(&buf->lock);
wake_up(&buf->wait);
return ret;
}
int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
struct stdio_buf *buf = &stdio->input;
size_t copied = 0;
ssize_t ret = 0;
again:
do {
wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
sysctl_hung_task_timeout_secs * HZ / 2);
} while (!stdio_redirect_has_input(stdio));
if (stdio->done) {
ret = -1;
goto out;
}
spin_lock(&buf->lock);
size_t b = min(len, buf->buf.nr);
char *n = memchr(buf->buf.data, '\n', b);
if (n)
b = min_t(size_t, b, n + 1 - buf->buf.data);
buf->buf.nr -= b;
memcpy(ubuf, buf->buf.data, b);
memmove(buf->buf.data,
buf->buf.data + b,
buf->buf.nr);
ubuf += b;
len -= b;
copied += b;
spin_unlock(&buf->lock);
wake_up(&buf->wait);
if (!n && len)
goto again;
out:
return copied ?: ret;
}
__printf(3, 0)
static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
{
ssize_t ret;
do {
va_list args2;
size_t len;
va_copy(args2, args);
len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
va_end(args2);
if (len + 1 <= darray_room(*out)) {
out->nr += len;
return len;
}
ret = darray_make_room_gfp(out, len + 1, gfp);
} while (ret == 0);
return ret;
}
ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
const char *fmt, va_list args)
{
struct stdio_buf *buf = &stdio->output;
unsigned long flags;
ssize_t ret;
again:
spin_lock_irqsave(&buf->lock, flags);
ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
spin_unlock_irqrestore(&buf->lock, flags);
if (ret < 0) {
if (nonblocking)
return -EAGAIN;
ret = wait_event_interruptible(buf->wait,
stdio_redirect_has_output_space(stdio));
if (ret)
return ret;
goto again;
}
wake_up(&buf->wait);
return ret;
}
ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
const char *fmt, ...)
{
va_list args;
ssize_t ret;
va_start(args, fmt);
ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
va_end(args);
return ret;
}
#endif /* NO_BCACHEFS_FS */

View File

@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_THREAD_WITH_FILE_H
#define _BCACHEFS_THREAD_WITH_FILE_H
#include "thread_with_file_types.h"
/*
* Thread with file: Run a kthread and connect it to a file descriptor, so that
* it can be interacted with via fd read/write methods and closing the file
* descriptor stops the kthread.
*
* We have two different APIs:
*
* thread_with_file, the low level version.
* You get to define the full file_operations, including your release function,
* which means that you must call bch2_thread_with_file_exit() from your
* .release method
*
* thread_with_stdio, the higher level version
* This implements full piping of input and output, including .poll.
*
* Notes on behaviour:
* - kthread shutdown behaves like writing or reading from a pipe that has been
* closed
* - Input and output buffers are 4096 bytes, although buffers may in some
* situations slightly exceed that limit so as to avoid chopping off a
* message in the middle in nonblocking mode.
* - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
* should be fine but might change in future revisions.
* - Output buffer may grow past 4096 bytes to deal with messages that are
* bigger than 4096 bytes
* - Writing may be done blocking or nonblocking; in nonblocking mode, we only
* drop entire messages.
*
* To write, use bch2_stdio_redirect_printf()
* To read, use bch2_stdio_redirect_read() or bch2_stdio_redirect_readline()
*/
struct task_struct;
struct thread_with_file {
struct task_struct *task;
int ret;
bool done;
};
void bch2_thread_with_file_exit(struct thread_with_file *);
int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
struct thread_with_stdio;
struct thread_with_stdio_ops {
void (*exit)(struct thread_with_stdio *);
int (*fn)(struct thread_with_stdio *);
long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
};
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
const struct thread_with_stdio_ops *ops;
};
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_run_thread_with_stdout(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
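
A toy userspace model of the bounded-buffer behaviour described in the notes above: a nonblocking write takes a whole message or drops it entirely, and the reader drains whatever is buffered. The names and the 4096-byte bound mirror STDIO_REDIRECT_BUFSIZE, but nothing here is bcachefs API.

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define BUFSIZE 4096

struct toy_buf {
	char	data[BUFSIZE];
	size_t	nr;
};

/* nonblocking: whole message or -EAGAIN, never a partial message */
static int toy_write_nonblock(struct toy_buf *b, const char *msg)
{
	size_t len = strlen(msg);

	if (b->nr + len > sizeof(b->data))
		return -EAGAIN;

	memcpy(b->data + b->nr, msg, len);
	b->nr += len;
	return 0;
}

/* reader drains whatever is there, like thread_with_stdio_read() */
static size_t toy_read(struct toy_buf *b, char *out, size_t len)
{
	size_t n = len < b->nr ? len : b->nr;

	memcpy(out, b->data, n);
	b->nr -= n;
	memmove(b->data, b->data + n, b->nr);
	return n;
}

int main(void)
{
	struct toy_buf b = { .nr = 0 };
	char out[64];

	toy_write_nonblock(&b, "hello from the kthread\n");

	size_t n = toy_read(&b, out, sizeof(out) - 1);
	out[n] = '\0';
	fputs(out, stdout);
	return 0;
}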

View File

@ -0,0 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#include "darray.h"
struct stdio_buf {
spinlock_t lock;
wait_queue_head_t wait;
darray_char buf;
};
struct stdio_redirect {
struct stdio_buf input;
struct stdio_buf output;
spinlock_t input_lock;
wait_queue_head_t input_wait;
darray_char input_buf;
bool done;
};
#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */

libbcachefs/time_stats.c (new file, 165 lines)
View File

@ -0,0 +1,165 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/time.h>
#include <linux/spinlock.h>
#include "eytzinger.h"
#include "time_stats.h"
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "d", (u64) NSEC_PER_SEC * 3600 * 24},
{ "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
{ "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
{ "eon", U64_MAX },
};
const struct time_unit *bch2_pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
static void quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void time_stats_update_one(struct bch2_time_stats *stats,
u64 start, u64 end)
{
u64 duration, freq;
bool initted = stats->last_event != 0;
if (time_after64(end, start)) {
struct quantiles *quantiles = time_stats_to_quantiles(stats);
duration = end - start;
mean_and_variance_update(&stats->duration_stats, duration);
mean_and_variance_weighted_update(&stats->duration_stats_weighted,
duration, initted, TIME_STATS_MV_WEIGHT);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
if (quantiles)
quantiles_update(quantiles, duration);
}
if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted,
freq, initted, TIME_STATS_MV_WEIGHT);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
}
stats->last_event = end;
}
void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct time_stat_buffer *b)
{
for (struct time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
time_stats_update_one(stats, i->start, i->end);
b->nr = 0;
}
static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
struct time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);
if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
.start = start,
.end = end
};
if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
}
void bch2_time_stats_init(struct bch2_time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
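
As an aside, the unit-picking rule is easy to check in userspace: the loop advances to a larger unit only once the value is at least twice that unit's size. The table and loop below are copied (trimmed) from the new time_stats.c; main() and the NSEC_PER_* defines are just for the demo.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL
#define NSEC_PER_MSEC	1000000ULL
#define NSEC_PER_SEC	1000000000ULL
#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))

struct time_unit {
	const char	*name;
	uint64_t	nsecs;
};

static const struct time_unit time_units[] = {
	{ "ns",	1		},
	{ "us",	NSEC_PER_USEC	},
	{ "ms",	NSEC_PER_MSEC	},
	{ "s",	NSEC_PER_SEC	},
	{ "m",	NSEC_PER_SEC * 60	},
	{ "h",	NSEC_PER_SEC * 3600	},
};

static const struct time_unit *pick_time_units(uint64_t ns)
{
	const struct time_unit *u;

	for (u = time_units;
	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
	     ns >= u[1].nsecs << 1;
	     u++)
		;
	return u;
}

int main(void)
{
	uint64_t samples[] = { 750, 1500, 2500000, 3 * NSEC_PER_SEC, 90 * NSEC_PER_SEC };

	for (size_t i = 0; i < ARRAY_SIZE(samples); i++) {
		const struct time_unit *u = pick_time_units(samples[i]);

		printf("%12" PRIu64 " ns -> %" PRIu64 " %s\n",
		       samples[i], samples[i] / u->nsecs, u->name);
	}
	return 0;
}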

View File

@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* time_stats - collect statistics on events that have a duration, with nicely
* bch2_time_stats - collect statistics on events that have a duration, with nicely
* formatted textual output on demand
*
* - percpu buffering of event collection: cheap enough to shotgun
@ -21,14 +21,15 @@
*
* Particularly useful for tracking down latency issues.
*/
#ifndef _LINUX_TIME_STATS_H
#define _LINUX_TIME_STATS_H
#ifndef _BCACHEFS_TIME_STATS_H
#define _BCACHEFS_TIME_STATS_H
#include <linux/mean_and_variance.h>
#include <linux/sched/clock.h>
#include <linux/spinlock_types.h>
#include <linux/string.h>
#include "mean_and_variance.h"
struct time_unit {
const char *name;
u64 nsecs;
@ -37,12 +38,12 @@ struct time_unit {
/*
* given a nanosecond value, pick the preferred time units for printing:
*/
const struct time_unit *pick_time_units(u64 ns);
const struct time_unit *bch2_pick_time_units(u64 ns);
/*
* quantiles - do not use:
*
* Only enabled if time_stats->quantiles_enabled has been manually set - don't
* Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't
* use in new code.
*/
@ -66,7 +67,7 @@ struct time_stat_buffer {
} entries[31];
};
struct time_stats {
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
/* all fields are in nanoseconds */
@ -87,52 +88,50 @@ struct time_stats {
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
struct time_stat_buffer __percpu *buffer;
u64 start_time;
};
struct time_stats_quantiles {
struct time_stats stats;
struct bch2_time_stats_quantiles {
struct bch2_time_stats stats;
struct quantiles quantiles;
};
static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats)
static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
{
return stats->have_quantiles
? &container_of(stats, struct time_stats_quantiles, stats)->quantiles
? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
: NULL;
}
void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
void __time_stats_update(struct time_stats *stats, u64, u64);
void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
/**
* time_stats_update - collect a new event being tracked
*
* @stats - time_stats to update
* @stats - bch2_time_stats to update
* @start - start time of event, recorded with local_clock()
*
* The end duration of the event will be the current time
*/
static inline void time_stats_update(struct time_stats *stats, u64 start)
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__time_stats_update(stats, start, local_clock());
__bch2_time_stats_update(stats, start, local_clock());
}
/**
* track_event_change - track state change events
*
* @stats - time_stats to update
* @stats - bch2_time_stats to update
* @v - new state, true or false
*
* Use this when tracking time stats for state changes, i.e. resource X becoming
* blocked/unblocked.
*/
static inline bool track_event_change(struct time_stats *stats, bool v)
static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
{
if (v != !!stats->last_event_start) {
if (!v) {
time_stats_update(stats, stats->last_event_start);
bch2_time_stats_update(stats, stats->last_event_start);
stats->last_event_start = 0;
} else {
stats->last_event_start = local_clock() ?: 1;
@ -143,25 +142,18 @@ static inline bool track_event_change(struct time_stats *stats, bool v)
return false;
}
#define TIME_STATS_PRINT_NO_ZEROES (1U << 0) /* print nothing if zero count */
struct seq_buf;
void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *,
const char *epoch_name, unsigned int flags);
void time_stats_to_json(struct seq_buf *, struct time_stats *,
const char *epoch_name, unsigned int flags);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
void time_stats_exit(struct time_stats *);
void time_stats_init(struct time_stats *);
static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq)
static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
{
time_stats_exit(&statq->stats);
bch2_time_stats_exit(&statq->stats);
}
static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq)
static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
{
time_stats_init(&statq->stats);
bch2_time_stats_init(&statq->stats);
statq->stats.have_quantiles = true;
memset(&statq->quantiles, 0, sizeof(statq->quantiles));
}
#endif /* _LINUX_TIME_STATS_H */
#endif /* _BCACHEFS_TIME_STATS_H */

View File

@ -11,7 +11,6 @@
#include <linux/console.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/eytzinger.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/log2.h>
@ -23,8 +22,9 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
#include <linux/mean_and_variance.h>
#include "eytzinger.h"
#include "mean_and_variance.h"
#include "util.h"
static const char si_units[] = "?kMGTPEZY";
@ -339,14 +339,14 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
prt_tab_rjust(out);
@ -363,7 +363,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
#define TABSTOP_SIZE 12
void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
@ -374,7 +374,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
__bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
@ -469,7 +469,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
if (quantiles) {
int i = eytzinger0_first(NR_QUANTILES);
const struct time_unit *u =
pick_time_units(quantiles->entries[i].m);
bch2_pick_time_units(quantiles->entries[i].m);
u64 last_q = 0;
prt_printf(out, "quantiles (%s):\t", u->name);
@ -707,6 +707,149 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
static int alignment_ok(const void *base, size_t align)
{
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
((unsigned long)base & (align - 1)) == 0;
}
static void u32_swap(void *a, void *b, size_t size)
{
u32 t = *(u32 *)a;
*(u32 *)a = *(u32 *)b;
*(u32 *)b = t;
}
static void u64_swap(void *a, void *b, size_t size)
{
u64 t = *(u64 *)a;
*(u64 *)a = *(u64 *)b;
*(u64 *)b = t;
}
static void generic_swap(void *a, void *b, size_t size)
{
char t;
do {
t = *(char *)a;
*(char *)a++ = *(char *)b;
*(char *)b++ = t;
} while (--size > 0);
}
static inline int do_cmp(void *base, size_t n, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
size_t l, size_t r)
{
return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
size);
}
static inline void do_swap(void *base, size_t n, size_t size,
void (*swap_func)(void *, void *, size_t),
size_t l, size_t r)
{
swap_func(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
size);
}
void eytzinger0_sort(void *base, size_t n, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t))
{
int i, c, r;
if (!swap_func) {
if (size == 4 && alignment_ok(base, 4))
swap_func = u32_swap;
else if (size == 8 && alignment_ok(base, 8))
swap_func = u64_swap;
else
swap_func = generic_swap;
}
/* heapify */
for (i = n / 2 - 1; i >= 0; --i) {
for (r = i; r * 2 + 1 < n; r = c) {
c = r * 2 + 1;
if (c + 1 < n &&
do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
c++;
if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
break;
do_swap(base, n, size, swap_func, r, c);
}
}
/* sort */
for (i = n - 1; i > 0; --i) {
do_swap(base, n, size, swap_func, 0, i);
for (r = 0; r * 2 + 1 < i; r = c) {
c = r * 2 + 1;
if (c + 1 < i &&
do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
c++;
if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
break;
do_swap(base, n, size, swap_func, r, c);
}
}
}
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t size))
{
/* pre-scale counters for performance */
int i = (num/2 - 1) * size, n = num * size, c, r;
if (!swap_func) {
if (size == 4 && alignment_ok(base, 4))
swap_func = u32_swap;
else if (size == 8 && alignment_ok(base, 8))
swap_func = u64_swap;
else
swap_func = generic_swap;
}
/* heapify */
for ( ; i >= 0; i -= size) {
for (r = i; r * 2 + size < n; r = c) {
c = r * 2 + size;
if (c < n - size &&
cmp_func(base + c, base + c + size, size) < 0)
c += size;
if (cmp_func(base + r, base + c, size) >= 0)
break;
swap_func(base + r, base + c, size);
}
}
/* sort */
for (i = n - size; i > 0; i -= size) {
swap_func(base, base + i, size);
for (r = 0; r * 2 + size < i; r = c) {
c = r * 2 + size;
if (c < i - size &&
cmp_func(base + c, base + c + size, size) < 0)
c += size;
if (cmp_func(base + r, base + c, size) >= 0)
break;
swap_func(base + r, base + c, size);
}
}
}
#if 0
void eytzinger1_test(void)
{

View File

@ -5,21 +5,23 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/closure.h>
#include <linux/darray.h>
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/ratelimit.h>
#include <linux/sched/clock.h>
#include <linux/slab.h>
#include <linux/time_stats.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mean_and_variance.h>
#include "mean_and_variance.h"
#include "darray.h"
#include "time_stats.h"
struct closure;
@ -328,7 +330,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
#define ewma_add(ewma, val, weight) \
({ \
@ -629,6 +631,34 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos)))
#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
__array_insert_item(_array, _nr, _pos); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)
#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
static inline void __move_gap(void *array, size_t element_size,
size_t nr, size_t size,
size_t old_gap, size_t new_gap)
@ -743,4 +773,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
void bch2_darray_str_exit(darray_str *);
int bch2_split_devs(const char *, darray_str *);
#ifdef __KERNEL__
__must_check
static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
{
return copy_to_user(to, from, n) ? -EFAULT : 0;
}
__must_check
static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
{
return copy_from_user(to, from, n) ? -EFAULT : 0;
}
#endif
static inline void __set_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
}
#endif /* _BCACHEFS_UTIL_H */
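
As a quick illustration of the array helpers added to this header (not part of the patch; the names below are hypothetical): array_insert_item() memmoves the tail up one slot, bumps the count, and writes the new element, and array_remove_item() does the inverse.

/* Illustrative only, not part of the patch */
static void array_macros_example(void)
{
	u64	seqs[16];
	size_t	nr = 0;

	array_insert_item(seqs, nr, 0, 10);	/* seqs = { 10 },         nr = 1 */
	array_insert_item(seqs, nr, 1, 30);	/* seqs = { 10, 30 },     nr = 2 */
	array_insert_item(seqs, nr, 1, 20);	/* seqs = { 10, 20, 30 }, nr = 3 */

	array_remove_item(seqs, nr, 0);		/* seqs = { 20, 30 },     nr = 2 */
}
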

View File

@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
kfree(buf);
if (ret < 0)
return ret;
goto err_class_exit;
ret = bch2_opt_check_may_set(c, opt_id, v);
if (ret < 0)
return ret;
goto err_class_exit;
s.v = v + 1;
s.defined = true;
@ -595,6 +595,7 @@ err:
(opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
err_class_exit:
return bch2_err_class(ret);
}

View File

@ -5,7 +5,7 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>
#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
struct genradix_node {
@ -14,13 +14,13 @@ struct genradix_node {
struct genradix_node *children[GENRADIX_ARY];
/* Leaf: */
u8 data[PAGE_SIZE];
u8 data[GENRADIX_NODE_SIZE];
};
};
static inline int genradix_depth_shift(unsigned depth)
{
return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
/*
@ -33,7 +33,7 @@ static inline size_t genradix_depth_size(unsigned depth)
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
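
For concreteness (assuming 64-bit pointers): GENRADIX_NODE_SIZE is 512, so GENRADIX_ARY = 512 / 8 = 64, GENRADIX_ARY_SHIFT = 6, and genradix_depth_shift(depth) = 9 + 6 * depth; a single leaf covers 512 bytes of objects and each extra level multiplies that by 64. GENRADIX_MAX_DEPTH therefore becomes DIV_ROUND_UP(64 - 9, 6) = 10, where the old PAGE_SIZE-based layout (with 4 KiB pages) needed only DIV_ROUND_UP(64 - 12, 9) = 6 levels.
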
@ -79,23 +79,12 @@ EXPORT_SYMBOL(__genradix_ptr);
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
struct genradix_node *node;
node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO);
/*
* We're using pages (not slab allocations) directly for kernel data
* structures, so we need to explicitly inform kmemleak of them in order
* to avoid false positive memory leak reports.
*/
kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask);
return node;
return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}
static inline void genradix_free_node(struct genradix_node *node)
{
kmemleak_free(node);
free_page((unsigned long)node);
kfree(node);
}
/*
@ -200,7 +189,7 @@ restart:
i++;
iter->offset = round_down(iter->offset + objs_per_ptr,
objs_per_ptr);
iter->pos = (iter->offset >> PAGE_SHIFT) *
iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) *
objs_per_page;
if (i == GENRADIX_ARY)
goto restart;
@ -209,7 +198,7 @@ restart:
n = n->children[i];
}
return &n->data[iter->offset & (PAGE_SIZE - 1)];
return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek);
@ -235,7 +224,7 @@ restart:
if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
iter->offset = genradix_depth_size(level);
iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
iter->offset -= obj_size_plus_page_remainder;
iter->pos--;
@ -251,7 +240,7 @@ restart:
size_t objs_per_ptr = genradix_depth_size(level);
iter->offset = round_down(iter->offset, objs_per_ptr);
iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
if (!iter->offset)
return NULL;
@ -267,7 +256,7 @@ restart:
n = n->children[i];
}
return &n->data[iter->offset & (PAGE_SIZE - 1)];
return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek_prev);
@ -289,7 +278,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
{
size_t offset;
for (offset = 0; offset < size; offset += PAGE_SIZE)
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
return -ENOMEM;
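
A minimal usage sketch (illustrative, not part of the patch) of a genradix after this change: entries must now fit in GENRADIX_NODE_SIZE (512 bytes) rather than PAGE_SIZE, and nodes come from kzalloc() instead of whole pages. struct bucket_stats and genradix_example() are hypothetical.

/* Illustrative only, not part of the patch */
struct bucket_stats {
	u64	sectors_used;
	u64	sectors_dirty;
};

static int genradix_example(void)
{
	GENRADIX(struct bucket_stats) stats;
	struct bucket_stats *s;

	genradix_init(&stats);

	/* nodes along the path to index 1000 are kzalloc()ed on demand */
	s = genradix_ptr_alloc(&stats, 1000, GFP_KERNEL);
	if (!s) {
		genradix_free(&stats);
		return -ENOMEM;
	}
	s->sectors_used = 128;

	genradix_free(&stats);
	return 0;
}
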

View File

@ -277,92 +277,3 @@ void sort_r(void *base, size_t num, size_t size,
}
}
EXPORT_SYMBOL(sort_r);
#include <linux/eytzinger.h>
static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
cmp_r_func_t cmp_func, const void *priv,
size_t l, size_t r)
{
return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
cmp_func, priv);
}
static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
swap_r_func_t swap_func, const void *priv,
size_t l, size_t r)
{
do_swap(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
size, swap_func, priv);
}
void eytzinger0_sort_r(void *base, size_t n, size_t size,
cmp_r_func_t cmp_func,
swap_r_func_t swap_func,
const void *priv)
{
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
swap_func = NULL;
if (!swap_func) {
if (is_aligned(base, size, 8))
swap_func = SWAP_WORDS_64;
else if (is_aligned(base, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
for (i = n / 2 - 1; i >= 0; --i) {
for (r = i; r * 2 + 1 < n; r = c) {
c = r * 2 + 1;
if (c + 1 < n &&
eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
c++;
if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
break;
eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
}
}
/* sort */
for (i = n - 1; i > 0; --i) {
eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
for (r = 0; r * 2 + 1 < i; r = c) {
c = r * 2 + 1;
if (c + 1 < i &&
eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
c++;
if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
break;
eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
}
}
}
EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
void eytzinger0_sort(void *base, size_t n, size_t size,
cmp_func_t cmp_func,
swap_func_t swap_func)
{
struct wrapper w = {
.cmp = cmp_func,
.swap = swap_func,
};
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
EXPORT_SYMBOL_GPL(eytzinger0_sort);
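
For comparison, a hedged sketch (not part of the patch) of the _r variant added above, where the comparison callback receives a context pointer via priv; struct sort_ctx, cmp_u64_r() and sort_r_example() are hypothetical.

/* Illustrative only, not part of the patch */
struct sort_ctx {
	bool	reverse;
};

static int cmp_u64_r(const void *l, const void *r, const void *priv)
{
	const struct sort_ctx *ctx = priv;
	u64 a = *(const u64 *) l, b = *(const u64 *) r;
	int cmp = a < b ? -1 : a > b ? 1 : 0;

	return ctx->reverse ? -cmp : cmp;
}

static void sort_r_example(u64 *keys, size_t nr)
{
	struct sort_ctx ctx = { .reverse = false };

	/* NULL swap_func: an aligned word-sized swap is chosen internally */
	eytzinger0_sort_r(keys, nr, sizeof(*keys), cmp_u64_r, NULL, &ctx);
}
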

View File

@ -1,373 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/eytzinger.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/time.h>
#include <linux/time_stats.h>
#include <linux/spinlock.h>
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "d", (u64) NSEC_PER_SEC * 3600 * 24},
{ "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
{ "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
{ "eon", U64_MAX },
};
const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
EXPORT_SYMBOL_GPL(pick_time_units);
static void quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void time_stats_update_one(struct time_stats *stats,
u64 start, u64 end)
{
u64 duration, freq;
bool initted = stats->last_event != 0;
if (time_after64(end, start)) {
struct quantiles *quantiles = time_stats_to_quantiles(stats);
duration = end - start;
mean_and_variance_update(&stats->duration_stats, duration);
mean_and_variance_weighted_update(&stats->duration_stats_weighted,
duration, initted, TIME_STATS_MV_WEIGHT);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
if (quantiles)
quantiles_update(quantiles, duration);
}
if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted,
freq, initted, TIME_STATS_MV_WEIGHT);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
}
stats->last_event = end;
}
void __time_stats_clear_buffer(struct time_stats *stats,
struct time_stat_buffer *b)
{
for (struct time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
time_stats_update_one(stats, i->start, i->end);
b->nr = 0;
}
EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
static noinline void time_stats_clear_buffer(struct time_stats *stats,
struct time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);
if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
.start = start,
.end = end
};
if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
EXPORT_SYMBOL_GPL(__time_stats_update);
#include <linux/seq_buf.h>
static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
}
static inline u64 time_stats_lifetime(const struct time_stats *stats)
{
return local_clock() - stats->start_time;
}
void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats,
const char *epoch_name, unsigned int flags)
{
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
u64 f_stddev = 0, d_stddev = 0;
u64 lifetime = time_stats_lifetime(stats);
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
if (stats->freq_stats.n) {
/* avoid divide by zero */
f_mean = mean_and_variance_get_mean(stats->freq_stats);
f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
d_mean = mean_and_variance_get_mean(stats->duration_stats);
d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
/* unless we didn't want zeroes anyway */
return;
}
seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
seq_buf_printf(out, "lifetime: ");
seq_buf_time_units_aligned(out, lifetime);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " since %-12s recent\n", epoch_name);
seq_buf_printf(out, "duration of events\n");
seq_buf_printf(out, " min: ");
seq_buf_time_units_aligned(out, stats->min_duration);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " max: ");
seq_buf_time_units_aligned(out, stats->max_duration);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " total: ");
seq_buf_time_units_aligned(out, stats->total_duration);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " mean: ");
seq_buf_time_units_aligned(out, d_mean);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
seq_buf_printf(out, " stddev: ");
seq_buf_time_units_aligned(out, d_stddev);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
seq_buf_printf(out, "time between events\n");
seq_buf_printf(out, " min: ");
seq_buf_time_units_aligned(out, stats->min_freq);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " max: ");
seq_buf_time_units_aligned(out, stats->max_freq);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " mean: ");
seq_buf_time_units_aligned(out, f_mean);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
seq_buf_printf(out, " stddev: ");
seq_buf_time_units_aligned(out, f_stddev);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
if (quantiles) {
int i = eytzinger0_first(NR_QUANTILES);
const struct time_unit *u =
pick_time_units(quantiles->entries[i].m);
u64 last_q = 0;
seq_buf_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
if (is_last)
seq_buf_printf(out, "\n");
last_q = q;
}
}
}
EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
void time_stats_to_json(struct seq_buf *out, struct time_stats *stats,
const char *epoch_name, unsigned int flags)
{
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
if (stats->freq_stats.n) {
/* avoid divide by zero */
f_mean = mean_and_variance_get_mean(stats->freq_stats);
f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
d_mean = mean_and_variance_get_mean(stats->duration_stats);
d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
/* unless we didn't want zeroes anyway */
return;
}
seq_buf_printf(out, "{\n");
seq_buf_printf(out, " \"epoch\": \"%s\",\n", epoch_name);
seq_buf_printf(out, " \"count\": %llu,\n", stats->duration_stats.n);
seq_buf_printf(out, " \"duration_ns\": {\n");
seq_buf_printf(out, " \"min\": %llu,\n", stats->min_duration);
seq_buf_printf(out, " \"max\": %llu,\n", stats->max_duration);
seq_buf_printf(out, " \"total\": %llu,\n", stats->total_duration);
seq_buf_printf(out, " \"mean\": %llu,\n", d_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev);
seq_buf_printf(out, " },\n");
d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
seq_buf_printf(out, " \"duration_ewma_ns\": {\n");
seq_buf_printf(out, " \"mean\": %llu,\n", d_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev);
seq_buf_printf(out, " },\n");
seq_buf_printf(out, " \"between_ns\": {\n");
seq_buf_printf(out, " \"min\": %llu,\n", stats->min_freq);
seq_buf_printf(out, " \"max\": %llu,\n", stats->max_freq);
seq_buf_printf(out, " \"mean\": %llu,\n", f_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev);
seq_buf_printf(out, " },\n");
f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
seq_buf_printf(out, " \"between_ewma_ns\": {\n");
seq_buf_printf(out, " \"mean\": %llu,\n", f_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev);
if (quantiles) {
u64 last_q = 0;
/* close between_ewma_ns but signal more items */
seq_buf_printf(out, " },\n");
seq_buf_printf(out, " \"quantiles_ns\": [\n");
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
seq_buf_printf(out, " %llu", q);
if (!is_last)
seq_buf_printf(out, ", ");
last_q = q;
}
seq_buf_printf(out, " ]\n");
} else {
/* close between_ewma_ns without dumping further */
seq_buf_printf(out, " }\n");
}
seq_buf_printf(out, "}\n");
}
EXPORT_SYMBOL_GPL(time_stats_to_json);
void time_stats_exit(struct time_stats *stats)
{
free_percpu(stats->buffer);
}
EXPORT_SYMBOL_GPL(time_stats_exit);
void time_stats_init(struct time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
stats->start_time = local_clock();
spin_lock_init(&stats->lock);
}
EXPORT_SYMBOL_GPL(time_stats_init);
MODULE_AUTHOR("Kent Overstreet");
MODULE_LICENSE("GPL");
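
For reference, a hedged sketch (illustrative, not part of the patch) of how the API defined in this now-removed file was used; do_io() is a hypothetical workload, and the bcachefs-internal copy of this code is renamed bch2_time_stats, as the util.h hunks above show.

/* Illustrative only, not part of the patch */
static struct time_stats io_times;

static void time_stats_example(void)
{
	u64 start;

	time_stats_init(&io_times);

	start = local_clock();
	do_io();
	__time_stats_update(&io_times, start, local_clock());

	/*
	 * Once the event rate is high enough, __time_stats_update() switches
	 * to the per-cpu time_stat_buffer; time_stats_exit() frees it.
	 */
	time_stats_exit(&io_times);
}
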