Update bcachefs sources to 50847e296b34 bcachefs: Check subvol <-> inode pointers in check_inode()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2024-02-05 23:09:25 -05:00
parent 1ef396b684
commit f3f005c76e
85 changed files with 2248 additions and 1744 deletions

View File

@ -1 +1 @@
481b5f34324809f47a58ed798d038fb17e5b7b0a
50847e296b34efabe199e408ec4d72f10a866c39

View File

@ -273,11 +273,20 @@ update-bcachefs-sources:
git add include/linux/kmemleak.h
cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
git add linux/int_sqrt.c
git rm -f libbcachefs/mean_and_variance_test.c
# cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
# git add linux/mean_and_variance.c
# cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
# git add include/linux/mean_and_variance.h
cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
git add linux/mean_and_variance.c
cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
git add include/linux/mean_and_variance.h
cp $(LINUX_DIR)/lib/time_stats.c linux/
git add linux/time_stats.c
cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/
git add include/linux/time_stats.h
cp $(LINUX_DIR)/include/linux/darray.h include/linux/
git add include/linux/darray.h
cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/
git add include/linux/darray_types.h
cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/
git add include/linux/eytzinger.h
cp $(LINUX_DIR)/scripts/Makefile.compiler ./
git add Makefile.compiler
$(RM) libbcachefs/*.mod.c

View File

@ -23,12 +23,13 @@
#include "cmds.h"
#include "libbcachefs.h"
#include "crypto.h"
#include "libbcachefs/darray.h"
#include "libbcachefs/errcode.h"
#include "libbcachefs/opts.h"
#include "libbcachefs/super-io.h"
#include "libbcachefs/util.h"
#include "linux/darray.h"
#define OPTS \
x(0, replicas, required_argument) \
x(0, encrypted, no_argument) \

View File

@ -9,13 +9,14 @@
#include "libbcachefs/bcachefs_ioctl.h"
#include "libbcachefs/buckets.h"
#include "libbcachefs/darray.h"
#include "libbcachefs/opts.h"
#include "libbcachefs/super-io.h"
#include "cmds.h"
#include "libbcachefs.h"
#include "linux/darray.h"
static void __dev_usage_type_to_text(struct printbuf *out,
enum bch_data_type type,
unsigned bucket_size,

View File

@ -20,7 +20,7 @@
#include <linux/uuid.h>
#include "libbcachefs/bcachefs.h"
#include "libbcachefs/bbpos.h"
#include "libbcachefs/darray.h"
#include "linux/darray.h"
#define noreturn __attribute__((noreturn))

View File

@ -1,34 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DARRAY_H
#define _BCACHEFS_DARRAY_H
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#ifndef _LINUX_DARRAY_H
#define _LINUX_DARRAY_H
/*
* Dynamic arrays:
* Dynamic arrays
*
* Inspired by CCAN's darray
*/
#include <linux/darray_types.h>
#include <linux/slab.h>
#define DARRAY_PREALLOCATED(_type, _nr) \
struct { \
size_t nr, size; \
_type *data; \
_type preallocated[_nr]; \
}
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t);
static inline int __darray_resize(darray_char *d, size_t element_size,
size_t new_size, gfp_t gfp)
{
return unlikely(new_size > d->size)
? __bch2_darray_resize(d, element_size, new_size, gfp)
? __darray_resize_slowpath(d, element_size, new_size, gfp)
: 0;
}
@ -69,6 +61,28 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
#define darray_first(_d) ((_d).data[0])
#define darray_last(_d) ((_d).data[(_d).nr - 1])
/* Insert/remove items into the middle of a darray: */
#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)
#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
#define darray_insert_item(_d, pos, _item) \
({ \
size_t _pos = (pos); \
@ -79,10 +93,15 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
_ret; \
})
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
#define darray_remove_items(_d, _pos, _nr_to_remove) \
array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove)
#define __darray_for_each(_d, _i) \
#define darray_remove_item(_d, _pos) \
darray_remove_items(_d, _pos, 1)
/* Iteration: */
#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each(_d, _i) \
@ -106,4 +125,4 @@ do { \
darray_init(_d); \
} while (0)
#endif /* _BCACHEFS_DARRAY_H */
#endif /* _LINUX_DARRAY_H */
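
For orientation, a minimal usage sketch of the darray API now that the header lives in include/linux/. It is illustrative only and not part of this diff; darray_push() and darray_exit() are assumed from elsewhere in the same header, everything else appears in the hunks above.

#include <linux/darray.h>

/* a growable array of u64s with no preallocated elements */
typedef DARRAY(u64) darray_u64;

static int darray_demo(void)
{
	darray_u64 d = {};
	u64 *i;
	int ret = 0;

	/* append 0, 1, 2; darray_push() reallocates d.data as needed */
	for (u64 v = 0; v < 3 && !ret; v++)
		ret = darray_push(&d, v);

	/* insert 99 at index 0, shifting the existing elements up */
	if (!ret)
		ret = darray_insert_item(&d, 0, 99);

	/* walk the elements in order: 99, 0, 1, 2 */
	darray_for_each(d, i)
		pr_info("element %llu\n", *i);

	/* drop the element we just inserted at the front */
	if (d.nr)
		darray_remove_item(&d, &darray_first(d));

	darray_exit(&d);	/* frees d.data */
	return ret;
}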

View File

@ -0,0 +1,22 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#ifndef _LINUX_DARRAY_TYPES_H
#define _LINUX_DARRAY_TYPES_H
#include <linux/types.h>
#define DARRAY_PREALLOCATED(_type, _nr) \
struct { \
size_t nr, size; \
_type *data; \
_type preallocated[_nr]; \
}
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
#endif /* _LINUX_DARRAY_TYPES_H */

View File

@ -1,27 +1,37 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _EYTZINGER_H
#define _EYTZINGER_H
#ifndef _LINUX_EYTZINGER_H
#define _LINUX_EYTZINGER_H
#include <linux/bitops.h>
#include <linux/log2.h>
#include "util.h"
#ifdef EYTZINGER_DEBUG
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
#endif
/*
* Traversal for trees in eytzinger layout - a full binary tree laid out in an
* array
*/
/*
* One based indexing version:
* array.
*
* With one based indexing each level of the tree starts at a power of two -
* good for cacheline alignment:
* Consider using an eytzinger tree any time you would otherwise be doing binary
* search over an array. Binary search is a worst case scenario for branch
* prediction and prefetching, but in an eytzinger tree every node's children
* are adjacent in memory, thus we can prefetch children before knowing the
* result of the comparison, assuming multiple nodes fit on a cacheline.
*
* Two variants are provided, for one based indexing and zero based indexing.
*
* Zero based indexing is more convenient, but one based indexing has better
* alignment and thus better performance because each new level of the tree
* starts at a power of two, and thus if element 0 was cacheline aligned, each
* new level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
EBUG_ON(child > 1);
EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
EBUG_ON(i > size);
EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
EBUG_ON(i > size);
EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
EBUG_ON(!i || i > size);
EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
EBUG_ON(!i || i > size);
EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
EBUG_ON(child > 1);
EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search)
cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
} while (n < nr);
if (n & 1) {
@ -274,8 +282,8 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
_i; \
})
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
void eytzinger0_sort_r(void *, size_t, size_t,
cmp_r_func_t, swap_r_func_t, const void *);
void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
#endif /* _LINUX_EYTZINGER_H */
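
To make the comparator change above concrete, here is a small, illustrative sketch (not part of this diff) of searching an eytzinger-ordered array with the new cmp_func_t-based API; it assumes a NULL swap callback selects the default swap, as with sort():

#include <linux/eytzinger.h>

static int cmp_u64(const void *l, const void *r)
{
	u64 a = *(const u64 *) l, b = *(const u64 *) r;

	return a < b ? -1 : a > b ? 1 : 0;
}

static void eytzinger_demo(void)
{
	u64 v[] = { 40, 10, 30, 20, 50 };
	u64 search = 25;
	ssize_t idx;

	/* sort into zero based eytzinger order; NULL swap = default swap */
	eytzinger0_sort(v, ARRAY_SIZE(v), sizeof(v[0]), cmp_u64, NULL);

	/* greatest element <= 25: finds 20, or returns -1 if none */
	idx = eytzinger0_find_le(v, ARRAY_SIZE(v), sizeof(v[0]),
				 cmp_u64, &search);
	if (idx >= 0)
		pr_info("found %llu\n", v[idx]);
}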

View File

@ -17,7 +17,7 @@
* Rust and rustc has issues with u128.
*/
#if defined(__SIZEOF_INT128__) && defined(__KERNEL__)
#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
typedef struct {
unsigned __int128 v;
@ -154,8 +154,6 @@ struct mean_and_variance {
/* exponentially weighted variant */
struct mean_and_variance_weighted {
bool init;
u8 weight; /* base 2 logarithm */
s64 mean;
u64 variance;
};
@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
u64 mean_and_variance_get_variance(struct mean_and_variance s1);
u32 mean_and_variance_get_stddev(struct mean_and_variance s);
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
s64 v, bool initted, u8 weight);
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
u8 weight);
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
u8 weight);
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
u8 weight);
#endif // MEAN_AND_VARIANCE_H_
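
As a hedged sketch of the API change above - the weight and the "initialized" state now travel with the caller rather than living in struct mean_and_variance_weighted - this is how a caller might drive the weighted statistics. The demo function and sample values are made up; TIME_STATS_MV_WEIGHT in time_stats.h below uses a weight of 8.

#include <linux/mean_and_variance.h>

static void weighted_stats_demo(void)
{
	struct mean_and_variance_weighted w = {};
	bool initted = false;
	s64 samples[] = { 100, 120, 80, 110 };

	for (unsigned i = 0; i < ARRAY_SIZE(samples); i++) {
		/* weight is the base 2 logarithm of the decay factor */
		mean_and_variance_weighted_update(&w, samples[i], initted, 8);
		initted = true;
	}

	pr_info("weighted mean %lli stddev %u\n",
		mean_and_variance_weighted_get_mean(w, 8),
		mean_and_variance_weighted_get_stddev(w, 8));
}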

View File

@ -90,6 +90,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
(void *) size);
}
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kvfree(void *element, void *pool_data);
static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
{
return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
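
A short, illustrative sketch of the new kvmalloc-backed mempool helpers; the demo pool and sizes are made up, and the compression-bounce hunks later in this commit show the real call sites:

#include <linux/mempool.h>

static mempool_t demo_pool;

static int demo_pool_init(void)
{
	/* at least one 64k element, allocated with kvmalloc()/kvfree() */
	return mempool_init_kvmalloc_pool(&demo_pool, 1, 65536);
}

static void demo_pool_use(void)
{
	void *buf = mempool_alloc(&demo_pool, GFP_KERNEL);

	if (buf) {
		/* ... use the 64k buffer ... */
		mempool_free(buf, &demo_pool);
	}
}

static void demo_pool_exit(void)
{
	mempool_exit(&demo_pool);
}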
/*
* A mempool_alloc_t and mempool_free_t for a simple page allocator that
* allocates pages of the order specified by pool_data

View File

@ -1,65 +1 @@
#ifndef __TOOLS_LINUX_SPINLOCK_H
#define __TOOLS_LINUX_SPINLOCK_H
#include <linux/atomic.h>
#include <pthread.h>
typedef struct {
pthread_mutex_t lock;
} raw_spinlock_t;
#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER }
static inline void raw_spin_lock_init(raw_spinlock_t *lock)
{
pthread_mutex_init(&lock->lock, NULL);
}
static inline bool raw_spin_trylock(raw_spinlock_t *lock)
{
return !pthread_mutex_trylock(&lock->lock);
}
static inline void raw_spin_lock(raw_spinlock_t *lock)
{
pthread_mutex_lock(&lock->lock);
}
static inline void raw_spin_unlock(raw_spinlock_t *lock)
{
pthread_mutex_unlock(&lock->lock);
}
#define raw_spin_lock_irq(lock) raw_spin_lock(lock)
#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock)
#define raw_spin_lock_irqsave(lock, flags) \
do { \
flags = 0; \
raw_spin_lock(lock); \
} while (0)
#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
typedef raw_spinlock_t spinlock_t;
#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name)
#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
#define spin_lock_init(lock) raw_spin_lock_init(lock)
#define spin_lock(lock) raw_spin_lock(lock)
#define spin_unlock(lock) raw_spin_unlock(lock)
#define spin_lock_nested(lock, n) spin_lock(lock)
#define spin_lock_bh(lock) raw_spin_lock(lock)
#define spin_unlock_bh(lock) raw_spin_unlock(lock)
#define spin_lock_irq(lock) raw_spin_lock(lock)
#define spin_unlock_irq(lock) raw_spin_unlock(lock)
#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
#endif /* __TOOLS_LINUX_SPINLOCK_H */
#include "linux/spinlock_types.h"

View File

@ -0,0 +1,65 @@
#ifndef __TOOLS_LINUX_SPINLOCK_H
#define __TOOLS_LINUX_SPINLOCK_H
#include <linux/atomic.h>
#include <pthread.h>
typedef struct {
pthread_mutex_t lock;
} raw_spinlock_t;
#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER }
static inline void raw_spin_lock_init(raw_spinlock_t *lock)
{
pthread_mutex_init(&lock->lock, NULL);
}
static inline bool raw_spin_trylock(raw_spinlock_t *lock)
{
return !pthread_mutex_trylock(&lock->lock);
}
static inline void raw_spin_lock(raw_spinlock_t *lock)
{
pthread_mutex_lock(&lock->lock);
}
static inline void raw_spin_unlock(raw_spinlock_t *lock)
{
pthread_mutex_unlock(&lock->lock);
}
#define raw_spin_lock_irq(lock) raw_spin_lock(lock)
#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock)
#define raw_spin_lock_irqsave(lock, flags) \
do { \
flags = 0; \
raw_spin_lock(lock); \
} while (0)
#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
typedef raw_spinlock_t spinlock_t;
#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name)
#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
#define spin_lock_init(lock) raw_spin_lock_init(lock)
#define spin_lock(lock) raw_spin_lock(lock)
#define spin_unlock(lock) raw_spin_unlock(lock)
#define spin_lock_nested(lock, n) spin_lock(lock)
#define spin_lock_bh(lock) raw_spin_lock(lock)
#define spin_unlock_bh(lock) raw_spin_unlock(lock)
#define spin_lock_irq(lock) raw_spin_lock(lock)
#define spin_unlock_irq(lock) raw_spin_unlock(lock)
#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
#endif /* __TOOLS_LINUX_SPINLOCK_H */

View File

@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#ifndef _LINUX_THREAD_WITH_FILE_H
#define _LINUX_THREAD_WITH_FILE_H
struct stdio_redirect;
__printf(3, 0)
static inline void stdio_redirect_vprintf(struct stdio_redirect *s, bool nonblocking, const char *msg, va_list args) {}
__printf(3, 4)
static inline void stdio_redirect_printf(struct stdio_redirect *s, bool nonblocking, const char *msg, ...) {}
#endif /* _LINUX_THREAD_WITH_FILE_H */

View File

include/linux/time.h Normal file (0 lines)
View File

include/linux/time_stats.h Normal file (167 lines)
View File

@ -0,0 +1,167 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* time_stats - collect statistics on events that have a duration, with nicely
* formatted textual output on demand
*
* - percpu buffering of event collection: cheap enough to shotgun
* everywhere without worrying about overhead
*
* tracks:
* - number of events
* - maximum event duration ever seen
* - sum of all event durations
* - average event duration, standard and weighted
* - standard deviation of event durations, standard and weighted
* and analogous statistics for the frequency of events
*
* We provide both mean and weighted mean (exponentially weighted), and standard
* deviation and weighted standard deviation, to give an efficient-to-compute
* view of current behaviour versus average behaviour - "did this event source
* just become wonky, or is this typical?".
*
* Particularly useful for tracking down latency issues.
*/
#ifndef _LINUX_TIME_STATS_H
#define _LINUX_TIME_STATS_H
#include <linux/mean_and_variance.h>
#include <linux/sched/clock.h>
#include <linux/spinlock_types.h>
#include <linux/string.h>
struct time_unit {
const char *name;
u64 nsecs;
};
/*
* given a nanosecond value, pick the preferred time units for printing:
*/
const struct time_unit *pick_time_units(u64 ns);
/*
* quantiles - do not use:
*
* Only enabled if time_stats->quantiles_enabled has been manually set - don't
* use in new code.
*/
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
struct quantiles {
struct quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
struct time_stat_buffer {
unsigned nr;
struct time_stat_buffer_entry {
u64 start;
u64 end;
} entries[31];
};
struct time_stats {
spinlock_t lock;
bool have_quantiles;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
u64 last_event_start;
struct mean_and_variance duration_stats;
struct mean_and_variance freq_stats;
/* default weight for weighted mean and variance calculations */
#define TIME_STATS_MV_WEIGHT 8
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
struct time_stat_buffer __percpu *buffer;
u64 start_time;
};
struct time_stats_quantiles {
struct time_stats stats;
struct quantiles quantiles;
};
static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats)
{
return stats->have_quantiles
? &container_of(stats, struct time_stats_quantiles, stats)->quantiles
: NULL;
}
void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
void __time_stats_update(struct time_stats *stats, u64, u64);
/**
* time_stats_update - collect a new event being tracked
*
* @stats - time_stats to update
* @start - start time of event, recorded with local_clock()
*
* The end duration of the event will be the current time
*/
static inline void time_stats_update(struct time_stats *stats, u64 start)
{
__time_stats_update(stats, start, local_clock());
}
/**
* track_event_change - track state change events
*
* @stats - time_stats to update
* @v - new state, true or false
*
* Use this when tracking time stats for state changes, i.e. resource X becoming
* blocked/unblocked.
*/
static inline bool track_event_change(struct time_stats *stats, bool v)
{
if (v != !!stats->last_event_start) {
if (!v) {
time_stats_update(stats, stats->last_event_start);
stats->last_event_start = 0;
} else {
stats->last_event_start = local_clock() ?: 1;
return true;
}
}
return false;
}
#define TIME_STATS_PRINT_NO_ZEROES (1U << 0) /* print nothing if zero count */
struct seq_buf;
void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *,
const char *epoch_name, unsigned int flags);
void time_stats_to_json(struct seq_buf *, struct time_stats *,
const char *epoch_name, unsigned int flags);
void time_stats_exit(struct time_stats *);
void time_stats_init(struct time_stats *);
static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq)
{
time_stats_exit(&statq->stats);
}
static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq)
{
time_stats_init(&statq->stats);
statq->stats.have_quantiles = true;
memset(&statq->quantiles, 0, sizeof(statq->quantiles));
}
#endif /* _LINUX_TIME_STATS_H */
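
A minimal usage sketch of the relocated time_stats API (the demo function names and the work being timed are illustrative):

#include <linux/time_stats.h>

static struct time_stats demo_stats;

static void timed_operation(void)
{
	u64 start = local_clock();

	/* ... the work being measured ... */

	/* records the duration of this call and the time since the last one */
	time_stats_update(&demo_stats, start);
}

static int demo_init(void)
{
	time_stats_init(&demo_stats);
	return 0;
}

static void demo_exit(void)
{
	time_stats_exit(&demo_stats);
}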

View File

@ -8,6 +8,7 @@
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/posix_types.h>
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
#include <asm/types.h>
@ -77,6 +78,10 @@ typedef __u64 __bitwise __be64;
typedef u64 sector_t;
typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
typedef void (*swap_func_t)(void *a, void *b, int size);
typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
typedef int (*cmp_func_t)(const void *a, const void *b);
typedef unsigned int __bitwise slab_flags_t;

View File

@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
&c->blocked_allocate_open_bucket, true);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
&c->blocked_allocate_open_bucket, false);
track_event_change(&c->times[BCH_TIME_blocked_allocate],
&c->blocked_allocate, false);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
spin_unlock(&c->freelist_lock);
return ob;
@ -555,8 +551,7 @@ again:
goto again;
}
track_event_change(&c->times[BCH_TIME_blocked_allocate],
&c->blocked_allocate, true);
track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;

View File

@ -200,6 +200,8 @@
#include <linux/seqlock.h>
#include <linux/shrinker.h>
#include <linux/srcu.h>
#include <linux/thread_with_file_types.h>
#include <linux/time_stats.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
@ -465,7 +467,6 @@ enum bch_time_stats {
#include "replicas_types.h"
#include "subvolume_types.h"
#include "super_types.h"
#include "thread_with_file_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@ -593,7 +594,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
struct bch2_time_stats io_latency[2];
struct time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@ -640,8 +641,8 @@ struct btree_debug {
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
struct bch2_time_stats duration;
struct bch2_time_stats lock_hold_times;
struct time_stats duration;
struct time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
unsigned journal_entries_size;
@ -919,8 +920,6 @@ struct bch_fs {
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
u64 blocked_allocate;
u64 blocked_allocate_open_bucket;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@ -1104,7 +1103,7 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];

View File

@ -1275,7 +1275,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10) \
x(write_buffer_keys, 11)
x(write_buffer_keys, 11) \
x(datetime, 12)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -1376,6 +1377,11 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
struct jset_entry_datetime {
struct jset_entry entry;
__le64 seconds;
} __packed __aligned(8);
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique

View File

@ -379,7 +379,7 @@ struct bch_ioctl_disk_resize_journal {
struct bch_ioctl_subvolume {
__u32 flags;
__s32 dirfd;
__u32 dirfd;
__u16 mode;
__u16 pad[3];
__u64 dst_ptr;

View File

@ -9,12 +9,12 @@
#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "trace.h"
#include "util.h"
#include <asm/unaligned.h>
#include <linux/console.h>
#include <linux/eytzinger.h>
#include <linux/random.h>
#include <linux/prefetch.h>

View File

@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
kvpfree(b->data, btree_buf_bytes(b));
kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
b->data = kvpmalloc(btree_buf_bytes(b), gfp);
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
kvpfree(b->data, btree_buf_bytes(b));
kvfree(b->data);
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
kvpfree(c->verify_ondisk, c->opts.btree_node_size);
kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@ -648,7 +648,7 @@ out:
bch2_btree_keys_init(b);
set_btree_node_accessed(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
memalloc_nofs_restore(flags);
@ -711,6 +711,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
if (!path)
return b;
trans->memory_allocation_failure = true;
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@ -760,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
if (path)
trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
BUG_ON(!path);
trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
@ -1096,7 +1100,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
BUG_ON(trans && !btree_node_locked(path, level + 1));
BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);

View File

@ -389,7 +389,8 @@ again:
have_child = dropped_children = false;
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@ -478,7 +479,8 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
@ -931,7 +933,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
bch2_bkey_buf_init(&prev);
bch2_bkey_buf_init(&cur);
bkey_init(&prev.k->k);
@ -963,7 +965,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (b->c.level > target_depth) {
bch2_btree_and_journal_iter_exit(&iter);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
struct btree *child;
@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
@ -1491,7 +1492,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
@ -1970,7 +1971,7 @@ int bch2_gc_gens(struct bch_fs *c)
c->gc_count++;
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
err:
for_each_member_device(c, ca) {

View File

@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
vpfree(p, size);
kvfree(p);
}
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
if (sorting_entire_node)
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);
/* Make sure we preserve bset journal_seq: */
@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
&dst->format,
true);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);
set_btree_bset_end(dst, dst->set);
@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
out:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@ -1323,7 +1323,7 @@ start:
}
}
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);

View File

@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret = 0;
__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
path = &trans->paths[path_idx];
if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@ -2899,7 +2899,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
time_after64(now, trans->last_begin_time + 10))
__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
__time_stats_update(&btree_trans_stats(trans)->duration,
trans->last_begin_time, now);
if (!trans->restarted &&
@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
kfree(s->max_paths_text);
bch2_time_stats_exit(&s->lock_hold_times);
time_stats_exit(&s->lock_hold_times);
}
if (c->btree_trans_barrier_initialized)
@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
bch2_time_stats_init(&s->duration);
bch2_time_stats_init(&s->lock_hold_times);
time_stats_init(&s->duration);
time_stats_init(&s->lock_hold_times);
mutex_init(&s->lock);
}

View File

@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
@ -334,9 +336,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
iter->pos = bpos_successor(iter->pos);
}
static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
struct btree_and_journal_iter iter = *_iter;
struct bch_fs *c = iter.trans->c;
unsigned level = iter.journal.level;
struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (level > 1 ? 0 : 2)
: (level > 1 ? 1 : 16);
iter.prefetch = false;
bch2_bkey_buf_init(&tmp);
while (nr--) {
bch2_btree_and_journal_iter_advance(&iter);
struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
if (!k.k)
break;
bch2_bkey_buf_reassemble(&tmp, c, k);
bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
}
bch2_bkey_buf_exit(&tmp, c);
}
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k, ret;
if (iter->prefetch && iter->journal.level)
btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
@ -376,17 +407,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct bch_fs *c,
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
@ -396,15 +428,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
* this version is used by btree_gc before filesystem has gone RW and
* multithreaded, so uses the journal_iters list:
*/
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct bch_fs *c,
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
list_add(&iter->journal.list, &c->journal_iters);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@ -415,9 +447,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
struct genradix_iter iter;
genradix_for_each(&c->journal_entries, iter, i)
if (*i)
kvpfree(*i, offsetof(struct journal_replay, j) +
vstruct_bytes(&(*i)->j));
kvfree(*i);
genradix_free(&c->journal_entries);
}

View File

@ -15,6 +15,7 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
struct btree_trans *trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@ -22,6 +23,7 @@ struct btree_and_journal_iter {
struct journal_iter journal;
struct bpos pos;
bool at_end;
bool prefetch;
};
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
struct btree_and_journal_iter *);
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *, struct btree *,
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
struct btree_and_journal_iter *, struct btree *,
struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
struct btree_and_journal_iter *, struct btree *);
void bch2_journal_keys_put(struct bch_fs *);

View File

@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
__bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
__time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
path->l[level].lock_taken_time,
local_clock());
#endif

View File

@ -2,12 +2,12 @@
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H
#include <linux/darray_types.h>
#include <linux/list.h>
#include <linux/rhashtable.h>
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
#include "journal_types.h"
#include "replicas_types.h"

View File

@ -14,6 +14,8 @@
#include "snapshot.h"
#include "trace.h"
#include <linux/darray.h>
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{

View File

@ -516,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
bch2_disk_reservation_put(c, &as->disk_res);
bch2_btree_reserve_put(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);
mutex_lock(&c->btree_interior_update_lock);
@ -1038,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
start_time);
}
@ -1629,7 +1629,7 @@ out:
bch2_trans_verify_locks(trans);
bch2_time_stats_update(&c->times[n2
time_stats_update(&c->times[n2
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact],
start_time);
@ -1935,7 +1935,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_done(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
if (new_path)
@ -2484,7 +2484,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
c->btree_interior_update_worker =
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#include "darray.h"
#include <linux/darray_types.h>
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4

View File

@ -1335,7 +1335,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
kvfree(buckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@ -1345,16 +1345,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO))) {
if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
if ((c->opts.buckets_nouse &&
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)))) {
!(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)))) {
ret = -BCH_ERR_ENOMEM_buckets_nouse;
goto err;
}
@ -1397,8 +1397,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
@ -1407,27 +1406,21 @@ err:
void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
kvfree(ca->buckets_nouse);
kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
sizeof(struct bucket_gens) + ca->mi.nbuckets);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -BCH_ERR_ENOMEM_usage_init;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -BCH_ERR_ENOMEM_usage_init;

View File

@ -11,7 +11,6 @@
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "thread_with_file.h"
#include <linux/cdev.h>
#include <linux/device.h>
@ -20,6 +19,7 @@
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/thread_with_file.h>
#include <linux/uaccess.h>
__must_check
@ -155,17 +155,14 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
kfree(thr);
}
static int bch2_fsck_offline_thread_fn(void *arg)
static void bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
if (!thr->thr.thr.ret)
bch2_fs_stop(c);
thread_with_stdio_done(&thr->thr);
return 0;
}
static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
@ -220,7 +217,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
ret = bch2_run_thread_with_stdio(&thr->thr,
ret = run_thread_with_stdio(&thr->thr,
bch2_fsck_thread_exit,
bch2_fsck_offline_thread_fn);
err:
@ -425,7 +422,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
{
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
bch2_thread_with_file_exit(&ctx->thr);
thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@ -475,7 +472,7 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
ret = bch2_run_thread_with_file(&ctx->thr,
ret = run_thread_with_file(&ctx->thr,
&bcachefs_data_ops,
bch2_data_thread);
if (ret < 0)
@ -763,9 +760,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
static int bch2_fsck_online_thread_fn(void *arg)
static void bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = thr->c;
c->stdio_filter = current;
@ -793,11 +790,8 @@ static int bch2_fsck_online_thread_fn(void *arg)
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
thread_with_stdio_done(&thr->thr);
up(&c->online_fsck_mutex);
bch2_ro_ref_put(c);
return 0;
}
static long bch2_ioctl_fsck_online(struct bch_fs *c,
@ -840,7 +834,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
goto err;
}
ret = bch2_run_thread_with_stdio(&thr->thr,
ret = run_thread_with_stdio(&thr->thr,
bch2_fsck_thread_exit,
bch2_fsck_online_thread_fn);
err:

View File

@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return 0;
if (!mempool_initialized(&c->compression_bounce[READ]) &&
mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_read_init;
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
if (mempool_init_kvpmalloc_pool(
if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
if (!mempool_initialized(&c->decompress_workspace) &&
mempool_init_kvpmalloc_pool(&c->decompress_workspace,
1, decompress_workspace_size))
mempool_init_kvmalloc_pool(&c->decompress_workspace,
1, decompress_workspace_size))
return -BCH_ERR_ENOMEM_decompression_workspace_init;
return 0;

View File

@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
kvpfree(n_ondisk, btree_buf_bytes(b));
kvfree(n_ondisk);
percpu_ref_put(&ca->io_ref);
}

View File

@ -219,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.inode = dir;
dirent->k.p.snapshot = snapshot;
ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
zero_inum, snapshot,
&dirent->k_i, str_hash_flags,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
zero_inum, snapshot,
&dirent->k_i, str_hash_flags,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
*dir_offset = dirent->k.p.offset;
return ret;
@ -293,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
unsigned src_update_flags = 0;
bool delete_src, delete_dst;
int ret = 0;
if (src_dir.subvol != dst_dir.subvol)
return -EXDEV;
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
@ -319,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
src_type = bkey_s_c_to_dirent(old_src).v->d_type;
if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
return -EOPNOTSUPP;
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
@ -352,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
if (dst_type == DT_SUBVOL)
return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
@ -426,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
if (new_dst->v.d_type == DT_SUBVOL)
new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
if ((mode == BCH_RENAME_EXCHANGE) &&
new_src->v.d_type == DT_SUBVOL)
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
out_set_src:
/*
* If we're deleting a subvolume, we need to really delete the dirent,
* not just emit a whiteout in the current snapshot:
* If we're deleting a subvolume we need to really delete the dirent,
* not just emit a whiteout in the current snapshot - there can only be
* single dirent that points to a given subvolume.
*
* IOW, we don't maintain multiple versions in different snapshots of
* dirents that point to subvolumes - dirents that point to subvolumes
* are only visible in one particular subvolume so it's not necessary,
* and it would be particularly confusing for fsck to have to deal with.
*/
if (src_type == DT_SUBVOL) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter);
delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
new_src->k.p.snapshot != old_src.k->p.snapshot;
delete_dst = old_dst.k &&
bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
new_dst->k.p.snapshot != old_dst.k->p.snapshot;
if (!delete_src || !bkey_deleted(&new_src->k)) {
ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
new_src->k.p = src_iter.pos;
src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
}
ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
if (delete_src) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter) ?:
bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
if (ret)
goto out;
}
if (delete_dst) {
bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
ret = bch2_btree_iter_traverse(&dst_iter) ?:
bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
if (ret)
goto out;
}
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
@ -458,41 +472,29 @@ out:
return ret;
}
int __bch2_dirent_lookup_trans(struct btree_trans *trans,
struct btree_iter *iter,
subvol_inum dir,
const struct bch_hash_info *hash_info,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
int bch2_dirent_lookup_trans(struct btree_trans *trans,
struct btree_iter *iter,
subvol_inum dir,
const struct bch_hash_info *hash_info,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
struct bkey_s_c k;
struct bkey_s_c_dirent d;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir, name, flags);
if (ret)
return ret;
ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir, name, flags);
if (ret)
return ret;
k = bch2_btree_iter_peek_slot(iter);
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, dir, d, inum);
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
if (ret > 0)
ret = -ENOENT;
err:
if (ret)
bch2_trans_iter_exit(trans, iter);
return ret;
}
@ -504,7 +506,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
struct btree_iter iter = { NULL };
int ret = lockrestart_do(trans,
__bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;

View File

@ -62,7 +62,7 @@ int bch2_dirent_rename(struct btree_trans *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,

View File

@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
unsigned i;
for (i = 0; i < s->v.nr_blocks; i++) {
kvpfree(buf->data[i], buf->size << 9);
kvfree(buf->data[i]);
buf->data[i] = NULL;
}
}
@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
memset(buf->valid, 0xFF, sizeof(buf->valid));
for (i = 0; i < v->nr_blocks; i++) {
buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
if (!buf->data[i])
goto err;
}

View File

@ -176,6 +176,8 @@
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
x(EINVAL, opt_parse_error) \
x(EINVAL, remove_with_metadata_missing_unimplemented)\
x(EINVAL, remove_would_lose_data) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \

View File

@ -2,7 +2,7 @@
#include "bcachefs.h"
#include "error.h"
#include "super.h"
#include "thread_with_file.h"
#include <linux/thread_with_file.h>
#define FSCK_ERR_RATELIMIT_NR 10
@ -105,7 +105,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
do {
bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
if (r < 0)
return YN_NO;
buf[r] = '\0';

View File

@ -24,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
(fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
(fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
kvpfree((fifo)->data, fifo_buf_size(fifo)); \
kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)

View File

@ -260,8 +260,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_INTENT);
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@ -410,6 +410,21 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
/* Can't move across subvolumes, unless it's a subvolume root: */
if (src_dir.subvol != dst_dir.subvol &&
(!src_inode_u->bi_subvol ||
(dst_inum.inum && !dst_inode_u->bi_subvol))) {
ret = -EXDEV;
goto err;
}
if (src_inode_u->bi_parent_subvol)
src_inode_u->bi_parent_subvol = dst_dir.subvol;
if ((mode == BCH_RENAME_EXCHANGE) &&
dst_inode_u->bi_parent_subvol)
dst_inode_u->bi_parent_subvol = src_dir.subvol;
src_inode_u->bi_dir = dst_dir_u->bi_inum;
src_inode_u->bi_dir_offset = dst_offset;

View File

@ -455,6 +455,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
if (IS_ERR(victim))
return PTR_ERR(victim);
dir = d_inode(path.dentry);
if (victim->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
@ -463,14 +464,13 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
ret = -ENOENT;
goto err;
}
dir = d_inode(path.dentry);
ret = __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
d_delete(victim);
}
inode_unlock(dir);
err:
inode_unlock(dir);
dput(victim);
path_put(&path);
return ret;

View File

@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
struct btree_trans *trans;
struct bch_subvolume subvol;
int ret;
subvol_inum inum = inode_inum(inode);
struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
BUG_ON(!old);
inode = to_bch_ei(iget5_locked(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->v.i_state & I_NEW))
return &inode->v;
trans = bch2_trans_get(c);
ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
if (!ret)
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
bch2_trans_put(trans);
if (ret) {
iget_failed(&inode->v);
return ERR_PTR(bch2_err_class(ret));
if (unlikely(old != inode)) {
discard_new_inode(&inode->v);
inode = old;
} else {
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
* we really don't want insert_inode_locked2() to be setting
* I_NEW...
*/
unlock_new_inode(&inode->v);
}
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
return inode;
}
unlock_new_inode(&inode->v);
#define memalloc_flags_do(_flags, _do) \
({ \
unsigned _saved_flags = memalloc_flags_save(_flags); \
typeof(_do) _ret = _do; \
memalloc_noreclaim_restore(_saved_flags); \
_ret; \
})
return &inode->v;
/*
* Allocate a new inode, dropping/retaking btree locks if necessary:
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct bch_inode_info *inode =
memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
to_bch_ei(new_inode(c->vfs_sb)));
if (unlikely(!inode)) {
int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
if (ret && inode)
discard_new_inode(&inode->v);
if (ret)
return ERR_PTR(ret);
}
return inode;
}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_info *inode =
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
if (inode)
return &inode->v;
struct btree_trans *trans = bch2_trans_get(c);
struct bch_inode_unpacked inode_u;
struct bch_subvolume subvol;
int ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (!ret) {
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
}
bch2_trans_put(trans);
return ret ? ERR_PTR(ret) : &inode->v;
}
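The lookup path above chains its int-returning helpers with the GNU C "a ?: b" operator: each helper returns 0 on success or a negative error, so evaluation short-circuits at the first failure and that value becomes ret. A minimal sketch of just the idiom, with hypothetical helper names (not bcachefs functions):

	#include <linux/errno.h>

	static int step_one(void)   { return 0; }
	static int step_two(void)   { return -ENOMEM; }
	static int step_three(void) { return 0; }

	static int run_steps(void)
	{
		/* stops at step_two(); step_three() is never evaluated */
		return step_one() ?: step_two() ?: step_three();
	}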
struct bch_inode_info *
@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap,
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
struct bch_inode_info *inode, *old;
struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
@ -293,7 +336,6 @@ err_before_quota:
mutex_unlock(&dir->ei_update_lock);
}
bch2_iget5_set(&inode->v, &inum);
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@ -304,36 +346,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*/
inode->v.i_state |= I_CREATING;
old = to_bch_ei(inode_insert5(&inode->v,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
BUG_ON(!old);
if (unlikely(old != inode)) {
/*
* We raced, another process pulled the new inode into cache
* before us:
*/
make_bad_inode(&inode->v);
iput(&inode->v);
inode = old;
} else {
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
* we really don't want insert_inode_locked2() to be setting
* I_NEW...
*/
unlock_new_inode(&inode->v);
}
inode = bch2_inode_insert(c, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@ -352,23 +365,78 @@ err_trans:
/* methods */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
subvol_inum dir, struct bch_hash_info *dir_hash_info,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
struct btree_iter dirent_iter = {};
subvol_inum inum = {};
int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
dir_hash_info, dir, name, 0);
if (ret)
return ERR_PTR(ret);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
ret = bkey_err(k);
if (ret)
goto err;
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
goto err;
struct bch_inode_info *inode =
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
if (inode)
goto out;
struct bch_subvolume subvol;
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (bch2_err_matches(ret, ENOENT)) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err(c, "%s points to missing inode", buf.buf);
printbuf_exit(&buf);
}
if (ret)
goto err;
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
return inode;
err:
inode = ERR_PTR(ret);
goto out;
}
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct inode *vinode = NULL;
subvol_inum inum = { .subvol = 1 };
int ret;
ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
&dentry->d_name, &inum);
struct bch_inode_info *inode;
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
&hash, &dentry->d_name)));
if (IS_ERR(inode))
inode = NULL;
if (!ret)
vinode = bch2_vfs_inode_get(c, inum);
return d_splice_alias(vinode, dentry);
return d_splice_alias(&inode->v, dentry);
}
static int bch2_mknod(struct mnt_idmap *idmap,
@ -1371,6 +1439,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
if (BCH_SUBVOLUME_SNAP(subvol))
@ -1571,7 +1640,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
* number:
*/
u64 avail_inodes = ((usage.capacity - usage.used) << 3);
u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
@ -1582,10 +1650,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = usage.nr_inodes + avail_inodes;
buf->f_ffree = avail_inodes;
fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
@ -1881,6 +1946,7 @@ got_sb:
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
sb->s_uuid = c->sb.user_uuid;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));

View File

@ -5,7 +5,6 @@
#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs-common.h"
@ -18,6 +17,7 @@
#include "xattr.h"
#include <linux/bsearch.h>
#include <linux/darray.h>
#include <linux/dcache.h> /* struct qstr */
/*
@ -100,8 +100,8 @@ err:
}
static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
struct bch_inode_unpacked *inode,
u32 *snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@ -123,17 +123,15 @@ err:
return ret;
}
static int __lookup_dirent(struct btree_trans *trans,
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
u64 *target, unsigned *type)
u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c_dirent d;
int ret;
ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
&hash_info, dir, name, 0);
int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
&hash_info, dir, name, 0, snapshot);
if (ret)
return ret;
@ -144,34 +142,6 @@ static int __lookup_dirent(struct btree_trans *trans,
return 0;
}
static int __write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
struct bkey_inode_buf *inode_p =
bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode);
inode_p->inode.k.p.snapshot = snapshot;
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&inode_p->inode.k_i,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}
static int fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__write_inode(trans, inode, snapshot));
bch_err_fn(trans->c, ret);
return ret;
}
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
@ -224,15 +194,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot);
u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
bch_err_msg(c, ret, "looking up root inode");
if (ret)
return ret;
root_hash_info = bch2_hash_info_init(c, &root_inode);
ret = __lookup_dirent(trans, root_hash_info, root_inum,
&lostfound_str, &inum, &d_type);
ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
&lostfound_str, &inum, &d_type, snapshot);
if (bch2_err_matches(ret, ENOENT))
goto create_lostfound;
@ -250,7 +221,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
* shouldn't exist here:
*/
ret = lookup_inode(trans, inum, lostfound, &snapshot);
bch_err_msg(c, ret, "looking up lost+found");
bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
return ret;
create_lostfound:
@ -312,7 +284,7 @@ static int reattach_inode(struct btree_trans *trans,
if (S_ISDIR(inode->bi_mode)) {
lostfound.bi_nlink++;
ret = __write_inode(trans, &lostfound, U32_MAX);
ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
if (ret)
return ret;
}
@ -334,7 +306,7 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
return __write_inode(trans, inode, inode_snapshot);
return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
}
static int remove_backpointer(struct btree_trans *trans,
@ -722,7 +694,7 @@ static int hash_redo_key(struct btree_trans *trans,
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
bch2_hash_set_snapshot(trans, desc, hash_info,
bch2_hash_set_in_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
@ -861,7 +833,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
ret = __write_inode(trans, &u, iter->pos.snapshot);
ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@ -950,8 +923,33 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
if (u.bi_subvol) {
struct bch_subvolume s;
ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret,
c, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
k.k->p.snapshot),
c, inode_bi_subvol_wrong,
"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol,
le64_to_cpu(s.inode),
le32_to_cpu(s.snapshot))) {
u.bi_subvol = 0;
u.bi_parent_subvol = 0;
do_update = true;
}
}
if (do_update) {
ret = __write_inode(trans, &u, iter->pos.snapshot);
ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@ -1032,7 +1030,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
ret = fsck_write_inode(trans, &i->inode, i->snapshot);
ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
@ -1481,7 +1479,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
ret = fsck_write_inode(trans, &i->inode, i->snapshot);
ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
@ -1491,16 +1489,15 @@ fsck_err:
return ret ?: trans_was_restarted(trans, restart_count);
}
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
struct bch_inode_unpacked *target,
u32 target_snapshot)
static int check_inode_backpointer(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
struct bch_inode_unpacked *target,
u32 target_snapshot)
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
struct btree_iter bp_iter = { NULL };
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!target->bi_dir &&
@ -1508,7 +1505,7 @@ static int check_dirent_target(struct btree_trans *trans,
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
ret = __write_inode(trans, target, target_snapshot);
ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
@ -1548,7 +1545,7 @@ static int check_dirent_target(struct btree_trans *trans,
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_unlinked;
ret = __write_inode(trans, target, target_snapshot);
ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
@ -1566,11 +1563,34 @@ static int check_dirent_target(struct btree_trans *trans,
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
ret = __write_inode(trans, target, target_snapshot);
ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
}
out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
struct bch_inode_unpacked *target,
u32 target_snapshot)
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
int ret = 0;
ret = check_inode_backpointer(trans, iter, d, target, target_snapshot);
if (ret)
goto err;
if (fsck_err_on(d.v->d_type != inode_d_type(target),
c, dirent_d_type_wrong,
@ -1614,15 +1634,65 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_dirent d)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked subvol_root;
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 target_snapshot;
u64 target_inum;
int ret = 0;
ret = subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (fsck_err_on(ret, c, dirent_to_missing_subvol,
"dirent points to missing subvolume %u",
le32_to_cpu(d.v->d_child_subvol)))
return __remove_dirent(trans, d.k->p);
ret = lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
bch_err(c, "repair not implemented yet");
return -EINVAL;
}
if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
c, subvol_root_wrong_bi_subvol,
"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
target_inum,
subvol_root.bi_subvol, target_subvol)) {
subvol_root.bi_subvol = target_subvol;
ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
if (ret)
return ret;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
return ret;
fsck_err:
return ret;
}
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct bch_hash_info *hash_info,
@ -1707,50 +1777,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
struct bch_inode_unpacked subvol_root;
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 target_snapshot;
u64 target_inum;
ret = subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret, c, dirent_to_missing_subvol,
"dirent points to missing subvolume %u",
le32_to_cpu(d.v->d_child_subvol))) {
ret = __remove_dirent(trans, d.k->p);
goto err;
}
ret = lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
bch_err(c, "repair not implemented yet");
ret = -EINVAL;
goto err;
}
if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
c, subvol_root_wrong_bi_subvol,
"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
target_inum,
subvol_root.bi_subvol, target_subvol)) {
subvol_root.bi_subvol = target_subvol;
ret = __write_inode(trans, &subvol_root, target_snapshot);
if (ret)
goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
ret = check_subvol_dirent(trans, iter, d);
if (ret)
goto err;
} else {
@ -1776,12 +1803,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
if (d.v->d_type == DT_DIR)
for_each_visible_inode(c, s, dir, equiv.snapshot, i)
i->count++;
}
if (d.v->d_type == DT_DIR)
for_each_visible_inode(c, s, dir, equiv.snapshot, i)
i->count++;
out:
err:
fsck_err:
@ -1919,7 +1945,7 @@ static int check_root_trans(struct btree_trans *trans)
0, NULL);
root_inode.bi_inum = inum;
ret = __write_inode(trans, &root_inode, snapshot);
ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
bch_err_msg(c, ret, "writing root inode");
}
err:
@ -2291,7 +2317,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
ret = __write_inode(trans, &u, k.k->p.snapshot);
ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
}
fsck_err:
return ret;

View File

@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
return bch2_inode_unpack_slowpath(k, unpacked);
}
static int bch2_inode_peek_nowarn(struct btree_trans *trans,
int bch2_inode_peek_nowarn(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
subvol_inum inum, unsigned flags)
@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
int __bch2_fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
struct bkey_inode_buf *inode_p =
bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode);
inode_p->inode.k.p.snapshot = snapshot;
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&inode_p->inode.k_i,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}
int bch2_fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_fsck_write_inode(trans, inode, snapshot));
bch_err_fn(trans->c, ret);
return ret;
}
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
struct bch_inode_unpacked u;

View File

@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
void bch2_inode_init_late(struct bch_inode_unpacked *, u64,

View File

@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
promote_free(c, op);
}
@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
if (rbio->start_time)
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
bio_endio(&rbio->bio);
}
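These hunks are a rename only: the bch2_time_stats_* wrappers become the generic time_stats_* helpers from <linux/time_stats.h>, and the calling convention stays the same. A minimal sketch under that assumption; the stats slot and the timed workload below are hypothetical, purely illustrative:

	#include <linux/sched/clock.h>
	#include <linux/time_stats.h>

	static struct time_stats example_read_time;	/* hypothetical stats slot */

	static void do_read(void) { /* placeholder for the timed work */ }

	static void timed_read(void)
	{
		u64 start = local_clock();	/* ns timestamp, like rbio->start_time */

		do_read();
		time_stats_update(&example_read_time, start);
	}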

View File

@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
bch2_congested_acct(ca, io_latency, now, rw);
__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
__time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)
EBUG_ON(op->open_buckets.nr);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);
if (!(op->flags & BCH_WRITE_MOVE))

View File

@ -27,6 +27,26 @@ static const char * const bch2_journal_errors[] = {
NULL
};
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
}
static bool __journal_entry_is_open(union journal_res_state state)
{
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
static inline unsigned nr_unwritten_journal_entries(struct journal *j)
{
return atomic64_read(&j->seq) - j->seq_ondisk;
}
static bool journal_entry_is_open(struct journal *j)
{
return __journal_entry_is_open(j->reservations);
}
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
union journal_res_state s = READ_ONCE(j->reservations);
@ -54,6 +74,13 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
prt_printf(out, "%li jiffies", buf->expires - jiffies);
prt_newline(out);
if (buf->write_done)
prt_printf(out, "write done\n");
else if (buf->write_allocated)
prt_printf(out, "write allocated\n");
else if (buf->write_started)
prt_printf(out, "write started\n");
printbuf_indent_sub(out, 2);
}
@ -66,26 +93,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq <= journal_cur_seq(j);
seq++)
bch2_journal_buf_to_text(out, j, seq);
}
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
}
static bool __journal_entry_is_open(union journal_res_state state)
{
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
static inline unsigned nr_unwritten_journal_entries(struct journal *j)
{
return atomic64_read(&j->seq) - j->seq_ondisk;
}
static bool journal_entry_is_open(struct journal *j)
{
return __journal_entry_is_open(j->reservations);
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}
static inline struct journal_buf *
@ -174,21 +182,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
void bch2_journal_do_writes(struct journal *j)
{
for (u64 seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++) {
unsigned idx = seq & JOURNAL_BUF_MASK;
struct journal_buf *w = j->buf + idx;
if (w->write_started && !w->write_allocated)
break;
if (w->write_started)
continue;
if (!journal_state_count(j->reservations, idx)) {
w->write_started = true;
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
break;
}
}
/*
* Final processing when the last reference of a journal buffer has been
* dropped. Drop the pin list reference acquired at journal entry open and write
* the buffer, if requested.
*/
void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
if (write)
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
bch2_journal_do_writes(j);
}
/*
@ -380,11 +407,14 @@ static int journal_entry_open(struct journal *j)
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
buf->noflush = false;
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
buf->write_started = false;
buf->write_allocated = false;
buf->write_done = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@ -418,9 +448,10 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
if (nr_unwritten_journal_entries(j) == 1)
mod_delayed_work(j->wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
if (j->early_journal_entries.nr)
@ -445,20 +476,16 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
long delta;
spin_lock(&j->lock);
if (!__journal_entry_is_open(j->reservations))
goto unlock;
if (__journal_entry_is_open(j->reservations)) {
long delta = journal_cur_buf(j)->expires - jiffies;
delta = journal_cur_buf(j)->expires - jiffies;
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
if (delta > 0)
mod_delayed_work(j->wq, &j->write_work, delta);
else
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
}
spin_unlock(&j->lock);
}
@ -473,33 +500,32 @@ retry:
if (journal_res_get_fast(j, res, flags))
return 0;
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
ret = JOURNAL_ERR_journal_full;
can_discard = j->can_discard;
goto out;
}
if (j->blocked)
return -BCH_ERR_journal_res_get_blocked;
if (bch2_journal_error(j))
return -BCH_ERR_erofs_journal_err;
spin_lock(&j->lock);
/* check once more in case somebody else shut things down... */
if (bch2_journal_error(j)) {
spin_unlock(&j->lock);
return -BCH_ERR_erofs_journal_err;
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
ret = JOURNAL_ERR_max_in_flight;
goto out;
}
spin_lock(&j->lock);
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
if (journal_res_get_fast(j, res, flags)) {
spin_unlock(&j->lock);
return 0;
}
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
ret = JOURNAL_ERR_journal_full;
ret = 0;
goto unlock;
}
@ -515,30 +541,30 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, true);
if (trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
bch2_journal_bufs_to_text(&buf, j);
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
}
count_event(c, journal_entry_full);
}
ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
if (!ret)
out:
if (ret == JOURNAL_ERR_retry)
goto retry;
if (!ret)
return 0;
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
if (ret == JOURNAL_ERR_max_in_flight &&
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
}
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@ -727,7 +753,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
if (!ret)
bch2_time_stats_update(j->flush_seq_time, start_time);
time_stats_update(j->flush_seq_time, start_time);
return ret ?: ret2 < 0 ? ret2 : 0;
}
@ -1157,7 +1183,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
@ -1211,8 +1236,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
darray_for_each(i->ptrs, ptr)
bch2_dev_list_add_dev(&p->devs, ptr->dev);
had_entries = true;
}
@ -1240,13 +1265,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
void bch2_dev_journal_exit(struct bch_dev *ca)
{
kfree(ca->journal.bio);
kfree(ca->journal.buckets);
kfree(ca->journal.bucket_seq);
struct journal_device *ja = &ca->journal;
ca->journal.bio = NULL;
ca->journal.buckets = NULL;
ca->journal.bucket_seq = NULL;
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
kfree(ja->bio[i]);
ja->bio[i] = NULL;
}
kfree(ja->buckets);
kfree(ja->bucket_seq);
ja->buckets = NULL;
ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@ -1256,14 +1285,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
bch2_sb_field_get(sb, journal_v2);
unsigned i, nr_bvecs;
ja->nr = 0;
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
for (i = 0; i < nr; i++)
for (unsigned i = 0; i < nr; i++)
ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
} else if (journal_buckets) {
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@ -1273,13 +1301,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -BCH_ERR_ENOMEM_dev_journal_init;
nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!ca->journal.bio)
return -BCH_ERR_ENOMEM_dev_journal_init;
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
return -BCH_ERR_ENOMEM_dev_journal_init;
bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
ja->bio[i]->ca = ca;
ja->bio[i]->buf_idx = i;
bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
}
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
@ -1287,14 +1320,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
unsigned j, dst = 0;
unsigned dst = 0;
for (i = 0; i < nr; i++)
for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
for (unsigned i = 0; i < nr; i++)
for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
ja->buckets[dst++] =
le64_to_cpu(journal_buckets_v2->d[i].start) + j;
} else if (journal_buckets) {
for (i = 0; i < ja->nr; i++)
for (unsigned i = 0; i < ja->nr; i++)
ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
}
@ -1303,19 +1336,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
unsigned i;
if (j->wq)
destroy_workqueue(j->wq);
darray_exit(&j->early_journal_entries);
for (i = 0; i < ARRAY_SIZE(j->buf); i++)
kvpfree(j->buf[i].data, j->buf[i].buf_size);
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
@ -1336,14 +1369,20 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
j->wq = alloc_workqueue("bcachefs_journal",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
if (!j->wq)
return -BCH_ERR_ENOMEM_fs_other_alloc;
return 0;
}
@ -1455,7 +1494,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
unsigned i;
spin_lock(&j->lock);
*seq = max(*seq, j->pin.front);
@ -1473,7 +1511,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_newline(out);
printbuf_indent_add(out, 2);
for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list) {
prt_printf(out, "\t%px %ps", pin, pin->flush);
prt_newline(out);

View File

@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
}
bool bch2_journal_entry_close(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64, bool);
void bch2_journal_do_writes(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
{
@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx))
bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
bch2_journal_buf_put_final(j, seq);
}
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx)) {
spin_lock(&j->lock);
bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
}
}

View File

@ -17,6 +17,38 @@
#include "sb-clean.h"
#include "trace.h"
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
u64 offset;
div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
if (i != j->ptrs.data)
prt_printf(out, " ");
prt_printf(out, "%u:%u:%u (sector %llu)",
i->dev, i->bucket, i->bucket_offset, i->sector);
}
}
static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
bch2_journal_ptrs_to_text(out, c, j);
struct jset_entry *entry;
for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
break;
}
}
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@ -52,8 +84,7 @@ static void __journal_replay_free(struct bch_fs *c,
BUG_ON(*p != i);
*p = NULL;
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
kvfree(i);
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@ -84,9 +115,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
{
struct genradix_iter iter;
struct journal_replay **_i, *i, *dup;
struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
/* Is this entry older than the range we need? */
@ -131,72 +162,61 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
if (bytes == vstruct_bytes(&dup->j) &&
!memcmp(j, &dup->j, bytes)) {
i = dup;
goto found;
}
bool identical = bytes == vstruct_bytes(&dup->j) &&
!memcmp(j, &dup->j, bytes);
bool not_identical = !identical &&
entry_ptr.csum_good &&
dup->csum_good;
if (!entry_ptr.csum_good) {
i = dup;
goto found;
}
bool same_device = false;
darray_for_each(dup->ptrs, ptr)
if (ptr->dev == ca->dev_idx)
same_device = true;
if (!dup->csum_good)
ret = darray_push(&dup->ptrs, entry_ptr);
if (ret)
goto out;
bch2_journal_replay_to_text(&buf, c, dup);
fsck_err_on(same_device,
c, journal_entry_dup_same_device,
"duplicate journal entry on same device\n %s",
buf.buf);
fsck_err_on(not_identical,
c, journal_entry_replicas_data_mismatch,
"found duplicate but non identical journal entries\n %s",
buf.buf);
if (entry_ptr.csum_good && !identical)
goto replace;
fsck_err(c, journal_entry_replicas_data_mismatch,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
i = dup;
goto found;
goto out;
}
replace:
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
return -BCH_ERR_ENOMEM_journal_entry_add;
i->nr_ptrs = 0;
darray_init(&i->ptrs);
i->csum_good = entry_ptr.csum_good;
i->ignore = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
bch_err(c, "found too many copies of journal entry %llu",
le64_to_cpu(i->j.seq));
dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
}
/* The first ptr should represent the jset we kept: */
memcpy(i->ptrs + i->nr_ptrs,
dup->ptrs,
sizeof(dup->ptrs[0]) * dup->nr_ptrs);
i->nr_ptrs += dup->nr_ptrs;
darray_for_each(dup->ptrs, ptr)
darray_push(&i->ptrs, *ptr);
__journal_replay_free(c, dup);
} else {
darray_push(&i->ptrs, entry_ptr);
}
*_i = i;
return 0;
found:
for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
if (ptr->dev == ca->dev_idx) {
bch_err(c, "duplicate journal entry %llu on same device",
le64_to_cpu(i->j.seq));
goto out;
}
}
if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
bch_err(c, "found too many copies of journal entry %llu",
le64_to_cpu(i->j.seq));
goto out;
}
i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
printbuf_exit(&buf);
return ret;
}
@ -741,6 +761,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
journal_entry_btree_keys_to_text(out, c, entry);
}
static int journal_entry_datetime_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
enum bkey_invalid_flags flags)
{
unsigned bytes = vstruct_bytes(entry);
unsigned expected = 16;
int ret = 0;
if (journal_entry_err_on(vstruct_bytes(entry) < expected,
c, version, jset, entry,
journal_entry_dev_usage_bad_size,
"bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@ -913,11 +964,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
n = kvpmalloc(new_size, GFP_KERNEL);
n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
kvpfree(b->data, b->size);
kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@ -1102,16 +1153,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!r)
continue;
for (i = 0; i < r->nr_ptrs; i++) {
if (r->ptrs[i].dev == ca->dev_idx) {
unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
darray_for_each(r->ptrs, i)
if (i->dev == ca->dev_idx) {
unsigned wrote = bucket_remainder(ca, i->sector) +
vstruct_sectors(&r->j, c->block_bits);
ja->cur_idx = r->ptrs[i].bucket;
ja->cur_idx = i->bucket;
ja->sectors_free = ca->mi.bucket_size - wrote;
goto found;
}
}
}
found:
mutex_unlock(&jlist->lock);
@ -1144,7 +1194,7 @@ found:
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvpfree(buf.data, buf.size);
kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@ -1155,27 +1205,6 @@ err:
goto out;
}
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
unsigned i;
for (i = 0; i < j->nr_ptrs; i++) {
struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
u64 offset;
div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
if (i)
prt_printf(out, " ");
prt_printf(out, "%u:%u:%u (sector %llu)",
j->ptrs[i].dev,
j->ptrs[i].bucket,
j->ptrs[i].bucket_offset,
j->ptrs[i].sector);
}
}
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
@ -1353,32 +1382,31 @@ int bch2_journal_read(struct bch_fs *c,
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
};
unsigned ptr;
i = *_i;
if (!i || i->ignore)
continue;
for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
darray_for_each(i->ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (!i->ptrs[ptr].csum_good)
bch_err_dev_offset(ca, i->ptrs[ptr].sector,
if (!ptr->csum_good)
bch_err_dev_offset(ca, ptr->sector,
"invalid journal checksum, seq %llu%s",
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
bch_dev_bkey_exists(c, i->ptrs[0].dev),
bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
&i->j,
i->ptrs[0].sector,
i->ptrs.data[0].sector,
READ);
if (ret)
goto err;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
darray_for_each(i->ptrs, ptr)
replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
bch2_replicas_entry_sort(&replicas.e);
@ -1545,7 +1573,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@ -1556,7 +1584,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
swap(buf->buf_size, new_size);
spin_unlock(&j->lock);
kvpfree(new_buf, new_size);
kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@ -1566,17 +1594,17 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(j, struct journal, io);
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 v, seq;
u64 v, seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
? j->flush_write_time
: j->noflush_write_time, j->write_start_time);
time_stats_update(!JSET_NO_FLUSH(w->data)
? j->flush_write_time
: j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
@ -1591,63 +1619,68 @@ static CLOSURE_CALLBACK(journal_write_done)
if (err)
bch2_fatal_error(c);
spin_lock(&j->lock);
seq = le64_to_cpu(w->data->seq);
closure_debug_destroy(cl);
spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
w->write_done = true;
if (!err) {
if (!JSET_NO_FLUSH(w->data)) {
bool completed = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++) {
w = j->buf + (seq & JOURNAL_BUF_MASK);
if (!w->write_done)
break;
if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
j->seq_ondisk = seq;
j->seq_ondisk = seq;
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
BUG_ON(journal_state_count(new, new.unwritten_idx));
BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
BUG_ON(journal_state_count(new, new.unwritten_idx));
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
completed = true;
}
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
if (completed) {
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, false);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
closure_wake_up(&w->wait);
journal_wake(j);
closure_wake_up(&w->wait);
journal_wake(j);
}
if (!journal_state_count(new, new.unwritten_idx) &&
journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
spin_unlock(&j->lock);
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@ -1657,46 +1690,46 @@ static CLOSURE_CALLBACK(journal_write_done)
* previous entries still in flight - the current journal entry
* might want to be written now:
*/
spin_unlock(&j->lock);
mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
} else {
spin_unlock(&j->lock);
mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
closure_put(&j->io);
closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
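The endio path now recovers its per-buffer context with container_of() on the bio embedded in struct journal_bio (defined later in this commit in journal_types.h, with the bio as the last member so the inline bvecs allocated via struct_size() can follow it). A small sketch of that embed-and-recover pattern, reusing the same field layout; the struct and function names here are illustrative only:

	#include <linux/bio.h>

	struct bch_dev;

	struct journal_bio_example {		/* mirrors struct journal_bio */
		struct bch_dev	*ca;
		unsigned	buf_idx;
		struct bio	bio;		/* must stay last: inline bvecs follow */
	};

	static void example_endio(struct bio *bio)
	{
		struct journal_bio_example *jbio =
			container_of(bio, struct journal_bio_example, bio);

		/* jbio->ca and jbio->buf_idx identify the device and journal buffer */
		(void) jbio;
	}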
static CLOSURE_CALLBACK(do_journal_write)
{
closure_type(j, struct journal, io);
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct journal_device *ja = &ca->journal;
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@ -1706,7 +1739,7 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
bio = ca->journal.bio;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@ -1725,11 +1758,10 @@ static CLOSURE_CALLBACK(do_journal_write)
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] =
le64_to_cpu(w->data->seq);
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
continue_at(cl, journal_write_done, c->io_complete_wq);
continue_at(cl, journal_write_done, j->wq);
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@ -1802,6 +1834,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
struct jset_entry_datetime *d =
container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
d->entry.type = BCH_JSET_ENTRY_datetime;
d->seconds = cpu_to_le64(ktime_get_real_seconds());
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@ -1901,16 +1938,16 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
CLOSURE_CALLBACK(bch2_journal_write)
{
closure_type(j, struct journal, io);
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
BUG_ON(w->write_allocated);
j->write_start_time = local_clock();
@ -1954,12 +1991,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
* bch2_journal_space_available():
*/
w->sectors = 0;
w->write_allocated = true;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@ -1983,25 +2022,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
if (!JSET_NO_FLUSH(w->data))
closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev,
REQ_OP_WRITE|REQ_PREFLUSH);
REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
continue_at(cl, do_journal_write, c->io_complete_wq);
continue_at(cl, do_journal_write, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, c->io_complete_wq);
continue_at(cl, journal_write_done, j->wq);
return;
err:
bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
continue_at(cl, journal_write_done, j->wq);
}

View File

@ -2,19 +2,22 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
#include <linux/darray_types.h>
struct journal_ptr {
bool csum_good;
u8 dev;
u32 bucket;
u32 bucket_offset;
u64 sector;
};
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
struct journal_ptr {
bool csum_good;
u8 dev;
u32 bucket;
u32 bucket_offset;
u64 sector;
} ptrs[BCH_REPLICAS_MAX];
unsigned nr_ptrs;
DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
bool ignore;
@ -62,4 +65,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = cpu_to_le16(u64s - 1);
*end = vstruct_next(*end);
return entry;
}
#endif /* _BCACHEFS_JOURNAL_IO_H */
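With ptrs now a preallocated darray instead of a fixed ptrs[]/nr_ptrs pair, callers use the generic helpers from <linux/darray.h> seen throughout this commit: darray_init(), darray_push() (which can fail with -ENOMEM), darray_for_each() (the iterator is a pointer into the array), and darray_exit(). A minimal standalone sketch of that API, with made-up element values:

	#include <linux/darray.h>

	typedef DARRAY_PREALLOCATED(u64, 8) example_u64s;	/* 8 inline slots before any heap growth */

	static u64 example_darray_sum(void)
	{
		example_u64s d;
		u64 sum = 0;

		darray_init(&d);

		for (u64 v = 0; v < 16; v++)
			if (darray_push(&d, v))		/* -ENOMEM on allocation failure */
				goto out;

		darray_for_each(d, i)			/* i points at each element in turn */
			sum += *i;
	out:
		darray_exit(&d);			/* frees the heap buffer, if one was grown */
		return sum;
	}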

View File

@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
&j->low_on_space_start, low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
&j->low_on_pin_start, low_on_pin) ||
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
&j->write_buffer_full_start, low_on_wb))
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
bool reclaim;
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j,
return;
}
reclaim = __journal_pin_drop(j, dst);
bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
if (seq == journal_last_seq(j))
journal_wake(j);
spin_unlock(&j->lock);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
bool reclaim;
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
reclaim = __journal_pin_drop(j, pin);
bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
if (seq == journal_last_seq(j))
journal_wake(j);
spin_unlock(&j->lock);
}
/**

View File

@ -2,8 +2,8 @@
#include "bcachefs.h"
#include "journal_sb.h"
#include "darray.h"
#include <linux/darray.h>
#include <linux/sort.h>
/* BCH_SB_FIELD_journal: */

View File

@ -2,10 +2,11 @@
#include "bcachefs.h"
#include "btree_iter.h"
#include "eytzinger.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <linux/eytzinger.h>
/*
* journal_seq_blacklist machinery:
*
@ -119,8 +120,7 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l,
const void *_r, size_t size)
static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;

View File

@ -18,6 +18,7 @@
* the journal that are being staged or in flight.
*/
struct journal_buf {
struct closure io;
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@ -33,10 +34,14 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
bool need_flush_to_write_buffer;
bool noflush:1; /* write has already been kicked off, and was noflush */
bool must_flush:1; /* something wants a flush */
bool separate_flush:1;
bool need_flush_to_write_buffer:1;
bool write_started:1;
bool write_allocated:1;
bool write_done:1;
u8 idx;
};
/*
@ -134,6 +139,7 @@ enum journal_flags {
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
x(ok) \
x(retry) \
x(blocked) \
x(max_in_flight) \
x(journal_full) \
@ -149,6 +155,13 @@ enum journal_errors {
typedef DARRAY(u64) darray_u64;
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
struct bio bio;
};
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
@ -203,8 +216,8 @@ struct journal {
wait_queue_head_t wait;
struct closure_waitlist async_wait;
struct closure io;
struct delayed_work write_work;
struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@ -274,14 +287,9 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
u64 low_on_space_start;
u64 low_on_pin_start;
u64 max_in_flight_start;
u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
struct time_stats *flush_write_time;
struct time_stats *noflush_write_time;
struct time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map res_map;
@ -313,7 +321,7 @@ struct journal_device {
u64 *buckets;
/* Bio for journal reads/writes to this device */
struct bio *bio;
struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;

View File

@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;
return -BCH_ERR_remove_would_lose_data;
return 0;
}
@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
return -BCH_ERR_remove_with_metadata_missing_unimplemented;
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@ -132,10 +132,8 @@ retry:
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
if (ret)
break;
}
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {

View File

@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
u64 start_time = local_clock();
__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
}
}

View File

@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"
#include <linux/sort.h>
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, size_t size)
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
bch2_memcmp, NULL);
sort_r(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
bch2_memcmp, NULL,
(void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =

View File

@ -3,9 +3,10 @@
#define _BCACHEFS_REPLICAS_H
#include "bkey.h"
#include "eytzinger.h"
#include "replicas_types.h"
#include <linux/eytzinger.h>
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);

View File

@ -171,22 +171,6 @@ fsck_err:
return ERR_PTR(ret);
}
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = cpu_to_le16(u64s - 1);
*end = vstruct_next(*end);
return entry;
}
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)

View File

@ -6,12 +6,13 @@
*/
#include "bcachefs.h"
#include "darray.h"
#include "recovery.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
#include <linux/darray.h>
#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
/*

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
#define _BCACHEFS_SB_ERRORS_TYPES_H
#include "darray.h"
#include <linux/darray_types.h>
#define BCH_SB_ERRS() \
x(clean_but_journal_not_empty, 0) \
@ -250,7 +250,10 @@
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244) \
x(reflink_p_front_pad_bad, 245)
x(reflink_p_front_pad_bad, 245) \
x(journal_entry_dup_same_device, 246) \
x(inode_bi_subvol_missing, 247) \
x(inode_bi_subvol_wrong, 248)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
#include "darray.h"
#include <linux/darray.h>
extern char * const bch2_member_error_strs[];

View File

@ -160,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
}
static __always_inline int
bch2_hash_lookup(struct btree_trans *trans,
bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
unsigned flags)
unsigned flags, u32 snapshot)
{
struct bkey_s_c k;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
@ -194,6 +189,19 @@ bch2_hash_lookup(struct btree_trans *trans,
return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
}
static __always_inline int
bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
unsigned flags)
{
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
}
static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
@ -251,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
int bch2_hash_set_snapshot(struct btree_trans *trans,
int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
@ -320,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
snapshot, insert, str_hash_flags, 0);
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
bch2_hash_set_in_snapshot(trans, desc, info, inum,
snapshot, insert, str_hash_flags, 0);
}
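Both rewritten wrappers here rely on GCC's a ?: b extension for error chaining. Spelled out, the new bch2_hash_set() body is equivalent to the following (a reading aid, not code from the commit):

	u32 snapshot;
	int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);

	if (!ret)
		ret = bch2_hash_set_in_snapshot(trans, desc, info, inum,
						snapshot, insert, str_hash_flags, 0);
	return ret;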
static __always_inline

View File

@ -42,6 +42,36 @@ static int check_subvol(struct btree_trans *trans,
return ret ?: -BCH_ERR_transaction_restart_nested;
}
struct bch_inode_unpacked inode;
struct btree_iter inode_iter = {};
ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
0);
bch2_trans_iter_exit(trans, &inode_iter);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
ret = bch2_subvolume_delete(trans, iter->pos.offset);
bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
return ret ?: -BCH_ERR_transaction_restart_nested;
}
if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
c, subvol_root_wrong_bi_subvol,
"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
inode.bi_inum, inode_iter.k.p.snapshot,
inode.bi_subvol, subvol.k->p.offset)) {
inode.bi_subvol = subvol.k->p.offset;
ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
if (ret)
goto err;
}
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree;
@ -73,6 +103,7 @@ static int check_subvol(struct btree_trans *trans,
}
}
err:
fsck_err:
return ret;
}

View File

@ -2,7 +2,6 @@
#ifndef _BCACHEFS_SUBVOLUME_H
#define _BCACHEFS_SUBVOLUME_H
#include "darray.h"
#include "subvolume_types.h"
enum bkey_invalid_flags;

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
#include "darray.h"
#include <linux/darray_types.h>
typedef DARRAY(u32) snapshot_id_list;

View File

@ -3,12 +3,12 @@
#define _BCACHEFS_SUPER_IO_H
#include "extents.h"
#include "eytzinger.h"
#include "super_types.h"
#include "super.h"
#include "sb-members.h"
#include <asm/byteorder.h>
#include <linux/eytzinger.h>
static inline bool bch2_version_compatible(u16 version)
{

View File

@ -67,6 +67,7 @@
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <linux/thread_with_file.h>
#include <crypto/hash.h>
MODULE_LICENSE("GPL");
@ -95,16 +96,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)
if (likely(!stdio)) {
vprintk(fmt, args);
} else {
unsigned long flags;
if (fmt[0] == KERN_SOH[0])
fmt += 2;
spin_lock_irqsave(&stdio->output_lock, flags);
prt_vprintf(&stdio->output_buf, fmt, args);
spin_unlock_irqrestore(&stdio->output_lock, flags);
wake_up(&stdio->output_wait);
stdio_redirect_vprintf(stdio, true, fmt, args);
}
va_end(args);
}
@ -520,7 +515,7 @@ static void __bch2_fs_free(struct bch_fs *c)
unsigned i;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
time_stats_exit(&c->times[i]);
bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
@ -576,7 +571,7 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
kvpfree(c, sizeof(*c));
kvfree(c);
module_put(THIS_MODULE);
}
@ -715,7 +710,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
unsigned i, iter_size;
int ret = 0;
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
@ -753,7 +748,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal_keys.initial_ref_held = true;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
time_stats_init(&c->times[i]);
bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
@ -882,8 +877,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@ -1124,7 +1119,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
prt_str(&buf, "believes seq of ");
prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
@ -1168,8 +1163,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
bch2_time_stats_exit(&ca->io_latency[WRITE]);
bch2_time_stats_exit(&ca->io_latency[READ]);
time_stats_quantiles_exit(&ca->io_latency[WRITE]);
time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@ -1260,8 +1255,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_init(&ca->io_latency[READ]);
bch2_time_stats_init(&ca->io_latency[WRITE]);
time_stats_quantiles_init(&ca->io_latency[READ]);
time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);

View File

@ -930,10 +930,10 @@ SHOW(bch2_dev)
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
bch2_time_stats_to_text(out, &ca->io_latency[READ]);
bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
if (attr == &sysfs_io_latency_stats_write)
bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)

View File

@ -1,299 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "printbuf.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
if (thr->task) {
kthread_stop(thr->task);
put_task_struct(thr->task);
}
}
int bch2_run_thread_with_file(struct thread_with_file *thr,
const struct file_operations *fops,
int (*fn)(void *))
{
struct file *file = NULL;
int ret, fd = -1;
unsigned fd_flags = O_CLOEXEC;
if (fops->read && fops->write)
fd_flags |= O_RDWR;
else if (fops->read)
fd_flags |= O_RDONLY;
else if (fops->write)
fd_flags |= O_WRONLY;
char name[TASK_COMM_LEN];
get_task_comm(name, current);
thr->ret = 0;
thr->task = kthread_create(fn, thr, "%s", name);
ret = PTR_ERR_OR_ZERO(thr->task);
if (ret)
return ret;
ret = get_unused_fd_flags(fd_flags);
if (ret < 0)
goto err;
fd = ret;
file = anon_inode_getfile(name, fops, thr, fd_flags);
ret = PTR_ERR_OR_ZERO(file);
if (ret)
goto err;
fd_install(fd, file);
get_task_struct(thr->task);
wake_up_process(thr->task);
return fd;
err:
if (fd >= 0)
put_unused_fd(fd);
if (thr->task)
kthread_stop(thr->task);
return ret;
}
static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
{
return thr->stdio.output_buf.pos ||
thr->output2.nr ||
thr->thr.done;
}
static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
size_t copied = 0, b;
int ret = 0;
if ((file->f_flags & O_NONBLOCK) &&
!thread_with_stdio_has_output(thr))
return -EAGAIN;
ret = wait_event_interruptible(thr->stdio.output_wait,
thread_with_stdio_has_output(thr));
if (ret)
return ret;
if (thr->thr.done)
return 0;
while (len) {
ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
if (ret)
break;
spin_lock_irq(&thr->stdio.output_lock);
b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
memmove(thr->stdio.output_buf.buf,
thr->stdio.output_buf.buf + b,
thr->stdio.output_buf.pos - b);
thr->output2.nr += b;
thr->stdio.output_buf.pos -= b;
spin_unlock_irq(&thr->stdio.output_lock);
b = min(len, thr->output2.nr);
if (!b)
break;
b -= copy_to_user(buf, thr->output2.data, b);
if (!b) {
ret = -EFAULT;
break;
}
copied += b;
buf += b;
len -= b;
memmove(thr->output2.data,
thr->output2.data + b,
thr->output2.nr - b);
thr->output2.nr -= b;
}
return copied ?: ret;
}
static int thread_with_stdio_release(struct inode *inode, struct file *file)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
bch2_thread_with_file_exit(&thr->thr);
printbuf_exit(&thr->stdio.input_buf);
printbuf_exit(&thr->stdio.output_buf);
darray_exit(&thr->output2);
thr->exit(thr);
return 0;
}
#define WRITE_BUFFER 4096
static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
{
return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
}
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
struct printbuf *buf = &thr->stdio.input_buf;
size_t copied = 0;
ssize_t ret = 0;
while (len) {
if (thr->thr.done) {
ret = -EPIPE;
break;
}
size_t b = len - fault_in_readable(ubuf, len);
if (!b) {
ret = -EFAULT;
break;
}
spin_lock(&thr->stdio.input_lock);
if (buf->pos < WRITE_BUFFER)
bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
b = min(len, printbuf_remaining_size(buf));
if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
ubuf += b;
len -= b;
copied += b;
buf->pos += b;
}
spin_unlock(&thr->stdio.input_lock);
if (b) {
wake_up(&thr->stdio.input_wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
ret = wait_event_interruptible(thr->stdio.input_wait,
thread_with_stdio_has_input_space(thr));
if (ret)
break;
}
}
return copied ?: ret;
}
static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
poll_wait(file, &thr->stdio.output_wait, wait);
poll_wait(file, &thr->stdio.input_wait, wait);
__poll_t mask = 0;
if (thread_with_stdio_has_output(thr))
mask |= EPOLLIN;
if (thread_with_stdio_has_input_space(thr))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
static const struct file_operations thread_with_stdio_fops = {
.release = thread_with_stdio_release,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
.llseek = no_llseek,
};
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
void (*exit)(struct thread_with_stdio *),
int (*fn)(void *))
{
thr->stdio.input_buf = PRINTBUF;
thr->stdio.input_buf.atomic++;
spin_lock_init(&thr->stdio.input_lock);
init_waitqueue_head(&thr->stdio.input_wait);
thr->stdio.output_buf = PRINTBUF;
thr->stdio.output_buf.atomic++;
spin_lock_init(&thr->stdio.output_lock);
init_waitqueue_head(&thr->stdio.output_wait);
darray_init(&thr->output2);
thr->exit = exit;
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
}
int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
{
wait_event(stdio->input_wait,
stdio->input_buf.pos || stdio->done);
if (stdio->done)
return -1;
spin_lock(&stdio->input_lock);
int ret = min(len, stdio->input_buf.pos);
stdio->input_buf.pos -= ret;
memcpy(buf, stdio->input_buf.buf, ret);
memmove(stdio->input_buf.buf,
stdio->input_buf.buf + ret,
stdio->input_buf.pos);
spin_unlock(&stdio->input_lock);
wake_up(&stdio->input_wait);
return ret;
}
int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
{
wait_event(stdio->input_wait,
stdio->input_buf.pos || stdio->done);
if (stdio->done)
return -1;
spin_lock(&stdio->input_lock);
int ret = min(len, stdio->input_buf.pos);
char *n = memchr(stdio->input_buf.buf, '\n', ret);
if (n)
ret = min(ret, n + 1 - stdio->input_buf.buf);
stdio->input_buf.pos -= ret;
memcpy(buf, stdio->input_buf.buf, ret);
memmove(stdio->input_buf.buf,
stdio->input_buf.buf + ret,
stdio->input_buf.pos);
spin_unlock(&stdio->input_lock);
wake_up(&stdio->input_wait);
return ret;
}
#endif /* NO_BCACHEFS_FS */

View File

@ -1,41 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_THREAD_WITH_FILE_H
#define _BCACHEFS_THREAD_WITH_FILE_H
#include "thread_with_file_types.h"
struct task_struct;
struct thread_with_file {
struct task_struct *task;
int ret;
bool done;
};
void bch2_thread_with_file_exit(struct thread_with_file *);
int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
DARRAY(char) output2;
void (*exit)(struct thread_with_stdio *);
};
static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
{
thr->thr.done = true;
thr->stdio.done = true;
wake_up(&thr->stdio.input_wait);
wake_up(&thr->stdio.output_wait);
}
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
void (*exit)(struct thread_with_stdio *),
int (*fn)(void *));
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */

View File

@ -1,16 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
struct stdio_redirect {
spinlock_t output_lock;
wait_queue_head_t output_wait;
struct printbuf output_buf;
spinlock_t input_lock;
wait_queue_head_t input_wait;
struct printbuf input_buf;
bool done;
};
#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */

View File

@ -11,6 +11,7 @@
#include <linux/console.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/eytzinger.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/log2.h>
@ -22,9 +23,8 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
#include <linux/mean_and_variance.h>
#include "eytzinger.h"
#include "mean_and_variance.h"
#include "util.h"
static const char si_units[] = "?kMGTPEZY";
@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif
static const struct time_unit {
const char *name;
u64 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
@ -370,120 +344,6 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct bch2_quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
u64 start, u64 end)
{
u64 duration, freq;
if (time_after64(end, start)) {
duration = end - start;
mean_and_variance_update(&stats->duration_stats, duration);
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
if (time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
stats->last_event = end;
}
}
static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
for (struct bch2_time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
bch2_time_stats_update_one(stats, i->start, i->end);
b->nr = 0;
}
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
WARN_ONCE(!stats->duration_stats_weighted.weight ||
!stats->freq_stats_weighted.weight,
"uninitialized time_stats");
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
bch2_time_stats_update_one(stats, start, end);
if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct bch2_time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct bch2_time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
.start = start,
.end = end
};
if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
bch2_time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
@ -503,19 +363,18 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
#define TABSTOP_SIZE 12
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
{
const struct time_unit *u;
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
@ -570,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
@ -593,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
printbuf_tabstops_reset(out);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
if (quantiles) {
int i = eytzinger0_first(NR_QUANTILES);
const struct time_unit *u =
pick_time_units(quantiles->entries[i].m);
u64 last_q = 0;
prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
prt_printf(out, "%llu ",
div_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
u64 q = max(quantiles->entries[i].m, last_q);
prt_printf(out, "%llu ", div_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
}
}
}
#else
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
}
void bch2_time_stats_init(struct bch2_time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
stats->duration_stats_weighted.weight = 8;
stats->freq_stats_weighted.weight = 8;
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
/* ratelimit: */
@ -863,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
static int alignment_ok(const void *base, size_t align)
{
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
((unsigned long)base & (align - 1)) == 0;
}
static void u32_swap(void *a, void *b, size_t size)
{
u32 t = *(u32 *)a;
*(u32 *)a = *(u32 *)b;
*(u32 *)b = t;
}
static void u64_swap(void *a, void *b, size_t size)
{
u64 t = *(u64 *)a;
*(u64 *)a = *(u64 *)b;
*(u64 *)b = t;
}
static void generic_swap(void *a, void *b, size_t size)
{
char t;
do {
t = *(char *)a;
*(char *)a++ = *(char *)b;
*(char *)b++ = t;
} while (--size > 0);
}
static inline int do_cmp(void *base, size_t n, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
size_t l, size_t r)
{
return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
size);
}
static inline void do_swap(void *base, size_t n, size_t size,
void (*swap_func)(void *, void *, size_t),
size_t l, size_t r)
{
swap_func(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
size);
}
void eytzinger0_sort(void *base, size_t n, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t))
{
int i, c, r;
if (!swap_func) {
if (size == 4 && alignment_ok(base, 4))
swap_func = u32_swap;
else if (size == 8 && alignment_ok(base, 8))
swap_func = u64_swap;
else
swap_func = generic_swap;
}
/* heapify */
for (i = n / 2 - 1; i >= 0; --i) {
for (r = i; r * 2 + 1 < n; r = c) {
c = r * 2 + 1;
if (c + 1 < n &&
do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
c++;
if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
break;
do_swap(base, n, size, swap_func, r, c);
}
}
/* sort */
for (i = n - 1; i > 0; --i) {
do_swap(base, n, size, swap_func, 0, i);
for (r = 0; r * 2 + 1 < i; r = c) {
c = r * 2 + 1;
if (c + 1 < i &&
do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
c++;
if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
break;
do_swap(base, n, size, swap_func, r, c);
}
}
}
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t size))
{
/* pre-scale counters for performance */
int i = (num/2 - 1) * size, n = num * size, c, r;
if (!swap_func) {
if (size == 4 && alignment_ok(base, 4))
swap_func = u32_swap;
else if (size == 8 && alignment_ok(base, 8))
swap_func = u64_swap;
else
swap_func = generic_swap;
}
/* heapify */
for ( ; i >= 0; i -= size) {
for (r = i; r * 2 + size < n; r = c) {
c = r * 2 + size;
if (c < n - size &&
cmp_func(base + c, base + c + size, size) < 0)
c += size;
if (cmp_func(base + r, base + c, size) >= 0)
break;
swap_func(base + r, base + c, size);
}
}
/* sort */
for (i = n - size; i > 0; i -= size) {
swap_func(base, base + i, size);
for (r = 0; r * 2 + size < i; r = c) {
c = r * 2 + size;
if (c < i - size &&
cmp_func(base + c, base + c + size, size) < 0)
c += size;
if (cmp_func(base + r, base + c, size) >= 0)
break;
swap_func(base + r, base + c, size);
}
}
}
static void mempool_free_vp(void *element, void *pool_data)
{
size_t size = (size_t) pool_data;
vpfree(element, size);
}
static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t) pool_data;
return vpmalloc(size, gfp_mask);
}
int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return size < PAGE_SIZE
? mempool_init_kmalloc_pool(pool, min_nr, size)
: mempool_init(pool, min_nr, mempool_alloc_vp,
mempool_free_vp, (void *) size);
}
#if 0
void eytzinger1_test(void)
{

View File

@ -5,22 +5,21 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/closure.h>
#include <linux/darray.h>
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/ratelimit.h>
#include <linux/sched/clock.h>
#include <linux/slab.h>
#include <linux/time_stats.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include "mean_and_variance.h"
#include "darray.h"
#include <linux/mean_and_variance.h>
struct closure;
@ -53,38 +52,6 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
static inline void vpfree(void *p, size_t size)
{
if (is_vmalloc_addr(p))
vfree(p);
else
free_pages((unsigned long) p, get_order(size));
}
static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
__vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
{
if (size < PAGE_SIZE)
kfree(p);
else
vpfree(p, size);
}
static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
{
return size < PAGE_SIZE
? kmalloc(size, gfp_mask)
: vpmalloc(size, gfp_mask);
}
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
#define HEAP(type) \
struct { \
size_t size, used; \
@ -97,13 +64,13 @@ struct { \
({ \
(heap)->used = 0; \
(heap)->size = (_size); \
(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
(heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
@ -361,83 +328,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
struct bch2_quantiles {
struct bch2_quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
struct bch2_time_stat_buffer {
unsigned nr;
struct bch2_time_stat_buffer_entry {
u64 start;
u64 end;
} entries[32];
};
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
struct bch2_quantiles quantiles;
struct mean_and_variance duration_stats;
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance freq_stats;
struct mean_and_variance_weighted freq_stats_weighted;
struct bch2_time_stat_buffer __percpu *buffer;
};
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
if (v != !!*start) {
if (!v) {
bch2_time_stats_update(stats, *start);
*start = 0;
} else {
*start = local_clock() ?: 1;
return true;
}
}
return false;
}
#else
static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
bool ret = v && !*start;
*start = v;
return ret;
}
#endif
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
#define ewma_add(ewma, val, weight) \
({ \
@ -738,34 +629,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
&(_array)[(_pos)], \
sizeof((_array)[0]) * ((_nr) - (_pos)))
#define array_insert_item(_array, _nr, _pos, _new_item) \
do { \
__array_insert_item(_array, _nr, _pos); \
(_nr)++; \
(_array)[(_pos)] = (_new_item); \
} while (0)
#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
do { \
(_nr) -= (_nr_to_remove); \
memmove(&(_array)[(_pos)], \
&(_array)[(_pos) + (_nr_to_remove)], \
sizeof((_array)[0]) * ((_nr) - (_pos))); \
} while (0)
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
static inline void __move_gap(void *array, size_t element_size,
size_t nr, size_t size,
size_t old_gap, size_t new_gap)

View File

@ -1,10 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
* (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
*/
#include <linux/darray.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include "darray.h"
int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);

View File

@ -40,10 +40,9 @@
#include <linux/limits.h>
#include <linux/math.h>
#include <linux/math64.h>
#include <linux/mean_and_variance.h>
#include <linux/module.h>
#include "mean_and_variance.h"
u128_u u128_div(u128_u n, u64 d)
{
u128_u r;
@ -107,10 +106,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
s64 x, bool initted, u8 weight)
{
// previous weighted variance.
u8 w = s->weight;
u8 w = weight;
u64 var_w0 = s->variance;
// new value weighted.
s64 x_w = x << w;
@ -119,14 +119,13 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
// new mean weighted.
s64 u_w1 = s->mean + diff;
if (!s->init) {
if (!initted) {
s->mean = x_w;
s->variance = 0;
} else {
s->mean = u_w1;
s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
}
s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
@ -134,9 +133,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
* mean_and_variance_weighted_get_mean() - get mean from @s
* @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
u8 weight)
{
return fast_divpow2(s.mean, s.weight);
return fast_divpow2(s.mean, weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
@ -144,10 +144,11 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
* mean_and_variance_weighted_get_variance() -- get variance from @s
* @s: mean and variance number of samples and their sums
*/
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
u8 weight)
{
// always positive don't need fast divpow2
return s.variance >> s.weight;
return s.variance >> weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
@ -155,9 +156,10 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
* @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
u8 weight)
{
return int_sqrt64(mean_and_variance_weighted_get_variance(s));
return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
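Reading note, not part of the commit: undoing the bit shifts, the stored mean and variance are the true values scaled by 2^w, so passing the weight explicitly leaves the recurrences themselves unchanged. Per sample they are the exponentially weighted estimates

	\mu_{n+1}      = \mu_n + 2^{-w} (x_{n+1} - \mu_n)
	\sigma^2_{n+1} = (1 - 2^{-w}) \sigma^2_n + 2^{-w} (x_{n+1} - \mu_n)(x_{n+1} - \mu_{n+1})

(the variance line follows from unrolling the shift arithmetic above and should match equations 140-143 of the referenced pdf with alpha = 2^w). On the first sample, initted == false seeds the mean with x and the variance with 0, exactly as the if (!initted) branch does.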

View File

@ -522,6 +522,19 @@ void mempool_kfree(void *element, void *pool_data)
}
EXPORT_SYMBOL(mempool_kfree);
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
return kvmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kvmalloc);
void mempool_kvfree(void *element, void *pool_data)
{
kvfree(element);
}
EXPORT_SYMBOL(mempool_kvfree);
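These two helpers are the kvmalloc() analogues of mempool_kmalloc()/mempool_kfree(), and are presumably what the mempool_init_kvmalloc_pool() call in the super.c hunk earlier builds on. A self-contained sketch of wiring them up directly; the function name and element size are illustrative only:

	/* Hypothetical helper, not part of the commit: */
	static int kvmalloc_pool_example(void)
	{
		mempool_t pool;
		int ret = mempool_init(&pool, 1, mempool_kvmalloc, mempool_kvfree,
				       (void *) (size_t) (128 << 10));	/* one reserved 128 KiB element */
		if (ret)
			return ret;

		void *buf = mempool_alloc(&pool, GFP_KERNEL);	/* kvmalloc(), falling back to the reserve */

		mempool_free(buf, &pool);
		mempool_exit(&pool);
		return 0;
	}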
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.

linux/sort.c (new file, 368 lines)
View File

@ -0,0 +1,368 @@
// SPDX-License-Identifier: GPL-2.0
/*
* A fast, small, non-recursive O(n log n) sort for the Linux kernel
*
* This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
* and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
*
* Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
* better) at the expense of stack usage and much larger code to avoid
* quicksort's O(n^2) worst case.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/export.h>
#include <linux/sort.h>
/**
* is_aligned - is this pointer & size okay for word-wide copying?
* @base: pointer to data
* @size: size of each element
* @align: required alignment (typically 4 or 8)
*
* Returns true if elements can be copied using word loads and stores.
* The size must be a multiple of the alignment, and the base address must
* be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
*
* For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
* to "if ((a | b) & mask)", so we do that by hand.
*/
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
unsigned char lsbits = (unsigned char)size;
(void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
lsbits |= (unsigned char)(uintptr_t)base;
#endif
return (lsbits & (align - 1)) == 0;
}
/**
* swap_words_32 - swap two elements in 32-bit chunks
* @a: pointer to the first element to swap
* @b: pointer to the second element to swap
* @n: element size (must be a multiple of 4)
*
* Exchange the two objects in memory. This exploits base+index addressing,
* which basically all CPUs have, to minimize loop overhead computations.
*
* For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
* bottom of the loop, even though the zero flag is still valid from the
* subtract (since the intervening mov instructions don't alter the flags).
* Gcc 8.1.0 doesn't have that problem.
*/
static void swap_words_32(void *a, void *b, size_t n)
{
do {
u32 t = *(u32 *)(a + (n -= 4));
*(u32 *)(a + n) = *(u32 *)(b + n);
*(u32 *)(b + n) = t;
} while (n);
}
/**
* swap_words_64 - swap two elements in 64-bit chunks
* @a: pointer to the first element to swap
* @b: pointer to the second element to swap
* @n: element size (must be a multiple of 8)
*
* Exchange the two objects in memory. This exploits base+index
* addressing, which basically all CPUs have, to minimize loop overhead
* computations.
*
* We'd like to use 64-bit loads if possible. If they're not, emulating
* one requires base+index+4 addressing which x86 has but most other
* processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
* but it's possible to have 64-bit loads without 64-bit pointers (e.g.
* x32 ABI). Are there any cases the kernel needs to worry about?
*/
static void swap_words_64(void *a, void *b, size_t n)
{
do {
#ifdef CONFIG_64BIT
u64 t = *(u64 *)(a + (n -= 8));
*(u64 *)(a + n) = *(u64 *)(b + n);
*(u64 *)(b + n) = t;
#else
/* Use two 32-bit transfers to avoid base+index+4 addressing */
u32 t = *(u32 *)(a + (n -= 4));
*(u32 *)(a + n) = *(u32 *)(b + n);
*(u32 *)(b + n) = t;
t = *(u32 *)(a + (n -= 4));
*(u32 *)(a + n) = *(u32 *)(b + n);
*(u32 *)(b + n) = t;
#endif
} while (n);
}
/**
* swap_bytes - swap two elements a byte at a time
* @a: pointer to the first element to swap
* @b: pointer to the second element to swap
* @n: element size
*
* This is the fallback if alignment doesn't allow using larger chunks.
*/
static void swap_bytes(void *a, void *b, size_t n)
{
do {
char t = ((char *)a)[--n];
((char *)a)[n] = ((char *)b)[n];
((char *)b)[n] = t;
} while (n);
}
/*
* The values are arbitrary as long as they can't be confused with
* a pointer, but small integers make for the smallest compare
* instructions.
*/
#define SWAP_WORDS_64 (swap_r_func_t)0
#define SWAP_WORDS_32 (swap_r_func_t)1
#define SWAP_BYTES (swap_r_func_t)2
#define SWAP_WRAPPER (swap_r_func_t)3
struct wrapper {
cmp_func_t cmp;
swap_func_t swap;
};
/*
* The function pointer is last to make tail calls most efficient if the
* compiler decides not to inline this function.
*/
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
if (swap_func == SWAP_WRAPPER) {
((const struct wrapper *)priv)->swap(a, b, (int)size);
return;
}
if (swap_func == SWAP_WORDS_64)
swap_words_64(a, b, size);
else if (swap_func == SWAP_WORDS_32)
swap_words_32(a, b, size);
else if (swap_func == SWAP_BYTES)
swap_bytes(a, b, size);
else
swap_func(a, b, (int)size, priv);
}
#define _CMP_WRAPPER ((cmp_r_func_t)0L)
static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
{
if (cmp == _CMP_WRAPPER)
return ((const struct wrapper *)priv)->cmp(a, b);
return cmp(a, b, priv);
}
/**
* parent - given the offset of the child, find the offset of the parent.
* @i: the offset of the heap element whose parent is sought. Non-zero.
* @lsbit: a precomputed 1-bit mask, equal to "size & -size"
* @size: size of each element
*
* In terms of array indexes, the parent of element j = @i/@size is simply
* (j-1)/2. But when working in byte offsets, we can't use implicit
* truncation of integer divides.
*
* Fortunately, we only need one bit of the quotient, not the full divide.
* @size has a least significant bit. That bit will be clear if @i is
* an even multiple of @size, and set if it's an odd multiple.
*
* Logically, we're doing "if (i & lsbit) i -= size;", but since the
* branch is unpredictable, it's done with a bit of clever branch-free
* code instead.
*/
__attribute_const__ __always_inline
static size_t parent(size_t i, unsigned int lsbit, size_t size)
{
i -= size;
i -= size & -(i & lsbit);
return i / 2;
}
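/*
 * Worked example (illustration, not part of the original linux/sort.c): with
 * size = 12 the precomputed lsbit is 4.
 *   Child at byte offset i = 24 (index 2): i -= size gives 12; 12 & 4 is
 *   non-zero because 12 is an odd multiple of size, so another size is
 *   subtracted, giving 0; 0 / 2 = 0, the offset of index (2 - 1) / 2 = 0.
 *   Child at i = 36 (index 3): 36 - 12 = 24; 24 & 4 == 0 (even multiple), so
 *   nothing more is subtracted; 24 / 2 = 12, the offset of index 1.
 */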
/**
* sort_r - sort an array of elements
* @base: pointer to data to sort
* @num: number of elements
* @size: size of each element
* @cmp_func: pointer to comparison function
* @swap_func: pointer to swap function or NULL
* @priv: third argument passed to comparison function
*
* This function does a heapsort on the given array. You may provide
* a swap_func function if you need to do something more than a memory
* copy (e.g. fix up pointers or auxiliary data), but the built-in swap
* avoids a slow retpoline and so is significantly faster.
*
* Sorting time is O(n log n) both on average and worst-case. While
* quicksort is slightly faster on average, it suffers from exploitable
* O(n*n) worst-case behavior and extra memory requirements that make
* it less suitable for kernel use.
*/
void sort_r(void *base, size_t num, size_t size,
cmp_r_func_t cmp_func,
swap_r_func_t swap_func,
const void *priv)
{
/* pre-scale counters for performance */
size_t n = num * size, a = (num/2) * size;
const unsigned int lsbit = size & -size; /* Used to find parent */
if (!a) /* num < 2 || size == 0 */
return;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
swap_func = NULL;
if (!swap_func) {
if (is_aligned(base, size, 8))
swap_func = SWAP_WORDS_64;
else if (is_aligned(base, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/*
* Loop invariants:
* 1. elements [a,n) satisfy the heap property (compare greater than
* all of their children),
* 2. elements [n,num*size) are sorted, and
* 3. a <= b <= c <= d <= n (whenever they are valid).
*/
for (;;) {
size_t b, c, d;
if (a) /* Building heap: sift down --a */
a -= size;
else if (n -= size) /* Sorting: Extract root to --n */
do_swap(base, base + n, size, swap_func, priv);
else /* Sort complete */
break;
/*
* Sift element at "a" down into heap. This is the
* "bottom-up" variant, which significantly reduces
* calls to cmp_func(): we find the sift-down path all
* the way to the leaves (one compare per level), then
* backtrack to find where to insert the target element.
*
* Because elements tend to sift down close to the leaves,
* this uses fewer compares than doing two per level
* on the way down. (A bit more than half as many on
* average, 3/4 worst-case.)
*/
for (b = a; c = 2*b + size, (d = c + size) < n;)
b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
if (d == n) /* Special case last leaf with no sibling */
b = c;
/* Now backtrack from "b" to the correct location for "a" */
while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
b = parent(b, lsbit, size);
c = b; /* Where "a" belongs */
while (b != a) { /* Shift it into place */
b = parent(b, lsbit, size);
do_swap(base + b, base + c, size, swap_func, priv);
}
}
}
EXPORT_SYMBOL(sort_r);
#include <linux/eytzinger.h>
static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
cmp_r_func_t cmp_func, const void *priv,
size_t l, size_t r)
{
return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
cmp_func, priv);
}
static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
swap_r_func_t swap_func, const void *priv,
size_t l, size_t r)
{
do_swap(base + inorder_to_eytzinger0(l, n) * size,
base + inorder_to_eytzinger0(r, n) * size,
size, swap_func, priv);
}
void eytzinger0_sort_r(void *base, size_t n, size_t size,
cmp_r_func_t cmp_func,
swap_r_func_t swap_func,
const void *priv)
{
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
swap_func = NULL;
if (!swap_func) {
if (is_aligned(base, size, 8))
swap_func = SWAP_WORDS_64;
else if (is_aligned(base, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
for (i = n / 2 - 1; i >= 0; --i) {
for (r = i; r * 2 + 1 < n; r = c) {
c = r * 2 + 1;
if (c + 1 < n &&
eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
c++;
if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
break;
eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
}
}
/* sort */
for (i = n - 1; i > 0; --i) {
eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
for (r = 0; r * 2 + 1 < i; r = c) {
c = r * 2 + 1;
if (c + 1 < i &&
eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
c++;
if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
break;
eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
}
}
}
EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
void eytzinger0_sort(void *base, size_t n, size_t size,
cmp_func_t cmp_func,
swap_func_t swap_func)
{
struct wrapper w = {
.cmp = cmp_func,
.swap = swap_func,
};
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
EXPORT_SYMBOL_GPL(eytzinger0_sort);
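The _r variants thread a priv cookie through to the comparator, which is what the replicas.c hunk earlier in this diff uses to pass the runtime entry size. A small, hypothetical sketch of the plain fixed-size case (not code from the commit):

	static int cmp_u32(const void *_l, const void *_r, const void *priv)
	{
		u32 l = *(const u32 *) _l, r = *(const u32 *) _r;

		return l < r ? -1 : l > r;	/* priv is unused in this sketch */
	}

	static void sort_keys(u32 *keys, size_t nr)
	{
		/* swap_func == NULL picks the built-in word-wide swap */
		eytzinger0_sort_r(keys, nr, sizeof(keys[0]), cmp_u32, NULL, NULL);
	}

After the call the array is sorted with respect to the eytzinger0 in-order traversal, i.e. laid out the way eytzinger0 lookups expect.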

linux/time_stats.c (new file, 373 lines)
View File

@ -0,0 +1,373 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/eytzinger.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/time.h>
#include <linux/time_stats.h>
#include <linux/spinlock.h>
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "d", (u64) NSEC_PER_SEC * 3600 * 24},
{ "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
{ "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
{ "eon", U64_MAX },
};
const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
EXPORT_SYMBOL_GPL(pick_time_units);
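/*
 * Worked example (illustration, not part of the original file): for
 * ns = 1,500,000 the loop advances past "ns" (1,500,000 >= 2 * 1,000) but
 * stops before "ms" (1,500,000 < 2 * 1,000,000), so the value prints as
 * "1500 us" rather than rounding down to "1 ms": a unit is only chosen once
 * values are at least twice its size.
 */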
static void quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void time_stats_update_one(struct time_stats *stats,
u64 start, u64 end)
{
u64 duration, freq;
bool initted = stats->last_event != 0;
if (time_after64(end, start)) {
struct quantiles *quantiles = time_stats_to_quantiles(stats);
duration = end - start;
mean_and_variance_update(&stats->duration_stats, duration);
mean_and_variance_weighted_update(&stats->duration_stats_weighted,
duration, initted, TIME_STATS_MV_WEIGHT);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
if (quantiles)
quantiles_update(quantiles, duration);
}
if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted,
freq, initted, TIME_STATS_MV_WEIGHT);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
}
stats->last_event = end;
}
void __time_stats_clear_buffer(struct time_stats *stats,
struct time_stat_buffer *b)
{
for (struct time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
time_stats_update_one(stats, i->start, i->end);
b->nr = 0;
}
EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
static noinline void time_stats_clear_buffer(struct time_stats *stats,
struct time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);
if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
.start = start,
.end = end
};
if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
EXPORT_SYMBOL_GPL(__time_stats_update);
#include <linux/seq_buf.h>
static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
}
static inline u64 time_stats_lifetime(const struct time_stats *stats)
{
return local_clock() - stats->start_time;
}
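/*
 * Print the accumulated statistics as a human-readable table: absolute values
 * since @epoch_name in the left column, exponentially weighted ("recent")
 * values in the right column.
 */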
void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats,
const char *epoch_name, unsigned int flags)
{
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
u64 f_stddev = 0, d_stddev = 0;
u64 lifetime = time_stats_lifetime(stats);
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
if (stats->freq_stats.n) {
/* avoid divide by zero */
f_mean = mean_and_variance_get_mean(stats->freq_stats);
f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
d_mean = mean_and_variance_get_mean(stats->duration_stats);
d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
/* unless we didn't want zeroes anyway */
return;
}
seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
seq_buf_printf(out, "lifetime: ");
seq_buf_time_units_aligned(out, lifetime);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " since %-12s recent\n", epoch_name);
seq_buf_printf(out, "duration of events\n");
seq_buf_printf(out, " min: ");
seq_buf_time_units_aligned(out, stats->min_duration);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " max: ");
seq_buf_time_units_aligned(out, stats->max_duration);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " total: ");
seq_buf_time_units_aligned(out, stats->total_duration);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " mean: ");
seq_buf_time_units_aligned(out, d_mean);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
seq_buf_printf(out, " stddev: ");
seq_buf_time_units_aligned(out, d_stddev);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
seq_buf_printf(out, "time between events\n");
seq_buf_printf(out, " min: ");
seq_buf_time_units_aligned(out, stats->min_freq);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " max: ");
seq_buf_time_units_aligned(out, stats->max_freq);
seq_buf_printf(out, "\n");
seq_buf_printf(out, " mean: ");
seq_buf_time_units_aligned(out, f_mean);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
seq_buf_printf(out, " stddev: ");
seq_buf_time_units_aligned(out, f_stddev);
seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
seq_buf_printf(out, "\n");
if (quantiles) {
int i = eytzinger0_first(NR_QUANTILES);
const struct time_unit *u =
pick_time_units(quantiles->entries[i].m);
u64 last_q = 0;
seq_buf_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
if (is_last)
seq_buf_printf(out, "\n");
last_q = q;
}
}
}
EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
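/* Emit the same statistics as a JSON object; durations and frequencies are in nanoseconds. */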
void time_stats_to_json(struct seq_buf *out, struct time_stats *stats,
const char *epoch_name, unsigned int flags)
{
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
if (stats->freq_stats.n) {
/* avoid divide by zero */
f_mean = mean_and_variance_get_mean(stats->freq_stats);
f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
d_mean = mean_and_variance_get_mean(stats->duration_stats);
d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
} else if (flags & TIME_STATS_PRINT_NO_ZEROES) {
/* unless we didn't want zeroes anyway */
return;
}
seq_buf_printf(out, "{\n");
seq_buf_printf(out, " \"epoch\": \"%s\",\n", epoch_name);
seq_buf_printf(out, " \"count\": %llu,\n", stats->duration_stats.n);
seq_buf_printf(out, " \"duration_ns\": {\n");
seq_buf_printf(out, " \"min\": %llu,\n", stats->min_duration);
seq_buf_printf(out, " \"max\": %llu,\n", stats->max_duration);
seq_buf_printf(out, " \"total\": %llu,\n", stats->total_duration);
seq_buf_printf(out, " \"mean\": %llu,\n", d_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev);
seq_buf_printf(out, " },\n");
d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT);
seq_buf_printf(out, " \"duration_ewma_ns\": {\n");
seq_buf_printf(out, " \"mean\": %llu,\n", d_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev);
seq_buf_printf(out, " },\n");
seq_buf_printf(out, " \"frequency_ns\": {\n");
seq_buf_printf(out, " \"min\": %llu,\n", stats->min_freq);
seq_buf_printf(out, " \"max\": %llu,\n", stats->max_freq);
seq_buf_printf(out, " \"mean\": %llu,\n", f_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev);
seq_buf_printf(out, " },\n");
f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT);
seq_buf_printf(out, " \"frequency_ewma_ns\": {\n");
seq_buf_printf(out, " \"mean\": %llu,\n", f_mean);
seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev);
if (quantiles) {
u64 last_q = 0;
/* close frequency_ewma_ns but signal more items */
seq_buf_printf(out, " },\n");
seq_buf_printf(out, " \"quantiles_ns\": [\n");
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
seq_buf_printf(out, " %llu", q);
if (!is_last)
seq_buf_printf(out, ", ");
last_q = q;
}
seq_buf_printf(out, " ]\n");
} else {
/* close frequency_ewma_ns without dumping further */
seq_buf_printf(out, " }\n");
}
seq_buf_printf(out, "}\n");
}
EXPORT_SYMBOL_GPL(time_stats_to_json);
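/* Free the per-CPU buffer, if one was allocated. */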
void time_stats_exit(struct time_stats *stats)
{
free_percpu(stats->buffer);
}
EXPORT_SYMBOL_GPL(time_stats_exit);
void time_stats_init(struct time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
stats->start_time = local_clock();
spin_lock_init(&stats->lock);
}
EXPORT_SYMBOL_GPL(time_stats_init);
MODULE_AUTHOR("Kent Overstreet");
MODULE_LICENSE("GPL");

View File

@ -61,7 +61,7 @@ impl BcachefsHandle {
pub fn create_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string");
self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
dirfd: libc::AT_FDCWD,
dirfd: libc::AT_FDCWD as u32,
mode: 0o777,
dst_ptr: dst.as_ptr() as u64,
..Default::default()
@ -73,7 +73,7 @@ impl BcachefsHandle {
pub fn delete_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
let dst = CString::new(dst.as_ref().as_os_str().as_bytes()).expect("Failed to cast destination path for subvolume in a C-style string");
self.ioctl(BcachefsIoctl::SubvolumeDestroy, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
dirfd: libc::AT_FDCWD,
dirfd: libc::AT_FDCWD as u32,
mode: 0o777,
dst_ptr: dst.as_ptr() as u64,
..Default::default()
@ -88,7 +88,7 @@ impl BcachefsHandle {
let res = self.ioctl(BcachefsIoctl::SubvolumeCreate, &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
flags: BCH_SUBVOL_SNAPSHOT_CREATE | extra_flags,
dirfd: libc::AT_FDCWD,
dirfd: libc::AT_FDCWD as u32,
mode: 0o777,
src_ptr: src.as_ref().map_or(0, |x| x.as_ptr() as u64),
//src_ptr: if let Some(src) = src { src.as_ptr() } else { std::ptr::null() } as u64,