Update bcachefs sources to 47ffed9fad bcachefs: bch2_btree_delete_range_trans() now uses peek_upto()

Kent Overstreet 2022-10-09 23:27:41 -04:00
parent 8d6138baac
commit 72add8822c
24 changed files with 518 additions and 181 deletions
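
The headline change: bch2_btree_delete_range_trans() now bounds its iteration with bch2_btree_iter_peek_upto() instead of peeking without a bound and manually breaking out once iter.pos reaches the end of the range. A condensed sketch of the new loop, adapted from the bch2_btree_delete_range_trans() hunk further down (all names come from that hunk; the reservation, extent sizing and commit steps are unchanged and elided):

bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);

/*
 * peek_upto()'s bound is inclusive, so the exclusive range end is passed
 * as bpos_predecessor(end); the old explicit
 * "if (bkey_cmp(iter.pos, end) >= 0) break;" check goes away:
 */
while ((k = bch2_btree_iter_peek_upto(&iter, bpos_predecessor(end))).k) {
	struct bkey_i delete;

	ret = bkey_err(k);
	if (ret)
		break;

	bkey_init(&delete.k);
	/* ... point the whiteout at iter.pos, size it for extents,
	 * then bch2_trans_update() + commit as before ... */
}
bch2_trans_iter_exit(trans, &iter);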

View File

@ -1 +1 @@
cbccc6d8692fdd3af7d5db97a065af5a47bc733c
47ffed9fad891300a610191602a10ecd1e857cce

include/linux/mm.h (new file)
View File

@ -0,0 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TOOLS_LINUX_MM_H
#define _TOOLS_LINUX_MM_H
#include <linux/types.h>
struct sysinfo {
long uptime; /* Seconds since boot */
unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
unsigned long totalram; /* Total usable main memory size */
unsigned long freeram; /* Available memory size */
unsigned long sharedram; /* Amount of shared memory */
unsigned long bufferram; /* Memory used by buffers */
unsigned long totalswap; /* Total swap space size */
unsigned long freeswap; /* swap space still available */
__u16 procs; /* Number of current processes */
__u16 pad; /* Explicit padding for m68k */
unsigned long totalhigh; /* Total high memory size */
unsigned long freehigh; /* Available high memory size */
__u32 mem_unit; /* Memory unit size in bytes */
};
extern void si_meminfo(struct sysinfo * val);
#endif /* _TOOLS_LINUX_MM_H */
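
This header lets kernel code that calls si_meminfo() build in the userspace tools; the implementation added at the end of this commit fills totalram/freeram from /proc/meminfo. A minimal usage sketch, mirroring how bch2_get_btree_in_memory_pos() below budgets roughly half of reported memory for btree nodes (the helper name and its parameter are illustrative, not part of the commit):

#include <linux/mm.h>

/* illustrative helper, not part of this commit: */
static size_t btree_node_budget(size_t btree_node_bytes)
{
	struct sysinfo i;

	si_meminfo(&i);		/* the userspace shim reads /proc/meminfo */
	return (i.totalram >> 1) / btree_node_bytes;
}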

View File

@ -19,6 +19,7 @@ static inline void init_rwsem(struct rw_semaphore *lock)
}
#define down_read(l) pthread_rwlock_rdlock(&(l)->lock)
#define down_read_killable(l) (pthread_rwlock_rdlock(&(l)->lock), 0)
#define down_read_trylock(l) (!pthread_rwlock_tryrdlock(&(l)->lock))
#define up_read(l) pthread_rwlock_unlock(&(l)->lock)

View File

@ -7,6 +7,7 @@
#include <linux/bug.h>
#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/rwsem.h>
#include <linux/time64.h>
#define TASK_RUNNING 0
@ -88,6 +89,10 @@ struct task_struct {
pid_t pid;
struct bio_list *bio_list;
struct signal_struct {
struct rw_semaphore exec_update_lock;
} *signal, _signal;
};
extern __thread struct task_struct *current;
@ -157,4 +162,11 @@ static inline void ktime_get_coarse_real_ts64(struct timespec64 *ts)
#define current_kernel_time64() current_kernel_time()
#define CURRENT_TIME (current_kernel_time())
static inline unsigned int stack_trace_save_tsk(struct task_struct *task,
unsigned long *store, unsigned int size,
unsigned int skipnr)
{
return 0;
}
#endif /* __TOOLS_LINUX_SCHED_H */

View File

@ -6,6 +6,8 @@
#include "btree_update.h"
#include "error.h"
#include <linux/mm.h>
#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
/*
@ -802,6 +804,103 @@ err:
return ret;
}
struct bbpos {
enum btree_id btree;
struct bpos pos;
};
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
}
static inline struct bbpos bbpos_successor(struct bbpos pos)
{
if (bpos_cmp(pos.pos, SPOS_MAX)) {
pos.pos = bpos_successor(pos.pos);
return pos;
}
if (pos.btree != BTREE_ID_NR) {
pos.btree++;
pos.pos = POS_MIN;
return pos;
}
BUG();
}
#if 0
static void bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
prt_str(out, bch2_btree_ids[pos.btree]);
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}
#endif
static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
{
return (struct bbpos) {
.btree = bp.btree_id,
.pos = bp.pos,
};
}
int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
unsigned btree_leaf_mask,
unsigned btree_interior_mask,
struct bbpos start, struct bbpos *end)
{
struct btree_iter iter;
struct bkey_s_c k;
struct sysinfo i;
size_t btree_nodes;
enum btree_id btree;
int ret = 0;
si_meminfo(&i);
btree_nodes = (i.totalram >> 1) / btree_bytes(trans->c);
for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
if (!((1U << btree) & btree_leaf_mask) &&
!((1U << btree) & btree_interior_mask))
continue;
bch2_trans_node_iter_init(trans, &iter, btree,
btree == start.btree ? start.pos : POS_MIN,
0, depth, 0);
/*
* for_each_btree_key_continue() doesn't check the return value
* from bch2_btree_iter_advance(), which is needed when
* iterating over interior nodes where we'll see keys at
* SPOS_MAX:
*/
do {
k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
ret = bkey_err(k);
if (!k.k || ret)
break;
--btree_nodes;
if (!btree_nodes) {
end->btree = btree;
end->pos = k.k->p;
bch2_trans_iter_exit(trans, &iter);
return 0;
}
} while (bch2_btree_iter_advance(&iter));
bch2_trans_iter_exit(trans, &iter);
}
end->btree = BTREE_ID_NR;
end->pos = POS_MIN;
return ret;
}
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans trans;
@ -845,19 +944,26 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
static int check_one_backpointer(struct btree_trans *trans,
struct bpos bucket,
u64 *bp_offset)
u64 *bp_offset,
struct bbpos start,
struct bbpos end)
{
struct btree_iter iter;
struct bch_backpointer bp;
struct bbpos pos;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
int ret;
ret = bch2_get_next_backpointer(trans, bucket, -1,
bp_offset, &bp);
ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp);
if (ret || *bp_offset == U64_MAX)
return ret;
pos = bp_to_bbpos(bp);
if (bbpos_cmp(pos, start) < 0 ||
bbpos_cmp(pos, end) > 0)
return 0;
k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp);
ret = bkey_err(k);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
@ -880,29 +986,52 @@ fsck_err:
return ret;
}
int bch2_check_backpointers_to_extents(struct bch_fs *c)
static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
u64 bp_offset = 0;
while (!(ret = commit_do(&trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
check_one_backpointer(&trans, iter.pos, &bp_offset))) &&
while (!(ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) &&
bp_offset < U64_MAX)
bp_offset++;
if (ret)
break;
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_trans_iter_exit(trans, &iter);
return ret < 0 ? ret : 0;
}
int bch2_check_backpointers_to_extents(struct bch_fs *c)
{
struct btree_trans trans;
struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
int ret;
bch2_trans_init(&trans, c, 0, 0);
while (1) {
ret = bch2_get_btree_in_memory_pos(&trans,
(1U << BTREE_ID_extents)|
(1U << BTREE_ID_reflink),
~0,
start, &end) ?:
bch2_check_backpointers_to_extents_pass(&trans, start, end);
if (ret || end.btree == BTREE_ID_NR)
break;
start = bbpos_successor(end);
}
bch2_trans_exit(&trans);
return ret;
}
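
The net effect: bch2_check_backpointers_to_extents() no longer makes one monolithic pass; it walks the extents and reflink btrees in windows sized by bch2_get_btree_in_memory_pos() so that the relevant nodes fit in roughly half of RAM, and each pass checks only the backpointers whose targets fall inside the current [start, end] window. The driver loop above, condensed and annotated (all names are from this file; the masks' meaning is read off bch2_get_btree_in_memory_pos()):

struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN }, end;

while (1) {
	/* how far can we go before the nodes stop fitting in ~half of RAM? */
	ret = bch2_get_btree_in_memory_pos(&trans,
			(1U << BTREE_ID_extents)|
			(1U << BTREE_ID_reflink),	/* leaves of these btrees */
			~0,				/* interior nodes of all btrees */
			start, &end) ?:
		/* only backpointers pointing into [start, end] are checked: */
		bch2_check_backpointers_to_extents_pass(&trans, start, end);
	if (ret || end.btree == BTREE_ID_NR)
		break;

	start = bbpos_successor(end);	/* next window begins just past this one */
}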

View File

@ -1913,6 +1913,8 @@ do_write:
u64s = bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
set_needs_whiteout(i, false);
/* do we have data to write? */
@ -1922,6 +1924,10 @@ do_write:
bytes_to_write = vstruct_end(i) - data;
sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
@ -2010,11 +2016,6 @@ do_write:
b->written += sectors_to_write;
if (wbio->wbio.first_btree_write &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
cpu_to_le16(b->written);
if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
cpu_to_le16(b->written);
@ -2027,10 +2028,6 @@ do_write:
return;
err:
set_btree_node_noevict(b);
if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
cpu_to_le16(sectors_to_write);
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);

View File

@ -1850,10 +1850,12 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
struct bkey_s_c k, k2;
int ret;
EBUG_ON(iter->path->cached || iter->path->level);
EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
while (1) {
struct btree_path_level *l;
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@ -1866,9 +1868,18 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
l = path_l(iter->path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
goto out;
}
btree_path_set_should_be_locked(iter->path);
k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
k.k &&
@ -1889,7 +1900,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
: NULL;
if (next_update &&
bpos_cmp(next_update->k.p,
k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
k.k ? k.k->p : l->b->key.k.p) <= 0) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
}
@ -1910,9 +1921,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (likely(k.k)) {
break;
} else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
} else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
search_key = bpos_successor(iter->path->l[0].b->key.k.p);
search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
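
Dropping the EBUG_ON on iter->path->level, and tolerating a level with no btree nodes, appears to be what lets the new backpointers code iterate btree nodes through the generic peek path: bch2_get_btree_in_memory_pos() creates node iterators at depth 1 or 2 and drives them with __bch2_btree_iter_peek_and_restart(). Its usage, repeated from the earlier hunk for context:

bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
			  0, depth, 0);	/* depth chosen from the leaf/interior masks */

do {
	k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
	ret = bkey_err(k);
	if (!k.k || ret)
		break;
	/* ... count against the in-memory node budget ... */
} while (bch2_btree_iter_advance(&iter));

bch2_trans_iter_exit(trans, &iter);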

View File

@ -96,25 +96,26 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
int ret;
if (i == g->g) {
trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
} else {
i->trans->lock_must_abort = true;
ret = 0;
}
for (i = g->g + 1; i < g->g + g->nr; i++)
wake_up_process(i->trans->locking_wait.task);
return ret;
return 0;
}
}
static noinline int break_cycle(struct lock_graph *g)
{
struct trans_waiting_for_lock *i;
/*
* We'd like to prioritize aborting transactions that have done less
* work - but it appears breaking cycles by telling other transactions
* to abort may still be buggy:
*/
#if 0
for (i = g->g; i < g->g + g->nr; i++) {
if (i->trans->lock_may_not_fail ||
i->trans->locking_wait.lock_want == SIX_LOCK_write)
@ -130,7 +131,7 @@ static noinline int break_cycle(struct lock_graph *g)
return abort_lock(g, i);
}
#endif
for (i = g->g; i < g->g + g->nr; i++) {
if (i->trans->lock_may_not_fail)
continue;
@ -138,7 +139,29 @@ static noinline int break_cycle(struct lock_graph *g)
return abort_lock(g, i);
}
BUG();
{
struct bch_fs *c = g->g->trans->c;
struct printbuf buf = PRINTBUF;
bch_err(c, "cycle of nofail locks");
for (i = g->g; i < g->g + g->nr; i++) {
struct btree_trans *trans = i->trans;
bch2_btree_trans_to_text(&buf, trans);
prt_printf(&buf, "backtrace:");
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
bch2_prt_backtrace(&buf, trans->locking_wait.task);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
BUG();
}
}
static void lock_graph_pop(struct lock_graph *g)

View File

@ -8,8 +8,8 @@
struct bch_fs;
struct btree;
void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
struct btree *);
void bch2_btree_node_prep_for_write(struct btree_trans *,
struct btree_path *, struct btree *);
bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct btree *, struct btree_node_iter *,
struct bkey_i *);

View File

@ -23,9 +23,9 @@
#include <linux/random.h>
#include <trace/events/bcachefs.h>
static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
struct btree_path *, struct btree *,
struct keylist *, unsigned);
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
struct btree_path *, struct btree *,
struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
@ -37,8 +37,8 @@ static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
path = bch2_path_get(trans, btree_id, pos, level + 1, level,
BTREE_ITER_NOPRESERVE|
BTREE_ITER_INTENT, _THIS_IP_);
path = bch2_btree_path_make_mut(trans, path, true, _THIS_IP_);
BTREE_ITER_INTENT, _RET_IP_);
path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
bch2_btree_path_downgrade(trans, path);
__bch2_btree_path_unlock(trans, path);
return path;
@ -195,6 +195,43 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
}
}
static void bch2_btree_node_free_never_used(struct btree_update *as,
struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
struct btree_path *path;
unsigned level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
b->will_make_reachable = 0;
closure_put(&as->cl);
clear_btree_node_will_make_reachable(b);
clear_btree_node_accessed(b);
clear_btree_node_dirty_acct(c, b);
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
BUG_ON(p->nr >= ARRAY_SIZE(p->b));
p->b[p->nr++] = b;
six_unlock_intent(&b->c.lock);
trans_for_each_path(trans, path)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
}
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
@ -392,8 +429,6 @@ static struct btree *__btree_root_alloc(struct btree_update *as,
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
bch2_btree_update_add_new_node(as, b);
six_unlock_write(&b->c.lock);
return b;
@ -859,6 +894,14 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree
mutex_unlock(&c->btree_interior_update_lock);
btree_update_add_key(as, &as->new_keys, b);
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
cpu_to_le16(sectors);
}
}
/*
@ -1026,24 +1069,24 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
nr_nodes[!!update_level] += 1 + split;
update_level++;
if (!btree_path_node(path, update_level))
ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
if (ret)
return ERR_PTR(ret);
if (!btree_path_node(path, update_level)) {
/* Allocating new root? */
nr_nodes[1] += split;
update_level = BTREE_MAX_DEPTH;
break;
}
if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
break;
/*
* XXX: figure out how far we might need to split,
* instead of locking/reserving all the way to the root:
*/
split = update_level + 1 < BTREE_MAX_DEPTH;
split = true;
}
/* Might have to allocate a new root: */
if (update_level < BTREE_MAX_DEPTH)
nr_nodes[1] += 1;
ret = bch2_btree_path_upgrade(trans, path, U8_MAX);
if (ret)
return ERR_PTR(ret);
if (flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
else if (!down_read_trylock(&c->gc_lock)) {
@ -1064,6 +1107,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@ -1191,7 +1235,6 @@ static void bch2_btree_set_root(struct btree_update *as,
struct btree *old;
trace_and_count(c, btree_node_set_root, c, b);
BUG_ON(!b->written);
old = btree_node_root(c, b);
@ -1315,8 +1358,6 @@ static struct btree *__btree_split_node(struct btree_update *as,
SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
bch2_btree_update_add_new_node(as, n2);
set1 = btree_bset_first(n1);
set2 = btree_bset_first(n2);
@ -1458,18 +1499,19 @@ static void btree_split_insert_keys(struct btree_update *as,
btree_node_interior_verify(as->c, b);
}
static void btree_split(struct btree_update *as, struct btree_trans *trans,
struct btree_path *path, struct btree *b,
struct keylist *keys, unsigned flags)
static int btree_split(struct btree_update *as, struct btree_trans *trans,
struct btree_path *path, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
struct btree_path *path1 = NULL, *path2 = NULL;
u64 start_time = local_clock();
int ret = 0;
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
bch2_btree_interior_update_will_free_node(as, b);
@ -1499,9 +1541,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_path_level_init(trans, path2, n2);
bch2_btree_update_add_new_node(as, n1);
bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
bch2_btree_update_add_new_node(as, n2);
/*
* Note that on recursive parent_keys == keys, so we
@ -1524,9 +1564,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
bch2_btree_update_add_new_node(as, n3);
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
}
} else {
trace_and_count(c, btree_node_compact, c, b);
@ -1541,8 +1581,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
}
@ -1551,7 +1589,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
if (ret)
goto err;
} else if (n3) {
bch2_btree_set_root(as, trans, path, n3);
} else {
@ -1559,11 +1599,16 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_set_root(as, trans, path, n1);
}
bch2_btree_update_get_open_buckets(as, n1);
if (n2)
bch2_btree_update_get_open_buckets(as, n2);
if (n3)
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
}
if (n2) {
bch2_btree_update_get_open_buckets(as, n2);
bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
}
bch2_btree_update_get_open_buckets(as, n1);
bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
/*
* The old node must be freed (in memory) _before_ unlocking the new
@ -1584,7 +1629,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (n2)
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
out:
if (path2) {
__bch2_btree_path_unlock(trans, path2);
bch2_path_put(trans, path2, true);
@ -1600,6 +1645,14 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact],
start_time);
return ret;
err:
if (n3)
bch2_btree_node_free_never_used(as, trans, n3);
if (n2)
bch2_btree_node_free_never_used(as, trans, n2);
bch2_btree_node_free_never_used(as, trans, n1);
goto out;
}
static void
@ -1634,22 +1687,30 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
struct btree_path *path, struct btree *b,
struct keylist *keys, unsigned flags)
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
struct btree_path *path, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
int ret;
lockdep_assert_held(&c->gc_lock);
BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
BUG_ON(!btree_node_intent_locked(path, b->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
bch2_btree_node_lock_for_insert(trans, path, b);
if (!(local_clock() & 63))
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
ret = bch2_btree_node_lock_write(trans, path, &b->c);
if (ret)
return ret;
bch2_btree_node_prep_for_write(trans, path, b);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
@ -1675,9 +1736,16 @@ static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *
bch2_btree_node_unlock_write(trans, path, b);
btree_node_interior_verify(c, b);
return;
return 0;
split:
btree_split(as, trans, path, b, keys, flags);
/*
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
if (b->c.level >= as->update_level)
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
return btree_split(as, trans, path, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@ -1694,10 +1762,15 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
btree_split(as, trans, path, b, NULL, flags);
ret = btree_split(as, trans, path, b, NULL, flags);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
}
bch2_btree_update_done(as, trans);
for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
@ -1823,8 +1896,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_set_min(n, prev->data->min_key);
btree_set_max(n, next->data->max_key);
bch2_btree_update_add_new_node(as, n);
n->data->format = new_f;
btree_node_set_format(n, new_f);
@ -1834,13 +1905,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->c.lock);
bch2_btree_update_add_new_node(as, n);
new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
bch2_btree_path_level_init(trans, new_path, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
bch2_keylist_add(&as->parent_keys, &delete);
@ -1848,11 +1919,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
if (ret)
goto err_free_update;
bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, path, b);
bch2_btree_node_free_inmem(trans, sib_path, m);
@ -1873,6 +1947,10 @@ err:
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
bch2_btree_update_free(as, trans);
goto out;
}
/**
@ -1913,17 +1991,18 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
trace_and_count(c, btree_node_rewrite, c, b);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
bch2_btree_insert_node(as, trans, iter->path, parent,
&as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, iter->path, parent,
&as->parent_keys, flags);
if (ret)
goto err;
} else {
bch2_btree_set_root(as, trans, iter->path, n);
}
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, iter->path, b);
@ -1931,10 +2010,15 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as, trans);
bch2_path_put(trans, new_path, true);
out:
if (new_path)
bch2_path_put(trans, new_path, true);
bch2_btree_path_downgrade(trans, iter->path);
return ret;
err:
bch2_btree_node_free_never_used(as, trans, n);
bch2_btree_update_free(as, trans);
goto out;
}
struct async_btree_rewrite {
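
The structural change in this file: btree_split() and bch2_btree_insert_node() now return an int, and an interior-node insert that no longer fits restarts the transaction with the new transaction_restart_split_race error rather than splitting past what the btree_update reserved; nodes that were allocated but never made reachable are handed back through bch2_btree_node_free_never_used(). The decision point, condensed from bch2_btree_insert_node() above:

/* the keys don't fit in b: */
if (b->c.level >= as->update_level)
	/* splitting here would exceed what this update reserved and locked for */
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);

return btree_split(as, trans, path, b, keys, flags);

Callers handle a nonzero return by freeing the never-used nodes and dropping the update (bch2_btree_node_free_never_used() plus bch2_btree_update_free()), as in the err paths above.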

View File

@ -52,6 +52,7 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
unsigned update_level;
struct disk_reservation disk_res;
struct journal_preres journal_preres;

View File

@ -56,9 +56,9 @@ static inline bool same_leaf_as_next(struct btree_trans *trans,
insert_l(&i[0])->b == insert_l(&i[1])->b;
}
static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
{
struct bch_fs *c = trans->c;
@ -77,14 +77,6 @@ static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
{
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_prep_for_write(trans, path, b);
}
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@ -1631,7 +1623,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
while ((k = bch2_btree_iter_peek(&iter)).k) {
while ((k = bch2_btree_iter_peek_upto(&iter, bpos_predecessor(end))).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
@ -1640,9 +1632,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
if (ret)
goto err;
if (bkey_cmp(iter.pos, end) >= 0)
break;
bkey_init(&delete.k);
/*

View File

@ -328,8 +328,9 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (p.ptr.cached)
m->data_opts.rewrite_ptrs &= ~(1U << i);
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
p.ptr.cached)
BUG();
if (!((1U << i) & m->data_opts.rewrite_ptrs))
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
@ -365,5 +366,23 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
m->op.nr_replicas = m->op.nr_replicas_required =
hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas;
BUG_ON(!m->op.nr_replicas);
return 0;
}
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
opts->kill_ptrs |= 1U << i;
opts->rewrite_ptrs ^= 1U << i;
}
i++;
}
}
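
bch2_data_update_opts_normalize() demotes cached pointers that were flagged for rewrite into kill_ptrs, and the move path consumes that: when nothing is left to rewrite, the flagged pointers are simply dropped instead of starting a data update. Condensed from the bch2_move_extent() hunk later in this commit (all names are from that hunk):

bch2_data_update_opts_normalize(k, &data_opts);

if (!data_opts.rewrite_ptrs &&
    !data_opts.extra_replicas) {
	/* nothing to move, only cached pointers to drop: */
	if (data_opts.kill_ptrs)
		return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
	return 0;
}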

View File

@ -10,6 +10,7 @@ struct moving_context;
struct data_update_opts {
unsigned rewrite_ptrs;
unsigned kill_ptrs;
u16 target;
u8 extra_replicas;
unsigned btree_insert_flags;
@ -34,5 +35,6 @@ int bch2_data_update_init(struct bch_fs *, struct data_update *,
struct write_point_specifier,
struct bch_io_opts, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
#endif /* _BCACHEFS_DATA_UPDATE_H */

View File

@ -501,26 +501,6 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
static int prt_backtrace(struct printbuf *out, struct task_struct *task)
{
unsigned long entries[32];
unsigned i, nr_entries;
int ret;
ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ret;
nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
for (i = 0; i < nr_entries; i++) {
prt_printf(out, "[<0>] %pB", (void *)entries[i]);
prt_newline(out);
}
up_read(&task->signal->exec_update_lock);
return 0;
}
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@ -547,7 +527,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
prt_backtrace(&i->buf, trans->locking_wait.task);
bch2_prt_backtrace(&i->buf, trans->locking_wait.task);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);

View File

@ -1403,10 +1403,8 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
int ret;
idx = get_existing_stripe(c, h);
if (idx < 0) {
bch_err(c, "failed to find an existing stripe");
if (idx < 0)
return -BCH_ERR_ENOSPC_stripe_reuse;
}
h->s->have_existing_stripe = true;
ret = get_stripe_key(c, idx, &h->s->existing_stripe);
@ -1444,21 +1442,9 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
struct ec_stripe_head *h)
{
int ret;
ret = bch2_disk_reservation_get(c, &h->s->res,
h->blocksize,
h->s->nr_parity, 0);
if (ret) {
/*
* This means we need to wait for copygc to
* empty out buckets from existing stripes:
*/
bch_err_ratelimited(c, "failed to reserve stripe: %s", bch2_err_str(ret));
}
return ret;
return bch2_disk_reservation_get(c, &h->s->res,
h->blocksize,
h->s->nr_parity, 0);
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
@ -1500,8 +1486,10 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
ret = __bch2_ec_stripe_head_reserve(c, h);
if (ret && needs_stripe_new)
ret = __bch2_ec_stripe_head_reuse(c, h);
if (ret)
if (ret) {
bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret));
goto err;
}
if (!h->s->allocated) {
ret = new_stripe_alloc_buckets(c, h, cl);

View File

@ -42,6 +42,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \

View File

@ -2208,6 +2208,9 @@ err:
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
if (ret < 0)
ret = bch2_err_class(ret);
if (!sync) {
req->ki_complete(req, ret);
ret = -EIOCBQUEUED;

View File

@ -191,7 +191,52 @@ void bch_move_stats_init(struct bch_move_stats *stats, char *name)
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
static int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
int ret;
n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
bkey_reassemble(n, k);
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
* (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
if (bkey_deleted(&n->k))
n->k.size = 0;
return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
static int bch2_move_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct bch_io_opts io_opts,
enum btree_id btree_id,
@ -206,6 +251,15 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
bch2_data_update_opts_normalize(k, &data_opts);
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas) {
if (data_opts.kill_ptrs)
return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
return 0;
}
if (!percpu_ref_tryget_live(&c->writes))
return -EROFS;
@ -447,7 +501,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret2 = bch2_move_extent(&trans, ctxt, io_opts,
ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts,
btree_id, k, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
@ -544,7 +598,7 @@ again:
prt_str(&buf, "failed to evacuate bucket ");
bch2_bkey_val_to_text(&buf, c, k);
bch2_trans_inconsistent(trans, "%s", buf.buf);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
}
}
@ -599,11 +653,12 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
bch2_trans_iter_exit(&trans, &iter);
ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
if (ret)
if (ret) {
bch2_trans_iter_exit(&trans, &iter);
continue;
}
data_opts = _data_opts;
data_opts.target = io_opts.background_target;
@ -615,8 +670,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
i++;
}
ret = bch2_move_extent(&trans, ctxt, io_opts,
ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
bp.btree_id, k, data_opts);
bch2_trans_iter_exit(&trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {

View File

@ -1325,19 +1325,11 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
/*
* Device going read only means the copygc reserve gets smaller, so we
* don't want that happening while copygc is in progress:
*/
bch2_copygc_stop(c);
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
bch2_copygc_start(c);
}
static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)

View File

@ -296,6 +296,26 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
console_unlock();
}
int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task)
{
unsigned long entries[32];
unsigned i, nr_entries;
int ret;
ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ret;
nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0);
for (i = 0; i < nr_entries; i++) {
prt_printf(out, "[<0>] %pB", (void *)entries[i]);
prt_newline(out);
}
up_read(&task->signal->exec_update_lock);
return 0;
}
/* time stats: */
static void bch2_time_stats_update_one(struct time_stats *stats,

View File

@ -356,6 +356,7 @@ u64 bch2_read_flag_list(char *, const char * const[]);
void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
int bch2_prt_backtrace(struct printbuf *, struct task_struct *);
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)

View File

@ -71,8 +71,10 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
p->thread_fn = thread_fn;
p->thread_data = thread_data;
p->state = TASK_UNINTERRUPTIBLE;
p->signal = &p->_signal;
atomic_set(&p->usage, 1);
init_completion(&p->exited);
init_rwsem(&p->_signal.exec_update_lock);
pthread_attr_t attr;
pthread_attr_init(&attr);

View File

@ -2,6 +2,7 @@
#include <stdio.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/shrinker.h>
@ -39,30 +40,29 @@ static u64 parse_meminfo_line(const char *line)
return v << 10;
}
static struct meminfo read_meminfo(void)
void si_meminfo(struct sysinfo *val)
{
struct meminfo ret = { 0 };
size_t len, n = 0;
char *line = NULL;
const char *v;
FILE *f;
memset(val, 0, sizeof(*val));
f = fopen("/proc/meminfo", "r");
if (!f)
return ret;
return;
while ((len = getline(&line, &n, f)) != -1) {
if ((v = strcmp_prefix(line, "MemTotal:")))
ret.total = parse_meminfo_line(v);
val->totalram = parse_meminfo_line(v);
if ((v = strcmp_prefix(line, "MemAvailable:")))
ret.available = parse_meminfo_line(v);
val->freeram = parse_meminfo_line(v);
}
fclose(f);
free(line);
return ret;
}
static void run_shrinkers_allocation_failed(gfp_t gfp_mask)
@ -85,7 +85,7 @@ static void run_shrinkers_allocation_failed(gfp_t gfp_mask)
void run_shrinkers(gfp_t gfp_mask, bool allocation_failed)
{
struct shrinker *shrinker;
struct meminfo info;
struct sysinfo info;
s64 want_shrink;
/* Fast out if there are no shrinkers to run. */
@ -97,10 +97,10 @@ void run_shrinkers(gfp_t gfp_mask, bool allocation_failed)
return;
}
info = read_meminfo();
si_meminfo(&info);
if (info.total && info.available) {
want_shrink = (info.total >> 2) - info.available;
if (info.totalram && info.freeram) {
want_shrink = (info.totalram >> 2) - info.freeram;
if (want_shrink <= 0)
return;