245 lines
7.4 KiB
Diff
245 lines
7.4 KiB
Diff
From 488249b3f6257c8db748bcb27efad901481060e5 Mon Sep 17 00:00:00 2001
|
|
From: Kent Overstreet <kent.overstreet@linux.dev>
|
|
Date: Fri, 6 Dec 2024 19:23:22 -0500
|
|
Subject: [PATCH 180/233] bcachefs: Use a heap for handling overwrites in btree
|
|
node scan
|
|
Content-Type: text/plain; charset="utf-8"
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Fix an O(n^2) issue when we find many overlapping (overwritten) btree
|
|
nodes - especially when one node overwrites many smaller nodes.
|
|
|
|
This was discovered to be an issue with the bcachefs
|
|
merge_torture_flakey test - if we had a large btree that was then
|
|
emptied, the number of difficult overwrites can be unbounded.
|
|
|
|
Cc: Kuan-Wei Chiu <visitorckw@gmail.com>
|
|
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
|
|
Signed-off-by: Alexander Miroshnichenko <alex@millerson.name>
|
|
---
|
|
fs/bcachefs/btree_node_scan.c | 133 ++++++++++++++++++----------
|
|
fs/bcachefs/btree_node_scan_types.h | 1 -
|
|
2 files changed, 86 insertions(+), 48 deletions(-)
|
|
|
|
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
|
|
index eeafb5e7354e..a7f06deee13c 100644
|
|
--- a/fs/bcachefs/btree_node_scan.c
|
|
+++ b/fs/bcachefs/btree_node_scan.c
|
|
@@ -12,6 +12,7 @@
|
|
#include "recovery_passes.h"
|
|
|
|
#include <linux/kthread.h>
|
|
+#include <linux/min_heap.h>
|
|
#include <linux/sort.h>
|
|
|
|
struct find_btree_nodes_worker {
|
|
@@ -31,8 +32,6 @@ static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, con
|
|
|
|
if (n->range_updated)
|
|
prt_str(out, " range updated");
|
|
- if (n->overwritten)
|
|
- prt_str(out, " overwritten");
|
|
|
|
for (unsigned i = 0; i < n->nr_ptrs; i++) {
|
|
prt_char(out, ' ');
|
|
@@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r)
|
|
-found_btree_node_cmp_time(l, r);
|
|
}
|
|
|
|
+static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
|
|
+{
|
|
+ return found_btree_node_cmp_pos(l, r) < 0;
|
|
+}
|
|
+
|
|
+static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
|
|
+{
|
|
+ struct found_btree_node *l = _l;
|
|
+ struct found_btree_node *r = _r;
|
|
+
|
|
+ swap(*l, *r);
|
|
+}
|
|
+
|
|
+static const struct min_heap_callbacks found_btree_node_heap_cbs = {
|
|
+ .less = found_btree_node_cmp_pos_less,
|
|
+ .swp = found_btree_node_swap,
|
|
+};
|
|
+
|
|
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
|
|
struct bio *bio, struct btree_node *bn, u64 offset)
|
|
{
|
|
@@ -295,55 +312,48 @@ static int read_btree_nodes(struct find_btree_nodes *f)
|
|
return f->ret ?: ret;
|
|
}
|
|
|
|
-static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
|
|
+static bool nodes_overlap(const struct found_btree_node *l,
|
|
+ const struct found_btree_node *r)
|
|
{
|
|
- while (n + 1 < end &&
|
|
- found_btree_node_cmp_pos(n, n + 1) > 0) {
|
|
- swap(n[0], n[1]);
|
|
- n++;
|
|
- }
|
|
+ return (l->btree_id == r->btree_id &&
|
|
+ l->level == r->level &&
|
|
+ bpos_gt(l->max_key, r->min_key));
|
|
}
|
|
|
|
static int handle_overwrites(struct bch_fs *c,
|
|
- struct found_btree_node *start,
|
|
- struct found_btree_node *end)
|
|
+ struct found_btree_node *l,
|
|
+ found_btree_nodes *nodes_heap)
|
|
{
|
|
- struct found_btree_node *n;
|
|
-again:
|
|
- for (n = start + 1;
|
|
- n < end &&
|
|
- n->btree_id == start->btree_id &&
|
|
- n->level == start->level &&
|
|
- bpos_lt(n->min_key, start->max_key);
|
|
- n++) {
|
|
- int cmp = found_btree_node_cmp_time(start, n);
|
|
+ struct found_btree_node *r;
|
|
+
|
|
+ while ((r = min_heap_peek(nodes_heap)) &&
|
|
+ nodes_overlap(l, r)) {
|
|
+ int cmp = found_btree_node_cmp_time(l, r);
|
|
|
|
if (cmp > 0) {
|
|
- if (bpos_cmp(start->max_key, n->max_key) >= 0)
|
|
- n->overwritten = true;
|
|
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
|
|
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
|
|
else {
|
|
- n->range_updated = true;
|
|
- n->min_key = bpos_successor(start->max_key);
|
|
- n->range_updated = true;
|
|
- bubble_up(n, end);
|
|
- goto again;
|
|
+ r->range_updated = true;
|
|
+ r->min_key = bpos_successor(l->max_key);
|
|
+ r->range_updated = true;
|
|
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
|
|
}
|
|
} else if (cmp < 0) {
|
|
- BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
|
|
+ BUG_ON(bpos_eq(l->min_key, r->min_key));
|
|
|
|
- start->max_key = bpos_predecessor(n->min_key);
|
|
- start->range_updated = true;
|
|
- } else if (n->level) {
|
|
- n->overwritten = true;
|
|
+ l->max_key = bpos_predecessor(r->min_key);
|
|
+ l->range_updated = true;
|
|
+ } else if (r->level) {
|
|
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
|
|
} else {
|
|
- if (bpos_cmp(start->max_key, n->max_key) >= 0)
|
|
- n->overwritten = true;
|
|
+ if (bpos_cmp(l->max_key, r->max_key) >= 0)
|
|
+ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
|
|
else {
|
|
- n->range_updated = true;
|
|
- n->min_key = bpos_successor(start->max_key);
|
|
- n->range_updated = true;
|
|
- bubble_up(n, end);
|
|
- goto again;
|
|
+ r->range_updated = true;
|
|
+ r->min_key = bpos_successor(l->max_key);
|
|
+ r->range_updated = true;
|
|
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
|
|
}
|
|
}
|
|
}
|
|
@@ -355,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
|
|
{
|
|
struct find_btree_nodes *f = &c->found_btree_nodes;
|
|
struct printbuf buf = PRINTBUF;
|
|
+ found_btree_nodes nodes_heap = {};
|
|
size_t dst;
|
|
int ret = 0;
|
|
|
|
@@ -409,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
|
|
bch2_print_string_as_lines(KERN_INFO, buf.buf);
|
|
}
|
|
|
|
- dst = 0;
|
|
- darray_for_each(f->nodes, i) {
|
|
- if (i->overwritten)
|
|
- continue;
|
|
+ swap(nodes_heap, f->nodes);
|
|
+
|
|
+ {
|
|
+ /* darray must have same layout as a heap */
|
|
+ min_heap_char real_heap;
|
|
+ BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
|
|
+ BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
|
|
+ BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
|
|
+ BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
|
|
+ }
|
|
|
|
- ret = handle_overwrites(c, i, &darray_top(f->nodes));
|
|
+ min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
|
|
+
|
|
+ if (nodes_heap.nr) {
|
|
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
|
|
if (ret)
|
|
goto err;
|
|
|
|
- BUG_ON(i->overwritten);
|
|
- f->nodes.data[dst++] = *i;
|
|
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
|
|
}
|
|
- f->nodes.nr = dst;
|
|
|
|
- if (c->opts.verbose) {
|
|
+ while (true) {
|
|
+ ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (!nodes_heap.nr)
|
|
+ break;
|
|
+
|
|
+ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
|
|
+ }
|
|
+
|
|
+ for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
|
|
+ BUG_ON(nodes_overlap(n, n + 1));
|
|
+
|
|
+ if (0 && c->opts.verbose) {
|
|
printbuf_reset(&buf);
|
|
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
|
|
found_btree_nodes_to_text(&buf, c, f->nodes);
|
|
bch2_print_string_as_lines(KERN_INFO, buf.buf);
|
|
+ } else {
|
|
+ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
|
|
}
|
|
|
|
eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
|
|
err:
|
|
+ darray_exit(&nodes_heap);
|
|
printbuf_exit(&buf);
|
|
return ret;
|
|
}
|
|
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
|
|
index b6c36c45d0be..2811b6857c97 100644
|
|
--- a/fs/bcachefs/btree_node_scan_types.h
|
|
+++ b/fs/bcachefs/btree_node_scan_types.h
|
|
@@ -6,7 +6,6 @@
|
|
|
|
struct found_btree_node {
|
|
bool range_updated:1;
|
|
- bool overwritten:1;
|
|
u8 btree_id;
|
|
u8 level;
|
|
unsigned sectors_written;
|
|
--
|
|
2.45.2
|
|
|