bcachefs: Update btree ptrs after every write
This closes a significant hole (and the last known hole) in our ability
to verify metadata. Previously, since btree nodes are log structured, we
couldn't detect lost btree writes that weren't the first write to a
given node. Additionally, this seems to have led to some significant
metadata corruption on multi device filesystems with metadata
replication: since a write may have made it to one device and not
another, if we read a btree node back from the replica that did have
that write and started appending after that point, the other replica
would have a gap in its bset entries, and reads from that replica
wouldn't find the rest of the bsets.

But, since updates to interior btree nodes are now journalled, we can
close this hole by updating pointers to btree nodes after every write
with the currently written number of sectors, without negatively
affecting performance. This means we will always detect lost or corrupt
metadata - it also means that our btree is now a curious hybrid of COW
and non-COW btrees, with all the benefits of both (excluding
complexity).

Signed-off-by: Kent Overstreet <[email protected]>
koverstreet committed Jul 15, 2021
1 parent 2f3f216 commit 15178a6
Showing 13 changed files with 250 additions and 214 deletions.
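The write-side mechanism, as a hedged sketch - the helper name
after_btree_node_write() is hypothetical and does not exist in the tree,
but the key accessors and the sectors_written field match what the
btree_io.c diff below actually does:

/*
 * Hypothetical sketch, not the kernel implementation: after every
 * write to a btree node, record in the node's pointer key how many
 * sectors of that node are now valid on disk. The updated key is
 * then persisted via the (journalled) interior node update, so a
 * reader knows exactly how much data must be present.
 */
static void after_btree_node_write(struct btree *b)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
			cpu_to_le16(b->written);
}

A replica that missed an append then fails the read-side check ("btree
node data missing: expected %u sectors, found %u") instead of silently
serving a truncated node.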
4 changes: 1 addition & 3 deletions fs/bcachefs/bcachefs.h
@@ -675,7 +675,7 @@ struct bch_fs {
struct btree_key_cache btree_key_cache;

struct workqueue_struct *btree_update_wq;
-struct workqueue_struct *btree_error_wq;
+struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;

@@ -826,8 +826,6 @@ struct bch_fs {

atomic64_t btree_writes_nr;
atomic64_t btree_writes_sectors;
-struct bio_list btree_write_error_list;
-struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;

/* ERRORS */
3 changes: 2 additions & 1 deletion fs/bcachefs/bcachefs_format.h
@@ -1210,7 +1210,8 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_inode_btree_change = 11,
bcachefs_metadata_version_snapshot = 12,
bcachefs_metadata_version_inode_backpointers = 13,
-bcachefs_metadata_version_max = 14,
+bcachefs_metadata_version_btree_ptr_sectors_written = 14,
+bcachefs_metadata_version_max = 15,
};

#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
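The btree_io.c hunks below call btree_ptr_sectors_written(), a helper
added in fs/bcachefs/btree_io.h (one of the 13 changed files, not shown
in this excerpt). A sketch consistent with how the diff uses it -
returning 0, meaning "unknown, fall back to scanning for bset
signatures", for keys that aren't btree_ptr_v2:

static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
{
	return k->k.type == KEY_TYPE_btree_ptr_v2
		? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
		: 0;
}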
222 changes: 94 additions & 128 deletions fs/bcachefs/btree_io.c
@@ -26,6 +26,7 @@ void bch2_btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));

+clear_btree_node_write_in_flight_inner(b);
clear_btree_node_write_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
@@ -870,7 +871,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
-unsigned nonblacklisted_written = 0;
+unsigned blacklisted_written, nonblacklisted_written = 0;
+unsigned ptr_written = btree_ptr_sectors_written(&b->key);
int ret, retry_read = 0, write = READ;

b->version_ondisk = U16_MAX;
@@ -901,7 +903,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
b->data->keys.seq, bp->seq);
}

-while (b->written < c->opts.btree_node_size) {
+while (b->written < (ptr_written ?: c->opts.btree_node_size)) {
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
@@ -981,6 +983,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(blacklisted && first,
BTREE_ERR_FIXABLE, c, ca, b, i,
"first btree node bset has blacklisted journal seq");

+btree_err_on(blacklisted && ptr_written,
+BTREE_ERR_FIXABLE, c, ca, b, i,
+"found blacklisted bset in btree node with sectors_written");
if (blacklisted && !first)
continue;

@@ -994,26 +1000,34 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonblacklisted_written = b->written;
}

-for (bne = write_block(b);
-bset_byte_offset(b, bne) < btree_bytes(c);
-bne = (void *) bne + block_bytes(c))
-btree_err_on(bne->keys.seq == b->data->keys.seq &&
-!bch2_journal_seq_is_blacklisted(c,
-le64_to_cpu(bne->keys.journal_seq),
-true),
+if (ptr_written) {
+btree_err_on(b->written < ptr_written,
BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
-"found bset signature after last bset");
+"btree node data missing: expected %u sectors, found %u",
+ptr_written, b->written);
+} else {
+for (bne = write_block(b);
+bset_byte_offset(b, bne) < btree_bytes(c);
+bne = (void *) bne + block_bytes(c))
+btree_err_on(bne->keys.seq == b->data->keys.seq &&
+!bch2_journal_seq_is_blacklisted(c,
+le64_to_cpu(bne->keys.journal_seq),
+true),
+BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+"found bset signature after last bset");

-/*
- * Blacklisted bsets are those that were written after the most recent
- * (flush) journal write. Since there wasn't a flush, they may not have
- * made it to all devices - which means we shouldn't write new bsets
- * after them, as that could leave a gap and then reads from that device
- * wouldn't find all the bsets in that btree node - which means it's
- * important that we start writing new bsets after the most recent _non_
- * blacklisted bset:
- */
-b->written = nonblacklisted_written;
+/*
+ * Blacklisted bsets are those that were written after the most recent
+ * (flush) journal write. Since there wasn't a flush, they may not have
+ * made it to all devices - which means we shouldn't write new bsets
+ * after them, as that could leave a gap and then reads from that device
+ * wouldn't find all the bsets in that btree node - which means it's
+ * important that we start writing new bsets after the most recent _non_
+ * blacklisted bset:
+ */
+blacklisted_written = b->written;
+b->written = nonblacklisted_written;
+}

sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
sorted->keys.u64s = 0;
@@ -1081,6 +1095,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (ca->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}

+if (!ptr_written)
+set_btree_node_need_rewrite(b);
out:
mempool_free(iter, &c->fill_iter);
return retry_read;
@@ -1578,6 +1595,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
goto do_write;

new &= ~(1U << BTREE_NODE_write_in_flight);
+new &= ~(1U << BTREE_NODE_write_in_flight_inner);
} while ((v = cmpxchg(&b->flags, old, new)) != old);

wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
@@ -1596,10 +1614,12 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
new &= ~(1U << BTREE_NODE_dirty);
new &= ~(1U << BTREE_NODE_need_write);
new |= (1U << BTREE_NODE_write_in_flight);
+new |= (1U << BTREE_NODE_write_in_flight_inner);
new |= (1U << BTREE_NODE_just_written);
new ^= (1U << BTREE_NODE_write_idx);
} else {
new &= ~(1U << BTREE_NODE_write_in_flight);
+new &= ~(1U << BTREE_NODE_write_in_flight_inner);
}
} while ((v = cmpxchg(&b->flags, old, new)) != old);

@@ -1609,52 +1629,38 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
six_unlock_read(&b->c.lock);
}

-static void bch2_btree_node_write_error(struct bch_fs *c,
-struct btree_write_bio *wbio)
+static void btree_node_write_work(struct work_struct *work)
{
+struct btree_write_bio *wbio =
+container_of(work, struct btree_write_bio, work);
+struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
-struct bkey_buf k;
struct bch_extent_ptr *ptr;
-struct btree_trans trans;
-struct btree_iter *iter;
int ret;

-bch2_bkey_buf_init(&k);
-bch2_trans_init(&trans, c, 0, 0);
-
-iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
-BTREE_MAX_DEPTH, b->c.level, 0);
-retry:
-ret = bch2_btree_iter_traverse(iter);
-if (ret)
-goto err;
-
-/* has node been freed? */
-if (iter->l[b->c.level].b != b) {
-/* node has been freed: */
-BUG_ON(!btree_node_dying(b));
-goto out;
-}
-
-BUG_ON(!btree_node_hashed(b));
-
-bch2_bkey_buf_copy(&k, c, &b->key);
+btree_bounce_free(c,
+wbio->data_bytes,
+wbio->wbio.used_mempool,
+wbio->data);

-bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
+bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

-if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
+if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
goto err;

-ret = bch2_btree_node_update_key(&trans, iter, b, k.k);
-if (ret == -EINTR)
-goto retry;
-if (ret)
-goto err;
+if (wbio->wbio.first_btree_write) {
+if (wbio->wbio.failed.nr) {
+
+}
+} else {
+ret = bch2_trans_do(c, NULL, NULL, 0,
+bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+!wbio->wbio.failed.nr));
+if (ret)
+goto err;
+}
out:
-bch2_trans_iter_put(&trans, iter);
-bch2_trans_exit(&trans);
-bch2_bkey_buf_exit(&k, c);
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
return;
@@ -1664,58 +1670,14 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
goto out;
}

-void bch2_btree_write_error_work(struct work_struct *work)
-{
-struct bch_fs *c = container_of(work, struct bch_fs,
-btree_write_error_work);
-struct bio *bio;
-
-while (1) {
-spin_lock_irq(&c->btree_write_error_lock);
-bio = bio_list_pop(&c->btree_write_error_list);
-spin_unlock_irq(&c->btree_write_error_lock);
-
-if (!bio)
-break;
-
-bch2_btree_node_write_error(c,
-container_of(bio, struct btree_write_bio, wbio.bio));
-}
-}
-
-static void btree_node_write_work(struct work_struct *work)
-{
-struct btree_write_bio *wbio =
-container_of(work, struct btree_write_bio, work);
-struct bch_fs *c = wbio->wbio.c;
-struct btree *b = wbio->wbio.bio.bi_private;
-
-btree_bounce_free(c,
-wbio->bytes,
-wbio->wbio.used_mempool,
-wbio->data);
-
-if (wbio->wbio.failed.nr) {
-unsigned long flags;
-
-spin_lock_irqsave(&c->btree_write_error_lock, flags);
-bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
-spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
-
-queue_work(c->btree_error_wq, &c->btree_write_error_work);
-return;
-}
-
-bio_put(&wbio->wbio.bio);
-btree_node_write_done(c, b);
-}

static void btree_node_write_endio(struct bio *bio)
{
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
+struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
struct bch_fs *c = wbio->c;
+struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;

@@ -1736,13 +1698,13 @@ static void btree_node_write_endio(struct bio *bio)
if (parent) {
bio_put(bio);
bio_endio(&parent->bio);
-} else {
-struct btree_write_bio *wb =
-container_of(orig, struct btree_write_bio, wbio);
-
-INIT_WORK(&wb->work, btree_node_write_work);
-queue_work(c->io_complete_wq, &wb->work);
+return;
}
+
+clear_btree_node_write_in_flight_inner(b);
+wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+INIT_WORK(&wb->work, btree_node_write_work);
+queue_work(c->btree_io_complete_wq, &wb->work);
}

static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1767,8 +1729,15 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+struct bch_extent_ptr *ptr;
+__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

+bkey_copy(&tmp.k, &wbio->key);
+
+bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+ptr->offset += wbio->sector_offset;
+
-bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
+bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
}

void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
@@ -1778,7 +1747,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
-struct bch_extent_ptr *ptr;
struct sort_iter sort_iter;
struct nonce nonce;
unsigned bytes_to_write, sectors_to_write, bytes, u64s;
@@ -1818,6 +1786,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
new &= ~(1 << BTREE_NODE_dirty);
new &= ~(1 << BTREE_NODE_need_write);
new |= (1 << BTREE_NODE_write_in_flight);
+new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
@@ -1969,37 +1938,30 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
wbio->data = data;
-wbio->bytes = bytes;
+wbio->data_bytes = bytes;
+wbio->sector_offset = b->written;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
+wbio->wbio.first_btree_write = !b->written;
wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;

bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

-/*
- * If we're appending to a leaf node, we don't technically need FUA -
- * this write just needs to be persisted before the next journal write,
- * which will be marked FLUSH|FUA.
- *
- * Similarly if we're writing a new btree root - the pointer is going to
- * be in the next journal entry.
- *
- * But if we're writing a new btree node (that isn't a root) or
- * appending to a non leaf btree node, we need either FUA or a flush
- * when we write the parent with the new pointer. FUA is cheaper than a
- * flush, and writes appending to leaf nodes aren't blocking anything so
- * just make all btree node writes FUA to keep things sane.
- */
-
bkey_copy(&wbio->key, &b->key);

-bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
-ptr->offset += b->written;
-
b->written += sectors_to_write;

+if (wbio->wbio.first_btree_write &&
+b->key.k.type == KEY_TYPE_btree_ptr_v2)
+bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+cpu_to_le16(b->written);
+
+if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+cpu_to_le16(b->written);
+
atomic64_inc(&c->btree_writes_nr);
atomic64_add(sectors_to_write, &c->btree_writes_sectors);

@@ -2008,6 +1970,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
return;
err:
set_btree_node_noevict(b);
+if (!b->written &&
+b->key.k.type == KEY_TYPE_btree_ptr_v2)
+bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+cpu_to_le16(sectors_to_write);
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
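Putting the read-side changes above together, as paraphrased pseudocode
(not the actual control flow of bch2_btree_node_read_done()):

unsigned ptr_written = btree_ptr_sectors_written(&b->key);

/* Stop exactly at sectors_written when the pointer records it: */
while (b->written < (ptr_written ?: c->opts.btree_node_size))
	; /* read and validate the next bset */

if (ptr_written && b->written < ptr_written)
	; /* lost write detected: "btree node data missing" */

if (!ptr_written) {
	/*
	 * Node predates btree_ptr_sectors_written: scan the rest of
	 * the node for stray bset signatures as before, and queue
	 * the node to be rewritten with sectors_written set:
	 */
	set_btree_node_need_rewrite(b);
}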
(Diff for the remaining 10 changed files is not shown.)
