diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 50b5fc1c06d7..ea7f7a17a3d5 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; } + bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); return bbio; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index dc2eb43b7097..00883aea55d7 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -82,6 +82,8 @@ struct btrfs_bio { /* Save the first error status of split bio. */ blk_status_t status; + /* Use the commit root to look up csums (data read bio only). */ + bool csum_search_commit_root; /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 35e3071cec06..06e119ee2649 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -602,6 +602,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->compressed_len = compressed_len; cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; + cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; btrfs_free_extent_map(em); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b21cb72835cc..7ab45b234621 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,26 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; + /* + * For data read bios, we attempt to optimize csum lookups if the extent + * generation is older than the current one. To make this possible, we + * need to track the maximum generation of an extent in a bio_ctrl to + * make the decision when submitting the bio. 
+ * + * The pattern between btrfs_do_readpage(), submit_one_bio() and + * submit_extent_folio() is quite subtle, so tracking this is tricky. + * + * As we process extent E, we might submit a bio with existing built up + * extents before adding E to a new bio, or we might just add E to the + * bio. As a result, E's generation could apply to the current bio or + * to the next one, so we need to be careful to update the bio_ctrl's + * generation with E's only when we are sure E is added to bio_ctrl->bbio + * in submit_extent_folio(). + * + * See the comment in btrfs_lookup_bio_sums() for more detail on the + * need for this optimization. + */ + u64 generation; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; @@ -131,6 +151,26 @@ struct btrfs_bio_ctrl { u64 last_em_start; }; +/* + * Set csum_search_commit_root on bio_ctrl->bbio before the bio is submitted. + * Only data inode read bios are touched; the flag is true only if + * bio_ctrl->generation is non-zero and below btrfs_get_fs_generation(). + * Only for use by submit_one_bio(). + */ +static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + ASSERT(bbio); + + if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) + return; + + bio_ctrl->bbio->csum_search_commit_root = + (bio_ctrl->generation && + bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); +} + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; @@ -141,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); + bio_set_csum_search_commit_root(bio_ctrl); + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); @@ -149,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; + /* + * We used the generation to decide 
whether to look up csums in the + * commit_root or not when we called bio_set_csum_search_commit_root() + * above. Now, reset the generation for the next bio. + */ + bio_ctrl->generation = 0; } /* @@ -719,6 +767,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one + * @read_em_generation: generation of the extent_map we are submitting + * (only used for read, writes pass 0) * * The will either add the page into the existing @bio_ctrl->bbio, or allocate a * new one in @bio_ctrl->bbio. @@ -727,7 +777,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, */ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct folio *folio, - size_t size, unsigned long pg_offset) + size_t size, unsigned long pg_offset, + u64 read_em_generation) { struct btrfs_inode *inode = folio_to_inode(folio); loff_t file_offset = folio_pos(folio) + pg_offset; @@ -758,6 +809,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + /* + * Now that the folio is definitely added to the bio, include its + * generation in the max generation calculation. 
+ */ + bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) @@ -960,6 +1016,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; u64 block_start; + u64 em_gen; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { @@ -1043,6 +1100,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bio_ctrl->last_em_start = em->start; + em_gen = em->generation; btrfs_free_extent_map(em); em = NULL; @@ -1066,7 +1124,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, - pg_offset); + pg_offset, em_gen); } return 0; } @@ -1600,7 +1658,7 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(folio_test_writeback(folio)); submit_extent_folio(bio_ctrl, disk_bytenr, folio, - sectorsize, filepos - folio_pos(folio)); + sectorsize, filepos - folio_pos(folio), 0); return 0; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c09fbc257634..4dd3d8a02519 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } + /* + * If we are searching for a csum of an extent from a past + * transaction, we can search in the commit root and reduce + * lock contention on the csum tree extent buffers. + * + * This is important because that lock is an rwsem which gets + * pretty heavy write load under memory pressure and sustained + * csum overwrites, unlike the commit_root_sem. (Memory pressure + * makes us write back the nodes multiple times per transaction, + * which makes us COW them each time, taking the write lock.) 
+ * + * Due to how rwsem is implemented, there is a possible + * priority inversion where the readers holding the lock don't + * get scheduled (say they're in a cgroup stuck in heavy reclaim) + * which then blocks writers, including transaction commit. By + * using a semaphore with fewer writers (only a commit switching + * the roots), we make this issue less likely. + * + * Note that we don't rely on btrfs_search_slot() to take the + * commit_root_sem. We call it multiple times, which would + * create a potential race where a commit comes in between searches + * while we are not holding the commit_root_sem, and we get csums + * from across transactions. + */ + if (bbio->csum_search_commit_root) { + path->search_commit_root = 1; + path->skip_locking = 1; + down_read(&fs_info->commit_root_sem); + } + while (bio_offset < orig_len) { int count; u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; @@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } + if (bbio->csum_search_commit_root) + up_read(&fs_info->commit_root_sem); return ret; }