diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad66b..460eced3f5bd01 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 708d843daa72de..ede9251e489beb 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include "misc.h" #include "ctree.h" @@ -539,6 +540,134 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. + */ +static int sample_block_group_extent_item(struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + int ret = 0; + u64 search_offset; + struct btrfs_path *path; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + down_read(&fs_info->commit_root_sem); + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + key->objectid = block_group->start + search_offset; + key->type = BTRFS_EXTENT_ITEM_KEY; + key->offset = 0; + + ret = btrfs_search_slot(NULL, extent_root, key, path, 0, 0); + if (ret != 0) + goto out; + if (key->objectid < block_group->start || + key->objectid > block_group->start + block_group->length) { + ret = 1; + goto out; + } + if (key->type != BTRFS_EXTENT_ITEM_KEY) { + ret = 1; + goto out; + } +out: + btrfs_free_path(path); + up_read(&fs_info->commit_root_sem); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. + * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. 
+ * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations. + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will still be usable by the smaller size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. + * + * Returns: 0 on success, negative error code on error. + */ +static int load_block_group_size_class(struct btrfs_block_group *block_group) +{ + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } + +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -738,6 +867,8 @@ static noinline void caching_thread(struct btrfs_work *work) wake_up(&caching_ctl->wait); + load_block_group_size_class(block_group); + btrfs_put_caching_control(caching_ctl); btrfs_put_block_group(block_group); } @@ -959,6 +1090,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&trans->transaction->dirty_bgs_lock); mutex_unlock(&trans->transaction->cache_write_mutex); + spin_lock(&fs_info->zone_alloc_list_lock); + if (!list_empty(&block_group->zoned_alloc_list)) + list_del_init(&block_group->zoned_alloc_list); + spin_unlock(&fs_info->zone_alloc_list_lock); + ret = btrfs_remove_free_space_inode(trans, inode, block_group); if (ret) goto out; @@ -1699,6 +1835,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) next: btrfs_put_block_group(bg); + + /* Let other operations proceed. */ + mutex_unlock(&fs_info->reclaim_bgs_lock); + sb_end_write(fs_info->sb); + btrfs_exclop_finish(fs_info); + + cond_resched(); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + sb_start_write(fs_info->sb); + mutex_lock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); @@ -1992,6 +2140,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); INIT_LIST_HEAD(&cache->active_bg_list); + INIT_LIST_HEAD(&cache->zoned_alloc_list); btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); @@ -3330,7 +3479,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock);
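A rough userspace sketch of the sampling arithmetic used by load_block_group_size_class() above may help when reviewing. Only the five-step stride and the 128K/8M thresholds come from this patch; the names used here (calc_size_class, load_size_class, sample_fn, fake_sample) and the function-pointer sampler are illustrative stand-ins, not btrfs code.

#include <stdint.h>
#include <stdio.h>

enum size_class { SZ_NONE, SZ_SMALL, SZ_MEDIUM, SZ_LARGE };

/* Mirrors the 128K / 8M thresholds of btrfs_calc_block_group_size_class(). */
static enum size_class calc_size_class(uint64_t size)
{
	if (size <= 128 * 1024)
		return SZ_SMALL;
	if (size <= 8 * 1024 * 1024)
		return SZ_MEDIUM;
	return SZ_LARGE;
}

/* Stand-in for "look up the first extent item at or after this offset". */
typedef int (*sample_fn)(uint64_t offset, uint64_t *extent_size);

/*
 * Same shape as load_block_group_size_class(): probe five evenly spaced
 * offsets through the block group and keep the smallest size class seen.
 */
static enum size_class load_size_class(uint64_t bg_len, sample_fn sample)
{
	enum size_class result = SZ_NONE;
	uint64_t min_size = bg_len;
	const int max_index = 5;

	for (int i = 0; i < max_index; i++) {
		/* search_offset = index * (length / max_index), as in the patch */
		uint64_t off = (uint64_t)i * (bg_len / max_index);
		uint64_t extent_size;

		if (sample(off, &extent_size))
			continue;	/* nothing useful found at this step */
		if (extent_size < min_size)
			min_size = extent_size;
		result = calc_size_class(min_size);
	}
	return result;
}

/* Toy sampler: pretend every probe finds a 1M extent. */
static int fake_sample(uint64_t offset, uint64_t *extent_size)
{
	(void)offset;
	*extent_size = 1024 * 1024;
	return 0;
}

int main(void)
{
	/* A 1G block group full of 1M extents lands in the medium class. */
	printf("size class = %d\n", load_size_class(1ULL << 30, fake_sample));
	return 0;
}

Because the reduction keeps the minimum extent size, a single small extent among the samples is enough to pull the whole block group into a smaller class, which matches the "take the minimum" reasoning in the comment above.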
while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { @@ -3379,6 +3528,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3583,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. */ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4378,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. 
+ */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. A call to find_free_extent got desperate enough to set + * 'force_wrong_slab'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cffe..264f7d969bb220 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,8 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; + struct list_head zoned_alloc_list; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +315,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -346,4 +360,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea4e..4a5aeb8dd4793a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1609,7 +1609,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - 
if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a76446b..8065341d831a17 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range. diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e231..886ffb232eac2a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -524,7 +521,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } @@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. 
*/ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: @@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4a1..2eb34abf700ff4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ff2e524d993773..6931616fea4749 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -89,6 +89,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); @@ -108,8 +110,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + bool queued; + spin_lock(&discard_ctl->lock); + queued = !list_empty(&block_group->discard_list); + if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; @@ -121,6 +127,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); @@ -131,6 +139,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = false; + bool queued = false; spin_lock(&discard_ctl->lock); @@ -140,7 +149,10 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, } block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); + if 
(queued && !running) + btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -214,10 +226,12 @@ static struct btrfs_block_group *peek_discard_list( if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { - if (btrfs_is_block_group_data_only(block_group)) + if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); - else + } else { list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { @@ -511,6 +525,8 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; + if (list_empty(&block_group->discard_list)) + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); @@ -651,8 +667,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. + */ + btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -683,6 +703,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8aeaada1fcaec2..20d7780cbc1975 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,14 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) btrfs_print_tree(eb, 0); btrfs_err(fs_info, "block=%llu write time tree block corruption detected", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + /* + * Be noisy if this is an extent buffer from a log tree. We don't abort + * a transaction in case there's a bad log tree extent buffer, we just + * fallback to a transaction commit. Still we want to know when there is + * a bad log tree extent buffer, as that may signal a bug somewhere. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || + btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); return ret; } @@ -710,6 +717,10 @@ static void run_one_async_start(struct btrfs_work *work) ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); break; + default: + /* Can't happen so return something that would prevent the IO. 
*/ + ret = BLK_STS_IOERR; + ASSERT(0); } if (ret) async->status = ret; @@ -3051,6 +3062,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { + int i; + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -3098,6 +3111,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); INIT_LIST_HEAD(&fs_info->zone_active_bgs); + for (i = 0; i < BTRFS_NUM_ZONE_ALLOC_LIST; i++) + INIT_LIST_HEAD(&fs_info->zone_alloc_list[i]); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa694a..b2bab76723404b 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9e6..b0636a7b87c77b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include #include #include -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,7 +32,6 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" @@ -1966,7 +1966,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2013,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); @@ -3385,7 +3385,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3453,81 +3455,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. 
- */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3486,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3536,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -3732,6 +3656,44 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, * fs_info::treelog_bg_lock */ +static void dump_zoned_block_groups(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_device *device; + int n = 0; + + btrfs_info(fs_info, "Dump active block groups"); + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + spin_lock(&block_group->lock); + btrfs_info(fs_info, + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", + block_group->start, block_group->length, block_group->used, block_group->pinned, + block_group->reserved, block_group->zone_unusable, + block_group->ro ? 
"[readonly]" : ""); + btrfs_info(fs_info, "free space %llu active %d flags %llu", + block_group->zone_capacity - block_group->alloc_offset, + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags), + block_group->flags); + spin_unlock(&block_group->lock); + n += block_group->physical_map->num_stripes; + } + spin_unlock(&fs_info->zone_active_bgs_lock); + btrfs_info(fs_info, "nactive in list = %d", n); + + list_for_each_entry (device, &fs_info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + if (!device->zone_info) + continue; + btrfs_info(fs_info, " dev active zones %d bitmap %d", + atomic_read(&device->zone_info->active_zones_left), + bitmap_weight(device->zone_info->active_zones, + device->zone_info->nr_zones)); + } +} + /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the @@ -3752,9 +3714,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, u64 data_reloc_bytenr; int ret = 0; bool skip = false; + int cur_bucket, new_bucket; ASSERT(btrfs_is_zoned(block_group->fs_info)); + if (block_group->alloc_offset == block_group->zone_capacity) { + if (ffe_ctl->for_treelog) + btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); + return 1; + } + /* * Do not allow non-tree-log blocks in the dedicated tree-log block * group, and vice versa. @@ -3799,6 +3770,12 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, * May need to clear fs_info->{treelog,data_reloc}_bg. * Return the error after taking the locks. */ + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + pr_info("dupm max_extent_size %llu min_alloc_size %llu num_bytes %llu\n", + ffe_ctl->max_extent_size, ffe_ctl->min_alloc_size, + ffe_ctl->num_bytes); + dump_zoned_block_groups(fs_info); + } } spin_lock(&space_info->lock); @@ -3869,6 +3846,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, ctl->free_space -= num_bytes; spin_unlock(&ctl->tree_lock); + cur_bucket = btrfs_zoned_alloc_bucket(fs_info, avail); + new_bucket = btrfs_zoned_alloc_bucket(fs_info, avail - num_bytes); + if (cur_bucket != new_bucket) { + spin_lock(&fs_info->zone_alloc_list_lock); + if (new_bucket == -1) + list_del_init(&block_group->zoned_alloc_list); + else + list_move_tail(&block_group->zoned_alloc_list, + &fs_info->zone_alloc_list[new_bucket]); + spin_unlock(&fs_info->zone_alloc_list_lock); + } + /* * We do not check if found_offset is aligned to stripesize. The * address is anyway rewritten when using zone append writing. @@ -4028,24 +4017,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
@@ -4079,31 +4050,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4123,8 +4091,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4154,6 +4124,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4220,12 +4205,29 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, if (fs_info->treelog_bg) ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { + } else if (ffe_ctl->for_data_reloc) { spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + int bucket = btrfs_zoned_alloc_bucket(fs_info, ffe_ctl->num_bytes); + int i; + + spin_lock(&fs_info->zone_alloc_list_lock); + for (i = bucket; i < BTRFS_NUM_ZONE_ALLOC_LIST; i++) { + list_for_each_entry(block_group, &fs_info->zone_alloc_list[i], + zoned_alloc_list) { + if (!block_group_bits(block_group, ffe_ctl->flags)) + continue; + + ffe_ctl->hint_byte = block_group->start; + spin_unlock(&fs_info->zone_alloc_list_lock); + return 0; + } + } + spin_unlock(&fs_info->zone_alloc_list_lock); } return 0; default: @@ 
-4288,6 +4290,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,8 +4299,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4340,6 +4342,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4350,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4360,7 @@ static noinline int find_free_extent(struct btrfs_root *root, &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4402,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4421,6 +4427,9 @@ static noinline int find_free_extent(struct btrfs_root *root, if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4455,7 +4464,8 @@ static noinline int find_free_extent(struct btrfs_root *root, ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,8 +4478,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae54252536031c..0c958fc1b3b8a1 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,87 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct 
btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 834bbcb91102fd..5cc5a1faaef5b5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } @@ -3541,6 +3541,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *extent; u64 extent_end; + u8 type; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -3596,10 +3597,16 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, extent); - if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || - btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_PREALLOC) { + /* + * Can't access the extent's disk_bytenr field if this is an + * inline extent, since at that offset, it's where the extent + * data starts. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC || + (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { /* * Explicit hole or prealloc extent, search for delalloc. * A prealloc extent is treated like a hole. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a749367e5ae2a2..bf48b182b4368c 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -119,6 +119,12 @@ enum { /* Indicate that we want to commit the transaction. */ BTRFS_FS_NEED_TRANS_COMMIT, + /* + * Indicate metadata over-commit is disabled. This is set when active + * zone tracking is needed. 
+ */ + BTRFS_FS_NO_OVERCOMMIT, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -343,6 +349,7 @@ struct btrfs_commit_stats { u64 total_commit_dur; }; +#define BTRFS_NUM_ZONE_ALLOC_LIST 8 struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -755,6 +762,9 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + spinlock_t zone_alloc_list_lock; + struct list_head zone_alloc_list[BTRFS_NUM_ZONE_ALLOC_LIST]; + /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd438b..3c49742f0d4556 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -228,7 +228,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { @@ -2969,7 +2969,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -4987,7 +4987,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -7392,7 +7392,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -8080,7 +8080,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, if (IS_ERR(em)) { status = errno_to_blk_status(PTR_ERR(em)); em = NULL; - goto out_err_em; + goto out_err; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), logical, &geom); @@ -8552,7 +8552,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -10995,9 +10995,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc77..71f6d8302d50e2 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608d8..fde5aaa6e7c956 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) } #endif -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - /* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9ab..8c516ee58ff95b 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
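Several hunks in this patch (compression.c, defrag.c, inode.c, lzo.c, relocation.c, send.c) replace open-coded ALIGN()/IS_ALIGNED() against PAGE_SIZE with the PAGE_ALIGN(), PAGE_ALIGN_DOWN() and PAGE_ALIGNED() helpers; the conversions are mechanical. A small userspace sketch of the equivalent power-of-two arithmetic, assuming a 4K page purely for the demo:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096ULL	/* demo assumption; the kernel uses PAGE_SIZE */

/* Generic power-of-two alignment helpers, as used by the open-coded forms. */
#define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)  ((x) & ~((a) - 1))
#define IS_ALIGNED(x, a)  (((x) & ((a) - 1)) == 0)

/* The PAGE_* helpers are just the PAGE_SIZE specialisations of the above. */
#define PAGE_ALIGN(x)       ALIGN_UP((x), DEMO_PAGE_SIZE)
#define PAGE_ALIGN_DOWN(x)  ALIGN_DOWN((x), DEMO_PAGE_SIZE)
#define PAGE_ALIGNED(x)     IS_ALIGNED((x), DEMO_PAGE_SIZE)

int main(void)
{
	uint64_t end = 123456;

	printf("PAGE_ALIGN(%llu)      = %llu\n", (unsigned long long)end,
	       (unsigned long long)PAGE_ALIGN(end));
	printf("PAGE_ALIGN_DOWN(%llu) = %llu\n", (unsigned long long)end,
	       (unsigned long long)PAGE_ALIGN_DOWN(end));
	printf("PAGE_ALIGNED(%llu)    = %d\n", (unsigned long long)end,
	       (int)PAGE_ALIGNED(end));
	return 0;
}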
@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1ad..6c24b69e2d0a37 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. * - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. 
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); - if (wait) { - if (!freespace_inode) - btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } /* @@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 89f82b78f590f3..ae3ed748acaf66 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -187,7 +187,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d275bf24b250bd..af97413abcf43b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2765,9 +2765,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) /* * Old roots should be searched when inserting qgroup - * extent record + * extent record. + * + * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, + * we may have some record inserted during + * NO_ACCOUNTING (thus no old_roots populated), but + * later we start rescan, which clears NO_ACCOUNTING, + * leaving some inserted records without old_roots + * populated. + * + * Those cases are rare and should not cause too much + * time spent during commit_transaction(). 
*/ - if (WARN_ON(!record->old_roots)) { + if (!record->old_roots) { /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) @@ -3357,6 +3367,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; bool stopped = false; + bool did_leaf_rescans = false; path = btrfs_alloc_path(); if (!path) @@ -3377,6 +3388,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } err = qgroup_rescan_leaf(trans, path); + did_leaf_rescans = true; if (err > 0) btrfs_commit_transaction(trans); @@ -3397,16 +3409,23 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has already updated the - * qgroup info. + * Only update status, since the previous part has already updated the + * qgroup info, and only if we did any actual work. This also prevents + * race with a concurrent quota disable, which has already set + * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at + * btrfs_quota_disable(). */ - trans = btrfs_start_transaction(fs_info->quota_root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (did_leaf_rescans) { + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); + } + } else { trans = NULL; - btrfs_err(fs_info, - "fail to start transaction for status update: %d", - err); } mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce6f..ef13a9d4e370f0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 
*/ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b346795f6606..10c26bc8e60e9b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; } - if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + } - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + } shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); @@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if (sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + } +out: return sblock->header_error || sblock->checksum_error; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e65e6b6600a744..0b09ac5c447459 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -956,14 +956,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -5635,7 +5633,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data.
*/ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5757,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d28ee4e36f3d90..653e8183723c8b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -382,6 +382,8 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { + u64 treelog = 0; + /* * On regular filesystem, all total_bytes are always writable. On zoned * filesystem, there may be a limitation imposed by max_active_zones. @@ -392,7 +394,10 @@ static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) return space_info->total_bytes; - return space_info->active_total_bytes; + if (fs_info->treelog_bg) + treelog = fs_info->zone_size; + + return space_info->active_total_bytes - treelog; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, @@ -407,7 +412,8 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && + (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) avail = 0; else avail = calc_available_free_space(fs_info, space_info, flush); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 433ce221dc5c79..e5136baef9af17 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8c52e89688c87..528efe559866b8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2604,6 +2604,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. 
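+ * + * A typical call site (btrfs_some_update() is just a placeholder name here) + * therefore looks like this: + * + *	ret = btrfs_some_update(trans, ...); + *	if (ret) { + *		btrfs_abort_transaction(trans, ret); + *		goto out; + *	}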
+ */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8ce..fa728ab8082614 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fb52aa06009301..997ba92481cb9e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2980,7 +2980,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = 0; if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, ret); btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; @@ -3045,15 +3044,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); - - if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, ret); - mutex_unlock(&log_root_tree->log_mutex); - goto out; - } + if (ret != -ENOSPC) + btrfs_err(fs_info, + "failed to update log for root %llu ret %d", + root->root_key.objectid, ret); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3112,7 +3108,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } else if (ret) { btrfs_set_log_full_commit(trans); - btrfs_abort_transaction(trans, ret); 
mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3581,17 +3576,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3649,6 +3646,18 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and force a transaction commit. + */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3698,7 +3707,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3761,7 +3769,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -3785,7 +3793,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3826,7 +3833,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret > 0) { + ret = 0; } + goto done; } @@ -3846,20 +3856,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + goto done; } + btrfs_release_path(path); /* - * Find the first key from this transaction again. See the note for - * log_new_dir_dentries, if we're logging a directory recursively we - * won't be holding its i_mutex, which means we can modify the directory - * while we're logging it. If we remove an entry between our first - * search and this search we'll not find the key again and can just - * bail. + * Find the first key from this transaction again or the one we were at + * in the loop below in case we had to reschedule. We may be logging the + * directory without holding its VFS lock, which happens when logging new + * dentries (through log_new_dir_dentries()) or in some cases when we + * need to log the parent directory of an inode. This means a dir index + * key might be deleted from the inode's root, and therefore we may not + * find it anymore. If we can't find it, just move to the next key. 
We + * can not bail out and ignore, because if we do that we will simply + * not log dir index keys that come after the one that was just deleted + * and we can end up logging a dir index range that ends at (u64)-1 + * (@last_offset is initialized to that), resulting in removing dir + * entries we should not remove at log replay time. */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret != 0) + if (ret > 0) { + ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } + if (ret < 0) goto done; /* @@ -3870,8 +3897,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3882,10 +3909,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3916,7 +3943,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3927,15 +3954,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* @@ -4031,7 +4056,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4043,8 +4067,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } @@ -6440,7 +6462,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. */ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58fb..bdeb5216718f4f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. 
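+ * For example, with MAX_ERRNO being 4095 this evaluates to -4096, which lies
+ * outside the range of valid negative errno values and therefore can never be
+ * confused with a real error returned by the logging code.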
+ */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; @@ -24,8 +29,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index aa25fa335d3ed1..bcfef75b97da0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -768,8 +768,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, BTRFS_SUPER_FLAG_CHANGING_FSID_V2); error = lookup_bdev(path, &path_devt); - if (error) + if (error) { + btrfs_err(NULL, "failed to lookup block device for path %s: %d", + path, error); return ERR_PTR(error); + } if (fsid_change_in_progress) { if (!has_metadata_uuid) @@ -836,6 +839,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, unsigned int nofs_flag; if (fs_devices->opened) { + btrfs_err(NULL, + "device %s belongs to fsid %pU, and the fs is already mounted", + path, fs_devices->fsid); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } @@ -905,6 +911,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, * generation are equal. */ mutex_unlock(&fs_devices->device_list_mutex); + btrfs_err(NULL, +"device %s already registered with a higher generation, found %llu expect %llu", + path, found_transid, device->generation); return ERR_PTR(-EEXIST); } @@ -2005,42 +2014,42 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } +static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, + struct block_device *bdev, int copy_num) +{ + struct btrfs_super_block *disk_super; + const size_t len = sizeof(disk_super->magic); + const u64 bytenr = btrfs_sb_offset(copy_num); + int ret; + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + if (IS_ERR(disk_super)) + return; + + memset(&disk_super->magic, 0, len); + folio_mark_dirty(virt_to_folio(disk_super)); + btrfs_release_disk_super(disk_super); + + ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); + if (ret) + btrfs_warn(fs_info, "error clearing superblock number %d (%d)", + copy_num, ret); +} + void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct block_device *bdev, const char *device_path) { - struct btrfs_super_block *disk_super; int copy_num; if (!bdev) return; for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { - struct page *page; - int ret; - - disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); - if (IS_ERR(disk_super)) - continue; - - if (bdev_is_zoned(bdev)) { + if (bdev_is_zoned(bdev)) btrfs_reset_sb_log_zones(bdev, copy_num); - continue; - } - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - - page = virt_to_page(disk_super); - set_page_dirty(page); - lock_page(page); - /* write_on_page() unlocks the page */ - ret = write_one_page(page); - if (ret) - btrfs_warn(fs_info, - "error clearing superblock number %d (%d)", - copy_num, ret); - btrfs_release_disk_super(disk_super); - + else + btrfs_scratch_superblock(fs_info, bdev, copy_num); } /* Notify udev that device has changed */ diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a759668477bb2e..c5a67d21b66d7f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -18,6 +18,13 @@ #include "fs.h" #include "accessors.h" +/* Used to cache the necessary information from blk_zone 
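+ * (only the write pointer, capacity and condition). The remaining blk_zone
+ * fields are reconstructed from the zone index and the seq_zones bitmap when
+ * the cache is read, which keeps the per-device cache smaller than caching
+ * full struct blk_zone entries.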
*/ +struct cached_zone { + u64 wp; + u64 capacity; + u8 cond; +}; + /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 /* Invalid allocation pointer value for missing devices */ @@ -160,7 +167,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { @@ -220,7 +227,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; + struct cached_zone *zone_cache; int ret; if (!*nr_zones) @@ -234,6 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { + u32 zno; + const u8 zone_sectors_shift = zinfo->zone_size_shift - SECTOR_SHIFT; unsigned int i; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); @@ -245,19 +254,26 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); for (i = 0; i < *nr_zones; i++) { - struct blk_zone *zone_info; + zone_cache = &zinfo->zone_cache[zno + i]; - zone_info = &zinfo->zone_cache[zno + i]; - if (!zone_info->len) - break; - } + zones[i].start = (__u64)(zno + i) << zone_sectors_shift; + zones[i].len = (__u64)1 << zone_sectors_shift; + zones[i].wp = zone_cache->wp; + zones[i].cond = zone_cache->cond; + zones[i].capacity = zone_cache->capacity; - if (i == *nr_zones) { - /* Cache hit on all the zones */ - memcpy(zones, zinfo->zone_cache + zno, - sizeof(*zinfo->zone_cache) * *nr_zones); - return 0; + /* + * We don't distinguish SEQWRITE_PREF vs SEQWRITE_REQ in + * zoned btrfs, so we can safely set + * BLK_ZONE_TYPE_SEQWRITE_REQ. + */ + if (test_bit(zno + i, zinfo->seq_zones)) + zones[i].type = BLK_ZONE_TYPE_SEQWRITE_REQ; + else + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; } + + return 0; } ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, @@ -273,11 +289,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!ret) return -EIO; - /* Populate cache */ - if (zinfo->zone_cache) - memcpy(zinfo->zone_cache + zno, zones, - sizeof(*zinfo->zone_cache) * *nr_zones); - return 0; } @@ -350,17 +361,95 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) return ret; } +static int populate_zone_info(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct btrfs_device *device = data; + struct btrfs_zoned_device_info *zone_info = device->zone_info; + int i; + + /* Populate bitmaps and track active zones */ + if (zone->type == BLK_ZONE_TYPE_SEQWRITE_REQ) + __set_bit(idx, zone_info->seq_zones); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + __set_bit(idx, zone_info->empty_zones); + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + __set_bit(idx, zone_info->active_zones); + if (zone_info->max_active_zones && + atomic_dec_return(&zone_info->active_zones_left) < 0) { + btrfs_err_in_rcu(device->fs_info, + "zoned: active zones on %s exceeds max_active_zones %u", + rcu_str_deref(device->name), + zone_info->max_active_zones); + return -EIO; + } + /* Overcommit does not work well with active zone tracking. 
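+ * With a limit on active zones, free space in zones that cannot be activated
+ * is not immediately writable, so overcommitting metadata space could promise
+ * space that we are unable to actually use.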
*/ + if (zone_info->max_active_zones) + set_bit(BTRFS_FS_NO_OVERCOMMIT, &device->fs_info->flags); + break; + } + + /* Copy and validate superblock zones */ + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + int sb_pos; + u32 sb_zone; + struct blk_zone *sb_zones; + + sb_zone = sb_zone_number(zone_info->zone_size_shift, i); + if (idx < sb_zone || sb_zone + BTRFS_NR_SB_LOG_ZONES <= idx) + continue; + + sb_pos = BTRFS_NR_SB_LOG_ZONES * i + (idx - sb_zone); + sb_zones = &zone_info->sb_zones[sb_pos]; + memcpy(sb_zones, zone, sizeof(*zone)); + + if (idx == sb_zone + BTRFS_NR_SB_LOG_ZONES - 1) { + u64 sb_wp; + int ret; + + /* + * If sb_zones[0] is conventional, always use the + * beginning of the zone to record superblock. No need + * to validate in that case. + */ + if (sb_zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) + continue; + + ret = sb_write_pointer(device->bdev, sb_zones, &sb_wp); + if (ret != -ENOENT && ret) { + btrfs_err_in_rcu( + device->fs_info, + "zoned: super block log zone corrupted devid %llu zone %u", + device->devid, sb_zone); + return -EUCLEAN; + } + } + } + + /* Populate zone cache */ + if (zone_info->zone_cache) { + struct cached_zone *cache = &zone_info->zone_cache[idx]; + + cache->wp = zone->wp; + cache->cond = zone->cond; + cache->capacity = zone->capacity; + } + + return 0; +} + int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; unsigned int max_active_zones; - unsigned int nactive; sector_t nr_sectors; - sector_t sector = 0; - struct blk_zone *zones = NULL; - unsigned int i, nreported = 0, nr_zones; sector_t zone_sectors; char *model, *emulated; int ret; @@ -449,6 +538,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } zone_info->max_active_zones = max_active_zones; + atomic_set(&zone_info->active_zones_left, max_active_zones); zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { @@ -468,20 +558,15 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); - if (!zones) { - ret = -ENOMEM; - goto out; - } - /* * Enable zone cache only for a zoned device. On a non-zoned device, we * fill the zone info with emulated CONVENTIONAL zones, so no need to * use the cache. 
*/ if (populate_cache && bdev_is_zoned(device->bdev)) { - zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * - zone_info->nr_zones); + zone_info->zone_cache = kvmalloc_array(zone_info->nr_zones, + sizeof(struct cached_zone), + GFP_KERNEL); if (!zone_info->zone_cache) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to allocate zone cache for %s", @@ -491,104 +576,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } } - /* Get zones type */ - nactive = 0; - while (sector < nr_sectors) { - nr_zones = BTRFS_REPORT_NR_ZONES; - ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, - &nr_zones); - if (ret) + if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) { + /* Report all the zones and populate zone_info's structures */ + ret = blkdev_report_zones(device->bdev, 0, BLK_ALL_ZONES, + populate_zone_info, device); + if (ret < 0) goto out; - for (i = 0; i < nr_zones; i++) { - if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) - __set_bit(nreported, zone_info->seq_zones); - switch (zones[i].cond) { - case BLK_ZONE_COND_EMPTY: - __set_bit(nreported, zone_info->empty_zones); - break; - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - __set_bit(nreported, zone_info->active_zones); - nactive++; - break; - } - nreported++; - } - sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; - } - - if (nreported != zone_info->nr_zones) { - btrfs_err_in_rcu(device->fs_info, - "inconsistent number of zones on %s (%u/%u)", - rcu_str_deref(device->name), nreported, - zone_info->nr_zones); - ret = -EIO; - goto out; - } - - if (max_active_zones) { - if (nactive > max_active_zones) { + if (ret != zone_info->nr_zones) { btrfs_err_in_rcu(device->fs_info, - "zoned: %u active zones on %s exceeds max_active_zones %u", - nactive, rcu_str_deref(device->name), - max_active_zones); + "inconsistent number of zones on %s (%u/%u)", + rcu_str_deref(device->name), ret, + zone_info->nr_zones); ret = -EIO; goto out; } - atomic_set(&zone_info->active_zones_left, - max_active_zones - nactive); } - /* Validate superblock log */ - nr_zones = BTRFS_NR_SB_LOG_ZONES; - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - u32 sb_zone; - u64 sb_wp; - int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; - - sb_zone = sb_zone_number(zone_info->zone_size_shift, i); - if (sb_zone + 1 >= zone_info->nr_zones) - continue; - - ret = btrfs_get_dev_zones(device, - zone_start_physical(sb_zone, zone_info), - &zone_info->sb_zones[sb_pos], - &nr_zones); - if (ret) - goto out; - - if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { - btrfs_err_in_rcu(device->fs_info, - "zoned: failed to read super block log zone info at devid %llu zone %u", - device->devid, sb_zone); - ret = -EUCLEAN; - goto out; - } - - /* - * If zones[0] is conventional, always use the beginning of the - * zone to record superblock. No need to validate in that case. 
- */ - if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == - BLK_ZONE_TYPE_CONVENTIONAL) - continue; - - ret = sb_write_pointer(device->bdev, - &zone_info->sb_zones[sb_pos], &sb_wp); - if (ret != -ENOENT && ret) { - btrfs_err_in_rcu(device->fs_info, - "zoned: super block log zone corrupted devid %llu zone %u", - device->devid, sb_zone); - ret = -EUCLEAN; - goto out; - } - } - - - kvfree(zones); - switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: model = "host-managed zoned"; @@ -608,7 +612,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) bdev_zoned_model(bdev), rcu_str_deref(device->name)); ret = -EOPNOTSUPP; - goto out_free_zone_info; + goto out; } btrfs_info_in_rcu(fs_info, @@ -619,8 +623,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kvfree(zones); -out_free_zone_info: btrfs_destroy_dev_zone_info(device); return ret; @@ -636,7 +638,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); - vfree(zone_info->zone_cache); + kvfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } @@ -1546,6 +1548,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (!ret) { + u64 avail = cache->zone_capacity - cache->alloc_offset; + int bucket = btrfs_zoned_alloc_bucket(fs_info, avail); + cache->meta_write_pointer = cache->alloc_offset + cache->start; if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { btrfs_get_block_group(cache); @@ -1554,6 +1559,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); } + if (bucket != -1) { + spin_lock(&fs_info->zone_alloc_list_lock); + list_add_tail(&cache->zoned_alloc_list, + &fs_info->zone_alloc_list[bucket]); + spin_unlock(&fs_info->zone_alloc_list_lock); + } } else { kfree(cache->physical_map); cache->physical_map = NULL; @@ -2036,6 +2047,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); + spin_lock(&fs_info->zone_alloc_list_lock); + if (!list_empty(&block_group->zoned_alloc_list)) + list_del_init(&block_group->zoned_alloc_list); + spin_unlock(&fs_info->zone_alloc_list_lock); spin_unlock(&block_group->lock); map = block_group->physical_map; @@ -2193,7 +2208,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->zone_info) { - vfree(device->zone_info->zone_cache); + kvfree(device->zone_info->zone_cache); device->zone_info->zone_cache = NULL; } } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d802c..b7930fdc11d9a8 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -13,6 +13,7 @@ #define BTRFS_DEFAULT_RECLAIM_THRESH (75) +struct cached_zone; struct btrfs_zoned_device_info { /* * Number of zones, zone size and types of zones if bdev is a @@ -27,7 +28,7 @@ struct btrfs_zoned_device_info { unsigned long *seq_zones; unsigned long *empty_zones; unsigned long *active_zones; - struct blk_zone *zone_cache; + struct cached_zone *zone_cache; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -418,4 +419,21 @@ static inline bool btrfs_zoned_bg_is_full(const struct 
btrfs_block_group *bg) return (bg->alloc_offset == bg->zone_capacity); } +static inline int btrfs_zoned_alloc_bucket(struct btrfs_fs_info *fs_info, u64 avail) +{ + u64 bucket_size; + unsigned long bucket_size_shift; + + if (avail == 0) + return -1; + if (avail == fs_info->zone_size) + return BTRFS_NUM_ZONE_ALLOC_LIST - 1; + + bucket_size = fs_info->zone_size >> const_ilog2(BTRFS_NUM_ZONE_ALLOC_LIST); + bucket_size_shift = ilog2(bucket_size); + return avail >> bucket_size_shift; +} + #endif diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 6548b5b5aa6080..75d7d22c3a276c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -32,6 +32,7 @@ struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; +struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_root *root, u64 num_bytes, - u64 empty_size, u64 data), + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(root, num_bytes, empty_size, data), + TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) - __field( u64, data ) + __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS)) +); + +TRACE_EVENT(find_free_extent_search_loop, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), + + TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop) +); + +TRACE_EVENT(find_free_extent_have_block_group, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *block_group), + + TP_ARGS(root, ffe_ctl, block_group), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + __field( bool, hinted ) + __field( u64, bg_start ) + __field( u64, bg_flags ) + ), + + 
TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->bg_start = block_group->start; + __entry->bg_flags = block_group->flags; + ), + + TP_printk_btrfs( +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop, __entry->hinted, + __entry->bg_start, __entry->bg_flags, + __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len), + TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) + __field( u64, loop ) + __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; + __entry->bg_size_class = block_group->size_class; + __entry->start = ffe_ctl->search_start; + __entry->len = ffe_ctl->num_bytes; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), - TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " - "start=%llu len=%llu", + TP_printk_btrfs( +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster,