From 240450957e75747df4de9f64e2055557f7bf8d41 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 12 Dec 2022 01:02:46 -0800 Subject: [PATCH 01/48] btrfs: drop unused trans parameter of drop_delayed_ref drop_delayed_ref() doesn't use the btrfs_trans_handle it gets passed in, so remove it. Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e231..663e7493926fea 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -482,10 +481,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -641,7 +640,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: From 605a44664ecbda4ad93a540d6c95563e9f36ae45 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 12 Dec 2022 01:02:47 -0800 Subject: [PATCH 02/48] btrfs: remove trans parameter of merge_ref Now that drop_delayed_ref() doesn't get the btrfs_trans_handle passed in anymore, we can get rid of it in merge_ref() as well. Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 663e7493926fea..046ba49b8f9488 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -451,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -523,7 +522,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } From bccf28752a99d55b4bf8db00dedd868109640e14 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 12 Dec 2022 01:02:48 -0800 Subject: [PATCH 03/48] btrfs: drop trans parameter of insert_delayed_ref Now that drop_delayed_ref() doesn't need a btrfs_trans_handle, drop it from insert_delayed_ref() as well. 
Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 046ba49b8f9488..678ce95c04ac62 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -599,8 +599,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. */ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -976,7 +975,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1068,7 +1067,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* From 87807cc366f8faaeb17528d22d1a5cf2296d1c07 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 12 Dec 2022 01:02:49 -0800 Subject: [PATCH 04/48] btrfs: directly pass in fs_info to btrfs_merge_delayed_refs Now that none of the functions called by btrfs_merge_delayed_refs() needs a btrfs_trans_handle, directly pass in a btrfs_fs_info to btrfs_merge_delayed_refs(). 
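Taken together, patches 1 through 4 shrink this call chain so that each level receives only what it actually uses. As a quick reference, the resulting signatures (collected from the diffs above and below; bodies elided) are:

static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
                                    struct btrfs_delayed_ref_head *head,
                                    struct btrfs_delayed_ref_node *ref);

static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
                      struct btrfs_delayed_ref_head *head,
                      struct btrfs_delayed_ref_node *ref, u64 seq);

static int insert_delayed_ref(struct btrfs_delayed_ref_root *root,
                              struct btrfs_delayed_ref_head *href,
                              struct btrfs_delayed_ref_node *ref);

void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
                              struct btrfs_delayed_ref_root *delayed_refs,
                              struct btrfs_delayed_ref_head *head);

The btrfs_trans_handle is gone from every prototype; only btrfs_merge_delayed_refs() still needs fs_info, and it now takes it directly instead of dereferencing trans->fs_info.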
Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 3 +-- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/extent-tree.c | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 678ce95c04ac62..886ffb232eac2a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -497,11 +497,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4a1..2eb34abf700ff4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9e6..d1a4e51f8fbc1c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1966,7 +1966,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2013,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); From bccf28752a99d55b4bf8db00dedd868109640e14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Dec 2022 10:18:04 -0500 Subject: [PATCH 05/48] btrfs: move btrfs_abort_transaction to transaction.c While trying to sync messages.[ch] I ended up with an awkward dependency on messages.h in the rest of the btrfs-progs code base because that's where btrfs_abort_transaction() now lived. We want to keep messages.[ch] limited to the kernel code, and the btrfs_abort_transaction() code better fits in the transaction code and not in messages.
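For context on what this macro does at a call site, here is a minimal sketch (the helper name do_tree_mod() is hypothetical; the macro and its semantics are as in the code being moved below):

/* Hedged sketch of a typical caller; do_tree_mod() is a made-up helper. */
static int example_update(struct btrfs_trans_handle *trans)
{
        int ret;

        ret = do_tree_mod(trans);
        if (ret) {
                /*
                 * Abort as early as possible so the stack trace points at
                 * the real failure, then just propagate the error; the
                 * cleanup finishes in btrfs_end_transaction() and
                 * btrfs_commit_transaction().
                 */
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        return 0;
}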
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ move the __cold attributes ] Signed-off-by: David Sterba --- fs/btrfs/messages.c | 30 ------------------------------ fs/btrfs/messages.h | 34 ---------------------------------- fs/btrfs/transaction.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/transaction.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608d8..fde5aaa6e7c956 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) } #endif -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - /* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9ab..8c516ee58ff95b 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) @@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8c52e89688c87..528efe559866b8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2604,6 +2604,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8ce..fa728ab8082614 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. 
*/ \ } else { \ btrfs_err((trans)->fs_info, \ "Transaction aborted (error %d)", \ (errno)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno), first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); From bd8b4d2e60b8ad62e4e968bdde9acc5c65a3bb93 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:52 -0500 Subject: [PATCH 06/48] btrfs: fix uninitialized variable warning in btrfs_cleanup_ordered_extents We can conditionally pass in a locked page, and then we'll use that page range to skip marking errors as that will happen in another layer. However this causes the compiler to complain because it doesn't understand we only use these values when we have the page. Make the compiler stop complaining by setting these values to 0. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd438b..77c2acc06891f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -228,7 +228,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { From edadfc69d816ab62667330a5ef1efd6d2a8982de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:53 -0500 Subject: [PATCH 07/48] btrfs: fix uninitialized variable warning in get_inode_gen Anybody that calls get_inode_gen() can have an uninitialized gen if there's an error. This isn't a big deal because all the users just exit if they get an error, however it makes -Wmaybe-uninitialized complain. Fix this up to always initialize the passed-in gen; this quiets all of the uninitialized warnings in send.c.
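The warning pattern this and the following patches silence can be reduced to a few lines. A self-contained sketch (hypothetical names, not the send.c code):

struct info_sketch { unsigned long long gen; };

static int get_info_sketch(int ino, struct info_sketch *info)
{
        if (ino < 0)
                return -1;      /* error path leaves info->gen unwritten */
        info->gen = 1;
        return 0;
}

static int get_gen_sketch(int ino, unsigned long long *gen)
{
        struct info_sketch info;        /* the fix zero-initializes this */
        int ret;

        ret = get_info_sketch(ino, &info);
        if (!ret)
                *gen = info.gen;        /* *gen is unwritten on error */
        return ret;
}

static unsigned long long caller_sketch(int ino)
{
        unsigned long long gen;

        if (get_gen_sketch(ino, &gen))
                return 0;       /* callers bail on error, but the compiler
                                 * cannot always prove that across calls */
        return gen;
}

The patch's fix is the belt-and-suspenders version: zero-initialize the local struct and write *gen unconditionally, so no path leaves the out-parameter untouched.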
Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e65e6b6600a744..b90ad6f7219c1f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -956,14 +956,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } From 4784c21fe7f17b5e5920f4d02f135378fe8a9d52 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:54 -0500 Subject: [PATCH 08/48] btrfs: fix uninitialized variable warning in btrfs_update_block_group reclaim isn't set in the alloc case, however we only care about reclaim in the !alloc case. This isn't an actual problem, but -Wmaybe-uninitialized will complain, so initialize reclaim to quiet the compiler. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 708d843daa72de..e90800388a413f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3330,7 +3330,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { From ca268991a4c4a7943dbd88d418b6ac5142998204 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:55 -0500 Subject: [PATCH 09/48] btrfs: fix uninitialized variable warnings in __set_extent_bit and convert_extent_bit We pass the parent and p pointers into our tree_search function to avoid doing a second search when inserting a new extent state into the tree. However, because this is conditional upon passing in these pointers, the compiler seems to think these values can be uninitialized if we're using -Wmaybe-uninitialized. Fix this by initializing these values.
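The underlying idiom is an rbtree search that optionally reports the would-be insertion point so the caller can insert without a second walk. A hedged, self-contained sketch of that idiom (struct and function names are made up; only the shape matters):

#include <linux/rbtree.h>

struct state_sketch {
        struct rb_node node;
        u64 start;
};

static struct state_sketch *search_for_insert(struct rb_root *root, u64 start,
                                              struct rb_node ***p_ret,
                                              struct rb_node **parent_ret)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct state_sketch *entry;

        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct state_sketch, node);
                if (start < entry->start)
                        p = &parent->rb_left;
                else if (start > entry->start)
                        p = &parent->rb_right;
                else
                        return entry;   /* exact match, nothing to insert */
        }
        if (p_ret)
                *p_ret = p;             /* link point for a new node */
        if (parent_ret)
                *parent_ret = parent;
        return NULL;
}

Because the caller's local p and parent variables are only written when the search misses, the compiler cannot prove they are initialized at the later insertion site, hence the NULL initialization in the patch.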
Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa694a..b2bab76723404b 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; From 8f02e85278e9c5f038d615a925ac15392828dfb8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:57 -0500 Subject: [PATCH 10/48] btrfs: fix uninitialized variable warning in btrfs_sb_log_location We only have 3 possible mirrors, and we have ASSERT()'s to make sure we're not passing an invalid super mirror into this function, so technically this value isn't uninitialized. However -Wmaybe-uninitialized will complain, so set it to U64_MAX; if ASSERT()'s are not turned on it'll then error out later on when it sees the zone is beyond our maximum zones. Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a759668477bb2e..3fbd8bd7ae7178 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -160,7 +160,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { From aee31508ef592d35173209b2602f684b7c263a17 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 21 Dec 2022 16:47:45 +0000 Subject: [PATCH 11/48] btrfs: zoned: fix uninitialized variable warning in btrfs_get_dev_zones Fix an uninitialized variable warning we get with -Wmaybe-uninitialized, where the compiler thought zno may have been uninitialized. In both cases it depends on zinfo->zone_cache, but we know the value won't change between checks.
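Reduced to its essence, the function checks the same condition twice and only computes the value under the first check; a minimal illustration of why that trips the warning (hypothetical names, not the zoned.c code):

static unsigned int sink_sketch;

static void zone_cache_sketch(const unsigned int *cache,
                              unsigned long long pos, unsigned int shift)
{
        unsigned int zno;       /* written only if cache != NULL */

        if (cache)
                zno = pos >> shift;

        /* ... other work the compiler has to reason across ... */

        if (cache)
                sink_sketch = cache[zno];       /* same condition, but gcc
                                                 * may not prove zno was set */
}

The fix simply moves the computation of zno into each block that uses it, so every use is trivially dominated by an assignment.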
Reported-by: Josef Bacik Link: https://lore.kernel.org/linux-btrfs/af6c527cbd8bdc782e50bd33996ee83acc3a16fb.1671221596.git.josef@toxicpanda.com/ Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3fbd8bd7ae7178..bca9feb34c0cd4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -220,7 +220,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; int ret; if (!*nr_zones) @@ -235,6 +234,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { unsigned int i; + u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -274,9 +274,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return -EIO; /* Populate cache */ - if (zinfo->zone_cache) + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); + } return 0; } From f06f7d1a50833c1ad73a21a06b3f4e445915f860 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:51 -0500 Subject: [PATCH 12/48] btrfs: fix uninitialized variable warning in run_one_async_start With -Wmaybe-uninitialized the compiler complains about ret being possibly uninitialized, which isn't possible as the WQ_ constants are set only from our code. However, we can handle the default case and get rid of the warning. The value is set to BLK_STS_IOERR so it does not issue any IO and could be potentially detected, but this is basically a "cannot happen" error. To catch any problems during development use the assert. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ set the error in default: ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8aeaada1fcaec2..a7d5a3967ba0f3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -710,6 +710,10 @@ static void run_one_async_start(struct btrfs_work *work) ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); break; + default: + /* Can't happen so return something that would prevent the IO. */ + ret = BLK_STS_IOERR; + ASSERT(0); } if (ret) async->status = ret; From acbe1be734c948f4e552f435a55d4eecd6b38808 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:58 -0500 Subject: [PATCH 13/48] btrfs: turn on -Wmaybe-uninitialized We had a recent bug that a newer compiler with -Wmaybe-uninitialized would have caught, which would have saved us a month of failing tests that I didn't have time to investigate.
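For anyone wanting to see the class of bug this catches, a tiny user-space illustration (the warning needs optimization enabled to fire, since it relies on flow analysis):

/* example.c (illustrative) */
int f(int cond)
{
        int ret;

        if (cond)
                ret = 1;
        return ret;     /* used uninitialized when cond == 0 */
}

Compiled with something like "gcc -c -O2 -Wmaybe-uninitialized example.c", GCC emits a warning along the lines of "'ret' may be used uninitialized". The $(call cc-option, ...) wrapper in the diff below only adds the flag when the compiler actually supports it.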
Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad66b..460eced3f5bd01 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers From 8de55eecfc075b4838614381613e2c6bba886fe1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Nov 2022 18:47:48 +0100 Subject: [PATCH 14/48] btrfs: factor out scratching of one regular super block btrfs_scratch_superblocks() open codes scratching the super blocks of a non-zoned device. Split the code to read, zero and write the superblock for regular devices into a separate helper. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 51 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index aa25fa335d3ed1..3ac1496abbc927 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2005,42 +2005,43 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } +static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, + struct block_device *bdev, int copy_num) +{ + struct btrfs_super_block *disk_super; + struct page *page; + int ret; + + disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); + if (IS_ERR(disk_super)) + return; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + page = virt_to_page(disk_super); + set_page_dirty(page); + lock_page(page); + /* write_on_page() unlocks the page */ + ret = write_one_page(page); + if (ret) + btrfs_warn(fs_info, "error clearing superblock number %d (%d)", + copy_num, ret); + btrfs_release_disk_super(disk_super); +} + void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct block_device *bdev, const char *device_path) { - struct btrfs_super_block *disk_super; int copy_num; if (!bdev) return; for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { - struct page *page; - int ret; - - disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); - if (IS_ERR(disk_super)) - continue; - - if (bdev_is_zoned(bdev)) { + if (bdev_is_zoned(bdev)) btrfs_reset_sb_log_zones(bdev, copy_num); - continue; - } - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - - page = virt_to_page(disk_super); - set_page_dirty(page); - lock_page(page); - /* write_on_page() unlocks the page */ - ret = write_one_page(page); - if (ret) - btrfs_warn(fs_info, - "error clearing superblock number %d (%d)", - copy_num, ret); - btrfs_release_disk_super(disk_super); - + else + btrfs_scratch_superblock(fs_info, bdev, copy_num); } /* Notify udev that device has changed */ From fa78fc1f56f9113ea36fa7e1034e13ab1be6b425 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Nov 2022 18:47:49 +0100 Subject: [PATCH 15/48] btrfs: stop using write_one_page in btrfs_scratch_superblock write_one_page is an awkward interface that expects
the page locked and ->writepage to be implemented. Replace that by zeroing the signature bytes and synchronizing the block device page using the proper bdev helpers. Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ac1496abbc927..13181f12190100 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2009,23 +2009,22 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, struct block_device *bdev, int copy_num) { struct btrfs_super_block *disk_super; - struct page *page; + const size_t len = sizeof(disk_super->magic); + const u64 bytenr = btrfs_sb_offset(copy_num); int ret; - disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); if (IS_ERR(disk_super)) return; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - page = virt_to_page(disk_super); - set_page_dirty(page); - lock_page(page); - /* write_on_page() unlocks the page */ - ret = write_one_page(page); + memset(&disk_super->magic, 0, len); + folio_mark_dirty(virt_to_folio(disk_super)); + btrfs_release_disk_super(disk_super); + + ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); if (ret) btrfs_warn(fs_info, "error clearing superblock number %d (%d)", copy_num, ret); } void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, From 7175b01074ff60f70feb4f4b4e6142e05e2de3c3 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Mon, 9 Jan 2023 21:08:31 +0100 Subject: [PATCH 16/48] btrfs: go to matching label when cleaning em in btrfs_submit_direct When btrfs_get_chunk_map fails to allocate a new em, the cleanup does not need to be done, so the goto target is out_err, which is consistent with current coding style. Signed-off-by: Peng Hao Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 77c2acc06891f8..7fa1db6a474ab2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8080,7 +8080,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, if (IS_ERR(em)) { status = errno_to_blk_status(PTR_ERR(em)); em = NULL; - goto out_err_em; + goto out_err; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), logical, &geom); From 5d670d52a06cafcee243796013d914be4d5e18b1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Dec 2022 10:19:37 +0800 Subject: [PATCH 17/48] btrfs: add extra error messages to cover non-ENOMEM errors from device_list_add() [BUG] When test case btrfs/219 (aka, mount a registered device but with a lower generation) failed, there is no useful information for the end user to find out what's going wrong. The mount failure just looks like this: # mount -o loop /tmp/219.img2 /mnt/btrfs/ mount: /mnt/btrfs: mount(2) system call failed: File exists. dmesg(1) may have more information after failed mount system call. While the dmesg contains nothing but the loop device change: loop1: detected capacity change from 0 to 524288 [CAUSE] In device_list_add() we have a lot of extra checks to reject invalid cases.
That function also contains the regular device scan result like the following prompt: BTRFS: device fsid 6222333e-f9f1-47e6-b306-55ddd4dcaef4 devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (3027) But unfortunately not all errors have their own error messages, thus if we hit something wrong in device_list_add(), there may be no error messages at all. [FIX] Add error messages for all non-ENOMEM errors. For ENOMEM, I'd say we're in a much worse situation, and there should be some OOM messages way before our call sites. CC: stable@vger.kernel.org # 6.0+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 13181f12190100..bcfef75b97da0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -768,8 +768,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, BTRFS_SUPER_FLAG_CHANGING_FSID_V2); error = lookup_bdev(path, &path_devt); - if (error) + if (error) { + btrfs_err(NULL, "failed to lookup block device for path %s: %d", + path, error); return ERR_PTR(error); + } if (fsid_change_in_progress) { if (!has_metadata_uuid) @@ -836,6 +839,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, unsigned int nofs_flag; if (fs_devices->opened) { + btrfs_err(NULL, + "device %s belongs to fsid %pU, and the fs is already mounted", + path, fs_devices->fsid); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } @@ -905,6 +911,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, * generation are equal. */ mutex_unlock(&fs_devices->device_list_mutex); + btrfs_err(NULL, +"device %s already registered with a higher generation, found %llu expect %llu", + path, found_transid, device->generation); return ERR_PTR(-EEXIST); } From aa4db4a74627357e97aee92f23b26d7077a6a97d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 10 Jan 2023 15:14:17 +0800 Subject: [PATCH 18/48] btrfs: qgroup: do not warn on record without old_roots populated [BUG] There are some reports from the mailing list that since the v6.1 kernel, the WARN_ON() inside btrfs_qgroup_account_extent() gets triggered during rescan: WARNING: CPU: 3 PID: 6424 at fs/btrfs/qgroup.c:2756 btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs] CPU: 3 PID: 6424 Comm: snapperd Tainted: P OE 6.1.2-1-default #1 openSUSE Tumbleweed 05c7a1b1b61d5627475528f71f50444637b5aad7 RIP: 0010:btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs] Call Trace: btrfs_commit_transaction+0x30c/0xb40 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] ? start_transaction+0xc3/0x5b0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] btrfs_qgroup_rescan+0x42/0xc0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] btrfs_ioctl+0x1ab9/0x25c0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] ? __rseq_handle_notify_resume+0xa9/0x4a0 ? mntput_no_expire+0x4a/0x240 ? __seccomp_filter+0x319/0x4d0 __x64_sys_ioctl+0x90/0xd0 do_syscall_64+0x5b/0x80 ? syscall_exit_to_user_mode+0x17/0x40 ? do_syscall_64+0x67/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fd9b790d9bf [CAUSE] Since commit e15e9f43c7ca ("btrfs: introduce BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting"), if our qgroup is already in an inconsistent state, we will no longer do the time-consuming backref walk. This can leave some qgroup records without a valid old_roots ulist.
Normally this is fine, as btrfs_qgroup_account_extents() would also skip those records if we have the NO_ACCOUNTING flag set. But there is a small window: if we have the NO_ACCOUNTING flag set and inserted some qgroup_record without an old_roots ulist, and then the user triggered a qgroup rescan. During btrfs_qgroup_rescan(), we first clear the NO_ACCOUNTING flag, then commit the current transaction. And since we have a qgroup_record with old_roots = NULL, we trigger the WARN_ON() during btrfs_qgroup_account_extents(). [FIX] Unfortunately due to the introduction of the NO_ACCOUNTING flag, the assumption that every qgroup_record would have its old_roots populated is no longer correct. Fix the false alerts and drop the WARN_ON(). Reported-by: Lukas Straub Reported-by: HanatoK Fixes: e15e9f43c7ca ("btrfs: introduce BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting") CC: stable@vger.kernel.org # 6.1 Link: https://lore.kernel.org/linux-btrfs/2403c697-ddaf-58ad-3829-0335fc89df09@gmail.com/ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d275bf24b250bd..00851c86aa8aac 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2765,9 +2765,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) /* * Old roots should be searched when inserting qgroup - * extent record + * extent record. + * + * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, + * we may have some record inserted during + * NO_ACCOUNTING (thus no old_roots populated), but + * later we start rescan, which clears NO_ACCOUNTING, + * leaving some inserted records without old_roots + * populated. + * + * Those cases are rare and should not cause too much + * time spent during commit_transaction(). */ - if (WARN_ON(!record->old_roots)) { + if (!record->old_roots) { /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) From 4c54a5bb366dcafeba22dea3db72ac63cee7a03f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Jan 2023 15:04:32 +0900 Subject: [PATCH 19/48] btrfs: zoned: enable metadata over-commit for non-ZNS setup Commit 79417d040f4f ("btrfs: zoned: disable metadata overcommit for zoned") disabled metadata over-commit to track active zones properly. However, it also introduced a heavy overhead by allocating new metadata block groups and/or flushing dirty buffers to release the space reservations. Specifically, a workload (write only, without any sync operations) worsened its performance from 343.77 MB/sec (v5.19) to 182.89 MB/sec (v6.0). The performance is still bad on current misc-next, at 187.95 MB/sec. And, with this patch applied, it improves back to 326.70 MB/sec (+73.82%). This patch introduces a new fs_info flag, BTRFS_FS_NO_OVERCOMMIT, to indicate that metadata over-commit needs to be disabled. The flag is enabled when a device with a max active zones limit is loaded into a filesystem.
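For reference, the decision this flag gates boils down to whether a metadata reservation may exceed the currently allocated space on the assumption that a new chunk can be allocated later. A simplified sketch of the check in btrfs_can_overcommit() after this patch (illustration, not the exact code):

static bool can_overcommit_sketch(bool no_overcommit, bool is_metadata,
                                  u64 used, u64 total_bytes, u64 avail,
                                  u64 bytes)
{
        if (no_overcommit && is_metadata)
                avail = 0;      /* active zone tracking: never over-commit */

        return used + bytes < total_bytes + avail;
}

On zoned devices without an active zone limit (the non-ZNS case this patch targets), the flag stays clear and metadata reservations can again over-commit instead of forcing chunk allocation or flushing.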
Fixes: 79417d040f4f ("btrfs: zoned: disable metadata overcommit for zoned") CC: stable@vger.kernel.org # 6.0+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/fs.h | 6 ++++++ fs/btrfs/space-info.c | 3 ++- fs/btrfs/zoned.c | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a749367e5ae2a2..37b86acfcbcf88 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -119,6 +119,12 @@ enum { /* Indicate that we want to commit the transaction. */ BTRFS_FS_NEED_TRANS_COMMIT, + /* + * Indicate metadata over-commit is disabled. This is set when active + * zone tracking is needed. + */ + BTRFS_FS_NO_OVERCOMMIT, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d28ee4e36f3d90..69c09508afb506 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -407,7 +407,8 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && + (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) avail = 0; else avail = calc_available_free_space(fs_info, space_info, flush); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index bca9feb34c0cd4..d46701a77b1728 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -542,6 +542,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } atomic_set(&zone_info->active_zones_left, max_active_zones - nactive); + /* Overcommit does not work well with active zone tracking. */ + set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); } /* Validate superblock log */ From d284fe4927f949adbe0e05106a48b943b673ce81 Mon Sep 17 00:00:00 2001 From: Yushan Zhou Date: Tue, 3 Jan 2023 13:11:37 +0800 Subject: [PATCH 20/48] btrfs: use PAGE_{ALIGN, ALIGNED, ALIGN_DOWN} macro The header file linux/mm.h provides PAGE_ALIGN, PAGE_ALIGNED, PAGE_ALIGN_DOWN macros. Use these macros to make code more concise. Signed-off-by: Yushan Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/defrag.c | 2 +- fs/btrfs/inode.c | 5 ++--- fs/btrfs/lzo.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/send.c | 4 ++-- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea4e..4a5aeb8dd4793a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1609,7 +1609,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a76446b..2737af7e278069 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7fa1db6a474ab2..49a2e118f561f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10995,9 +10995,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc77..71f6d8302d50e2 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce6f..ef13a9d4e370f0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). */ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b90ad6f7219c1f..0b09ac5c447459 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5633,7 +5633,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. */ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5757,7 +5757,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in From 18c16673092390d21d07bdb0c18dbfbd7712eff2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:34 +0000 Subject: [PATCH 21/48] btrfs: fix missing error handling when logging directory items When logging a directory, at log_dir_items(), if we get an error when attempting to search the subvolume tree for a dir index item, we end up returning 0 (success) from log_dir_items() because 'err' is left with a value of 0. This can lead to a few problems, especially in the case where the variable 'last_offset' has a value of (u64)-1 (and it's initialized to that when it was declared): 1) By returning from log_dir_items() with success (0) and a value of (u64)-1 for '*last_offset_ret', we end up not logging any other dir index keys that follow the missing, just deleted, index key.
The (u64)-1 value makes log_directory_changes() not call log_dir_items() again; 2) Before returning with success (0), log_dir_items() will log a dir index range item covering a range from the last old dentry index (stored in the variable 'last_old_dentry_offset') to the value of 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means if the log is persisted and replayed after a power failure, it will cause deletion of all the directory entries that have an index number between last_old_dentry_offset + 1 and (u64)-1; 3) We can end up returning from log_dir_items() with ctx->last_dir_item_offset having a lower value than inode->last_dir_index_offset, because the former is set to the current key we are processing at process_dir_items_leaf(), and at the end of log_directory_changes() we set inode->last_dir_index_offset to the current value of ctx->last_dir_item_offset. So if for example a deletion of a lower dir index key happened, we set ctx->last_dir_item_offset to that index value, then if we return from log_dir_items() because btrfs_search_slot() returned an error, we end up returning without any error from log_dir_items() and then log_directory_changes() sets inode->last_dir_index_offset to a lower value than it had before. This can result in unpredictable and unexpected behaviour when we need to log the directory again in the same transaction, and can result in ending up with a log tree leaf that has duplicated keys, as we do batch insertions of dir index keys into a log tree. Fix this by setting 'err' to the value of 'ret' in case btrfs_search_slot() or btrfs_previous_item() returned an error. That will result in falling back to a full transaction commit. Reported-by: David Arendt Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ Fixes: e02119d5a7b4 ("Btrfs: Add a write ahead tree log to optimize synchronous operations") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fb52aa06009301..3ef0266e952740 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3826,7 +3826,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + err = ret; } + goto done; } @@ -3846,7 +3849,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + err = ret; + goto done; } + btrfs_release_path(path); /* @@ -3859,6 +3866,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) + err = ret; if (ret != 0) goto done; From 5c7eafa405c47851e6116296b235e2d0ab78ceb8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:35 +0000 Subject: [PATCH 22/48] btrfs: fix directory logging due to race with concurrent index key deletion Sometimes we log a directory without holding its VFS lock, so while we are logging it, dir index entries may be added or removed.
This typically happens when logging a dentry from a parent directory that points to a new directory, through log_new_dir_dentries(), or when, while logging some other inode, we also need to log its parent directories (through btrfs_log_all_parents()). This means that while we are at log_dir_items(), we may not find a dir index key we found before, because it was deleted in the meantime, so a call to btrfs_search_slot() may return 1 (key not found). In that case we return from log_dir_items() with a success value (the variable 'err' has a value of 0). This can lead to a few problems, especially in the case where the variable 'last_offset' has a value of (u64)-1 (and it's initialized to that when it was declared): 1) By returning from log_dir_items() with success (0) and a value of (u64)-1 for '*last_offset_ret', we end up not logging any other dir index keys that follow the missing, just deleted, index key. The (u64)-1 value makes log_directory_changes() not call log_dir_items() again; 2) Before returning with success (0), log_dir_items() will log a dir index range item covering a range from the last old dentry index (stored in the variable 'last_old_dentry_offset') to the value of 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means if the log is persisted and replayed after a power failure, it will cause deletion of all the directory entries that have an index number between last_old_dentry_offset + 1 and (u64)-1; 3) We can end up returning from log_dir_items() with ctx->last_dir_item_offset having a lower value than inode->last_dir_index_offset, because the former is set to the current key we are processing at process_dir_items_leaf(), and at the end of log_directory_changes() we set inode->last_dir_index_offset to the current value of ctx->last_dir_item_offset. So if for example a deletion of a lower dir index key happened, we set ctx->last_dir_item_offset to that index value, then if we return from log_dir_items() because btrfs_search_slot() returned 1, we end up returning from log_dir_items() with success (0) and then log_directory_changes() sets inode->last_dir_index_offset to a lower value than it had before. This can result in unpredictable and unexpected behaviour when we need to log the directory again in the same transaction, and can result in ending up with a log tree leaf that has duplicated keys, as we do batch insertions of dir index keys into a log tree. So fix this by making log_dir_items() move on to the next dir index key if it does not find the one it was looking for. Reported-by: David Arendt Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3ef0266e952740..c09daab3f19e8f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3857,17 +3857,26 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* - * Find the first key from this transaction again. See the note for - * log_new_dir_dentries, if we're logging a directory recursively we - * won't be holding its i_mutex, which means we can modify the directory - * while we're logging it.
If we remove an entry between our first - * search and this search we'll not find the key again and can just - * bail. + * Find the first key from this transaction again or the one we were at + * in the loop below in case we had to reschedule. We may be logging the + * directory without holding its VFS lock, which happens when logging new + * dentries (through log_new_dir_dentries()) or in some cases when we + * need to log the parent directory of an inode. This means a dir index + * key might be deleted from the inode's root, and therefore we may not + * find it anymore. If we can't find it, just move to the next key. We + * can not bail out and ignore, because if we do that we will simply + * not log dir index keys that come after the one that was just deleted + * and we can end up logging a dir index range that ends at (u64)-1 + * (@last_offset is initialized to that), resulting in removing dir + * entries we should not remove at log replay time. */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret > 0) + ret = btrfs_next_item(root, path); if (ret < 0) err = ret; + /* If ret is 1, there are no more keys in the inode's root. */ if (ret != 0) goto done; From db260d6a0abbe4eb7e9c29ca58a6d3b8dbebc74d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:36 +0000 Subject: [PATCH 23/48] btrfs: add missing setup of log for full commit at add_conflicting_inode() When logging conflicting inodes, if we reach the maximum limit of inodes, we return BTRFS_LOG_FORCE_COMMIT to force a transaction commit. However we don't mark the log for full commit (with btrfs_set_log_full_commit()), which means that once we leave the log transaction and before we commit the transaction, some other task may sync the log, which is incomplete as we have not logged all conflicting inodes, leading to an inconsistent state in case that log ends up being replayed. So also call btrfs_set_log_full_commit() at add_conflicting_inode(). Fixes: e09d94c9e448 ("btrfs: log conflicting inodes without holding log mutex of the initial inode") CC: stable@vger.kernel.org # 6.1 Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c09daab3f19e8f..afad44a0becfe9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5598,8 +5598,10 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. */ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { + btrfs_set_log_full_commit(trans); return BTRFS_LOG_FORCE_COMMIT; + } inode = btrfs_iget(root->fs_info->sb, ino, root); /* From ab6e0ba0377df1fcfa06f0c56d63f550274fb66a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:37 +0000 Subject: [PATCH 24/48] btrfs: do not abort transaction on failure to write log tree when syncing log When syncing the log, if we fail to write log tree extent buffers, we mark the log for a full commit and abort the transaction. However we don't need to abort the transaction; all we really need to do is to make sure no one can commit a superblock pointing to new log tree roots. Just because we got a failure writing extent buffers for a log tree, it does not mean we will also fail to do a transaction commit.
One particular case is when, due to a bug somewhere, the tree checker detects some corruption while we are writing log tree extent buffers and the writeout fails because of that. Aborting the transaction can be very disruptive for a user, especially if the issue happened on a root filesystem. One example is the scenario in the Link tag below, where an isolated corruption on log tree leaves was causing transaction aborts when syncing the log. Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 ++++++++- fs/btrfs/tree-log.c | 2 -- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7d5a3967ba0f3..7586a8e9b71892 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,14 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) btrfs_print_tree(eb, 0); btrfs_err(fs_info, "block=%llu write time tree block corruption detected", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + /* + * Be noisy if this is an extent buffer from a log tree. We don't abort + * a transaction in case there's a bad log tree extent buffer, we just + * fall back to a transaction commit. Still we want to know when there is + * a bad log tree extent buffer, as that may signal a bug somewhere. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || + btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index afad44a0becfe9..1f70d4ebffae4b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2980,7 +2980,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = 0; if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, ret); btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; @@ -3112,7 +3111,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } else if (ret) { btrfs_set_log_full_commit(trans); - btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } From 40c7d0525fcd75b52b7a29602f8e511f01860a41 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:38 +0000 Subject: [PATCH 25/48] btrfs: do not abort transaction on failure to update log root When syncing a log, if we fail to update a log root in the log root tree, we are aborting the transaction if the failure was not -ENOSPC. This is excessive because there is a chance that a transaction commit can succeed, and therefore avoid turning the filesystem into RO mode. All we need to be careful about is to mark the log for a full commit, which we already do, to make sure no one commits a super block pointing to an outdated log root tree. So don't abort the transaction if we fail to update a log root in the log root tree, and log an error if the failure is not -ENOSPC, so that it does not go completely unnoticed.
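Both this and the previous patch rely on the same containment strategy: a failed log sync is not fatal as long as the caller falls back to committing the whole transaction. A hedged sketch of that caller-side pattern (simplified; the real fsync path does more):

static int sync_log_or_commit_sketch(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct btrfs_log_ctx *ctx)
{
        int ret;

        ret = btrfs_sync_log(trans, root, ctx);
        if (ret == 0)
                return btrfs_end_transaction(trans);   /* log synced */

        /*
         * Any log failure (including BTRFS_LOG_FORCE_COMMIT) degrades to
         * a full transaction commit, which supersedes the log tree.
         */
        return btrfs_commit_transaction(trans);
}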
CC: stable@vger.kernel.org # 6.0+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1f70d4ebffae4b..d43261545264e3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3044,15 +3044,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); - - if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, ret); - mutex_unlock(&log_root_tree->log_mutex); - goto out; - } + if (ret != -ENOSPC) + btrfs_err(fs_info, + "failed to update log for root %llu ret %d", + root->root_key.objectid, ret); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = BTRFS_LOG_FORCE_COMMIT; goto out; } From 97d18e18344598ec47546483a273e34ff299446e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:39 +0000 Subject: [PATCH 26/48] btrfs: simplify update of last_dir_index_offset when logging a directory When logging a directory, we always set the inode's last_dir_index_offset to the offset of the last dir index item we found. This is using an extra field in the log context structure, and it makes more sense to update it only after we insert dir index items, and we could directly update the inode's last_dir_index_offset field instead. So make this simpler by updating the inode's last_dir_index_offset only when we actually insert dir index keys in the log tree, and getting rid of the last_dir_item_offset field in the log context structure. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 23 +++++++++++++++++------ fs/btrfs/tree-log.h | 2 -- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d43261545264e3..58599189bd1886 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3576,17 +3576,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3644,6 +3646,19 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and return an error to fallback + * to a transaction commit. 
+ */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = -EUCLEAN; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3693,7 +3708,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3756,7 +3770,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -4044,7 +4058,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4056,8 +4069,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58fb..85cd24cb0540d1 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -24,8 +24,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ From b1b343de473c6f81a7bf4adbb4a75ee52e53eb1e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:40 +0000 Subject: [PATCH 27/48] btrfs: use a negative value for BTRFS_LOG_FORCE_COMMIT Currently we use the value 1 for BTRFS_LOG_FORCE_COMMIT, but that value has a few inconveniences: 1) If it's ever used by btrfs_log_inode(), or any function down the call chain, we have to remember to btrfs_set_log_full_commit(), which is repetitive and has a chance to be forgotten in future use cases. btrfs_log_inode_parent() only calls btrfs_set_log_full_commit() when it gets a negative value from btrfs_log_inode(); 2) Down the call chain of btrfs_log_inode(), we may have functions that need to force a log commit, but can return either an error (negative value), false (0) or true (1). So they are forced to return some random negative to force a log commit - using BTRFS_LOG_FORCE_COMMIT would make the intention more clear. Currently the only example is flush_dir_items_batch(). So turn BTRFS_LOG_FORCE_COMMIT into a negative value. The chosen value is -(MAX_ERRNO + 1), so that it does not overlap any errno value and makes it easier to debug. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 10 +++------- fs/btrfs/tree-log.h | 9 +++++++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 58599189bd1886..94fc8b08254c17 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3652,11 +3652,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, /* * If for some unexpected reason the last item's index is not greater - * than the last index we logged, warn and return an error to fallback - * to a transaction commit. + * than the last index we logged, warn and force a transaction commit. 
*/ if (WARN_ON(last_index <= inode->last_dir_index_offset)) - ret = -EUCLEAN; + ret = BTRFS_LOG_FORCE_COMMIT; else inode->last_dir_index_offset = last_index; out: @@ -5604,10 +5603,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. */ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { - btrfs_set_log_full_commit(trans); + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -6466,7 +6463,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. */ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85cd24cb0540d1..bdeb5216718f4f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; From 0e230588f1f01743b4af8a408289df5cacf80221 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Jan 2023 14:56:41 +0000 Subject: [PATCH 28/48] btrfs: use a single variable to track return value for log_dir_items() We currently use 'ret' and 'err' to track the return value for log_dir_items(), which is confusing and likely the cause for previous bugs where log_dir_items() did not return an error when it should, fixed in previous patches. So change this and use only a single variable, 'ret', to track the return value. This is simpler and makes it similar to most of the existing code. 
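As a standalone illustration of the single return variable pattern this
patch applies, consider the following sketch; search_step and log_items
are hypothetical helpers, not the kernel functions. The key idea is
that "no more items" (a positive internal return) is normalized to 0 at
the point where the decision is made, so callers only ever see < 0 for
errors and 0 for success:

#include <stdio.h>

/* Returns 0 if an item was found, 1 at the end, < 0 on error. */
static int search_step(int i)
{
	if (i == 3)
		return 1;	/* no more items */
	return 0;		/* found an item */
}

static int log_items(void)
{
	int ret;

	for (int i = 0; ; i++) {
		ret = search_step(i);
		if (ret > 0) {	/* end of items: not an error */
			ret = 0;
			break;
		}
		if (ret < 0)	/* real error: propagate as-is */
			break;
		printf("logged item %d\n", i);
	}
	return ret;		/* one variable, one meaning */
}

int main(void)
{
	return log_items() < 0;
}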
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 94fc8b08254c17..997ba92481cb9e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3793,7 +3793,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3834,8 +3833,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; + } else if (ret > 0) { + ret = 0; } goto done; @@ -3858,7 +3857,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; } else if (ret < 0) { - err = ret; goto done; } @@ -3880,12 +3878,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) + if (ret > 0) { ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. */ - if (ret != 0) goto done; /* @@ -3896,8 +3897,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3908,10 +3909,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3942,7 +3943,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3953,15 +3954,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* From bb3fa07737dc0599375d24820a98dade7996653f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 08:12:43 +0100 Subject: [PATCH 29/48] btrfs: remove the wait argument to btrfs_start_ordered_extent Given that wait is always set to 1, so remove the argument. Last use of wait with 0 was in 0c304304feab ("Btrfs: remove csum_bytes_left"). 
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ordered-data.c | 25 +++++++++++-------------- fs/btrfs/ordered-data.h | 2 +- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 2737af7e278069..8065341d831a17 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 834bbcb91102fd..74a433c64d728a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 49a2e118f561f6..3c49742f0d4556 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2969,7 +2969,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -4987,7 +4987,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -7392,7 +7392,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -8552,7 +8552,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1ad..6c24b69e2d0a37 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. 
 *
- * If wait is one, this effectively waits on page writeback for all the pages
- * in the extent, and it waits on the io completion code to insert
- * metadata into the btree corresponding to the extent
+ * Wait on page writeback for all the pages in the extent and the IO completion
+ * code to insert metadata into the btree corresponding to the extent.
 */
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
 {
 u64 start = entry->file_offset;
 u64 end = start + entry->num_bytes - 1;
@@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
 */
 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
 filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
- if (wait) {
- if (!freespace_inode)
- btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &entry->flags));
- }
+
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
 }

 /*
@@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 btrfs_put_ordered_extent(ordered);
 break;
 }
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
 end = ordered->file_offset;
 /*
 * If the ordered extent had an error save the error but don't
@@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
 break;
 }
 unlock_extent(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
 btrfs_put_ordered_extent(ordered);
 }
 }

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 89f82b78f590f3..ae3ed748acaf66 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -187,7 +187,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
 u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);

From 79fe19ac2bd4680014a2d9cf08a46bfe3ecfad08 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 12 Jan 2023 14:17:20 +0000
Subject: [PATCH 30/48] btrfs: fix invalid leaf access due to inline extent
 during lseek

During lseek, for SEEK_DATA and SEEK_HOLE modes, we access the
disk_bytenr of an extent without checking its type. However inline
extents have their data starting at the offset of the disk_bytenr
field, so accessing that field when we have an inline extent can result
in either of the following:

1) Interpret the inline extent's data as a disk_bytenr value;

2) In case the inline data is less than 8 bytes, we access part of some
   other item in the leaf, or unused space in the leaf;

3) In case the inline data is less than 8 bytes and the extent item is
   the first item in the leaf, we can access beyond the leaf's limit.

So fix this by not accessing the disk_bytenr field if we have an inline
extent.
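The fixed condition can be modeled in isolation. The following
standalone sketch uses simplified stand-in types (enum extent_type,
struct file_extent) rather than the on-disk btrfs structures; what it
mirrors is the shape of the check, where the type is read first and
disk_bytenr is only consulted for regular extents:

#include <stdio.h>
#include <stdint.h>

enum extent_type { EXTENT_INLINE, EXTENT_REG, EXTENT_PREALLOC };

struct file_extent {
	enum extent_type type;
	uint64_t disk_bytenr;	/* for an inline extent this slot holds file data */
};

/*
 * Treat prealloc extents and unwritten (disk_bytenr == 0) regular
 * extents as holes; never read disk_bytenr for inline extents.
 */
static int extent_is_hole_like(const struct file_extent *ext)
{
	return ext->type == EXTENT_PREALLOC ||
	       (ext->type == EXTENT_REG && ext->disk_bytenr == 0);
}

int main(void)
{
	struct file_extent inline_ext = { EXTENT_INLINE, 0xdeadbeef /* data */ };
	struct file_extent hole = { EXTENT_REG, 0 };

	printf("inline: %d, hole: %d\n",
	       extent_is_hole_like(&inline_ext), extent_is_hole_like(&hole));
	return 0;
}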
Fixes: b6e833567ea1 ("btrfs: make hole and data seeking a lot more efficient") Reported-by: Matthias Schoepfer Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216908 Link: https://lore.kernel.org/linux-btrfs/7f25442f-b121-2a3a-5a3d-22bcaae83cd4@leemhuis.info/ CC: stable@vger.kernel.org # 6.1 Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 74a433c64d728a..5cc5a1faaef5b5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3541,6 +3541,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *extent; u64 extent_end; + u8 type; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -3596,10 +3597,16 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, extent); - if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || - btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_PREALLOC) { + /* + * Can't access the extent's disk_bytenr field if this is an + * inline extent, since at that offset, it's where the extent + * data starts. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC || + (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { /* * Explicit hole or prealloc extent, search for delalloc. * A prealloc extent is treated like a hole. From 0301484ab88cabcc5b640d7c862bae8e413a1edc Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 15 Dec 2022 16:06:31 -0800 Subject: [PATCH 31/48] btrfs: pass find_free_extent_ctl to allocator tracepoints The allocator tracepoints currently have a pile of values from ffe_ctl. In modifying the allocator and adding more tracepoints, I found myself adding to the already long argument list of the tracepoints. It makes it a lot simpler to just send in the ffe_ctl itself. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 92 +++--------------------------------- fs/btrfs/extent-tree.h | 75 +++++++++++++++++++++++++++++ fs/btrfs/super.c | 1 + include/trace/events/btrfs.h | 41 ++++++++-------- 4 files changed, 103 insertions(+), 106 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d1a4e51f8fbc1c..b26cbf6cae8b06 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include #include #include -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,7 +32,6 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" @@ -3453,81 +3453,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. 
- */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3484,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3534,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -4296,8 +4218,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4468,8 +4389,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae54252536031c..64fa8ad7914a74 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,81 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + 
u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 433ce221dc5c79..e5136baef9af17 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 6548b5b5aa6080..8a422e29d06448 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -32,6 +32,7 @@ struct prelim_ref; struct btrfs_space_info; struct btrfs_raid_bio; struct raid56_bio_trace_info; +struct find_free_extent_ctl; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1241,38 +1242,38 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_root *root, u64 num_bytes, - u64 empty_size, u64 data), + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(root, num_bytes, empty_size, data), + TP_ARGS(root, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) - __field( u64, data ) + __field( u64, flags ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; - __entry->num_bytes = num_bytes; - __entry->empty_size = empty_size; - __entry->data = data; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", show_root_type(__entry->root_objectid), - __entry->num_bytes, __entry->empty_size, __entry->data, - __print_flags((unsigned long)__entry->data, "|", + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS)) ); DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len), + 
TP_ARGS(block_group, ffe_ctl), TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) @@ -1284,8 +1285,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; - __entry->start = start; - __entry->len = len; + __entry->start = ffe_ctl->search_start; + __entry->len = ffe_ctl->num_bytes; ), TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " @@ -1299,18 +1300,18 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group *block_group, u64 start, - u64 len), + TP_PROTO(const struct btrfs_block_group *block_group, + const struct find_free_extent_ctl *ffe_ctl), - TP_ARGS(block_group, start, len) + TP_ARGS(block_group, ffe_ctl) ); TRACE_EVENT(btrfs_find_cluster, From 0c2815cf2958966b3041b6167e6d5b56b798d190 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 15 Dec 2022 16:06:32 -0800 Subject: [PATCH 32/48] btrfs: add more find_free_extent tracepoints find_free_extent is a complicated function. It consists (at least) of: - a hint that jumps into the middle of a for loop macro - a middle loop trying every raid level - an outer loop ascending through ffe loop levels - complicated logic for skipping some of those ffe loop levels - multiple underlying in-bg allocators (zoned, cluster, no cluster) Which is all to say that more tracing is helpful for debugging its behavior. Add two new tracepoints: at the entrance to the block_groups loop (hit for every raid level and every ffe_ctl loop) and at the point we seriously consider a block_group for allocation. This way we can see the whole path through the algorithm, including hints, multiple loops, etc. Reviewed-by: Johannes Thumshirn Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++ fs/btrfs/extent-tree.h | 3 ++ include/trace/events/btrfs.h | 81 ++++++++++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b26cbf6cae8b06..203c8cbb9983c1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4261,6 +4261,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4268,6 +4269,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4277,6 +4279,7 @@ static noinline int find_free_extent(struct btrfs_root *root, &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. 
*/ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4318,6 +4321,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 64fa8ad7914a74..daa5e3505886e3 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -76,6 +76,9 @@ struct find_free_extent_ctl { /* Allocation policy */ enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; }; enum btrfs_inline_ref_type { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8a422e29d06448..cc2eaab28d580c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1268,6 +1268,77 @@ TRACE_EVENT(find_free_extent, BTRFS_GROUP_FLAGS)) ); +TRACE_EVENT(find_free_extent_search_loop, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl), + + TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop) +); + +TRACE_EVENT(find_free_extent_have_block_group, + + TP_PROTO(const struct btrfs_root *root, + const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *block_group), + + TP_ARGS(root, ffe_ctl, block_group), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, flags ) + __field( u64, loop ) + __field( bool, hinted ) + __field( u64, bg_start ) + __field( u64, bg_flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = ffe_ctl->num_bytes; + __entry->empty_size = ffe_ctl->empty_size; + __entry->flags = ffe_ctl->flags; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; + __entry->bg_start = block_group->start; + __entry->bg_flags = block_group->flags; + ), + + TP_printk_btrfs( +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", + show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->loop, __entry->hinted, + __entry->bg_start, __entry->bg_flags, + __print_flags((unsigned long)__entry->bg_flags, "|", + BTRFS_GROUP_FLAGS)) +); + DECLARE_EVENT_CLASS(btrfs__reserve_extent, TP_PROTO(const struct btrfs_block_group *block_group, @@ -1280,6 +1351,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, __field( u64, flags ) __field( u64, start ) __field( u64, len ) + __field( u64, loop ) + __field( bool, hinted ) ), TP_fast_assign_btrfs(block_group->fs_info, @@ -1287,15 +1360,17 @@ 
DECLARE_EVENT_CLASS(btrfs__reserve_extent, __entry->flags = block_group->flags; __entry->start = ffe_ctl->search_start; __entry->len = ffe_ctl->num_bytes; + __entry->loop = ffe_ctl->loop; + __entry->hinted = ffe_ctl->hinted; ), - TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " - "start=%llu len=%llu", + TP_printk_btrfs( +"root=%llu(%s) block_group=%llu flags=%llu(%s) start=%llu len=%llu loop=%llu hinted=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len) + __entry->start, __entry->len, __entry->loop, __entry->hinted) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, From 89f6e14228ff11ba1ea5a71a94dc9a6239882cdf Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 15 Dec 2022 16:06:33 -0800 Subject: [PATCH 33/48] btrfs: introduce size class to block group allocator The aim of this patch is to reduce the fragmentation of block groups under certain unhappy workloads. It is particularly effective when the size of extents correlates with their lifetime, which is something we have observed causing fragmentation in the fleet at Meta. This patch categorizes extents into size classes: - x < 128KiB: "small" - 128KiB < x < 8MiB: "medium" - x > 8MiB: "large" and as much as possible reduces allocations of extents into block groups that don't match the size class. This takes advantage of any (possible) correlation between size and lifetime and also leaves behind predictable re-usable gaps when extents are freed; small writes don't gum up bigger holes. Size classes are implemented in the following way: - Mark each new block group with a size class of the first allocation that goes into it. - Add two new passes to ffe: "unset size class" and "wrong size class". First, try only matching block groups, then try unset ones, then allow allocation of new ones, and finally allow mismatched block groups. - Filtering is done just by skipping inappropriate ones, there is no special size class indexing. Other solutions I considered were: - A best fit allocator with an rb-tree. This worked well, as small writes didn't leak big holes from large freed extents, but led to regressions in ffe and write performance due to lock contention on the rb-tree with every allocation possibly updating it in parallel. Perhaps something clever could be done to do the updates in the background while being "right enough". - A fixed size "working set". This prevents freeing an extent drastically changing where writes currently land, and seems like a good option too. Doesn't take advantage of size in any way. - The same size class idea, but implemented with xarray marks. This turned out to be slower than looping the linked list and skipping wrong block groups, and is also less flexible since we must have only 3 size classes (max #marks). With the current approach we can have as many as we like. Performance testing was done via: https://github.com/josefbacik/fsperf Of particular relevance are the new fragmentation specific tests. A brief summary of the testing results: - Neutral results on existing tests. There are some minor regressions and improvements here and there, but nothing that truly stands out as notable. - Improvement on new tests where size class and extent lifetime are correlated. Fragmentation in these cases is completely eliminated and write performance is generally a little better. 
There is also significant improvement where extent sizes are just a bit larger than the size class boundaries. - Regression on one new tests: where the allocations are sized intentionally a hair under the borders of the size classes. Results are neutral on the test that intentionally attacks this new scheme by mixing extent size and lifetime. The full dump of the performance results can be found here: https://bur.io/fsperf/size-class-2022-11-15.txt (there are ANSI escape codes, so best to curl and view in terminal) Here is a snippet from the full results for a new test which mixes buffered writes appending to a long lived set of files and large short lived fallocates: bufferedappendvsfallocate results metric baseline current stdev diff ====================================================================================== avg_commit_ms 31.13 29.20 2.67 -6.22% bg_count 14 15.60 0 11.43% commits 11.10 12.20 0.32 9.91% elapsed 27.30 26.40 2.98 -3.30% end_state_mount_ns 11122551.90 10635118.90 851143.04 -4.38% end_state_umount_ns 1.36e+09 1.35e+09 12248056.65 -1.07% find_free_extent_calls 116244.30 114354.30 964.56 -1.63% find_free_extent_ns_max 599507.20 1047168.20 103337.08 74.67% find_free_extent_ns_mean 3607.19 3672.11 101.20 1.80% find_free_extent_ns_min 500 512 6.67 2.40% find_free_extent_ns_p50 2848 2876 37.65 0.98% find_free_extent_ns_p95 4916 5000 75.45 1.71% find_free_extent_ns_p99 20734.49 20920.48 1670.93 0.90% frag_pct_max 61.67 0 8.05 -100.00% frag_pct_mean 43.59 0 6.10 -100.00% frag_pct_min 25.91 0 16.60 -100.00% frag_pct_p50 42.53 0 7.25 -100.00% frag_pct_p95 61.67 0 8.05 -100.00% frag_pct_p99 61.67 0 8.05 -100.00% fragmented_bg_count 6.10 0 1.45 -100.00% max_commit_ms 49.80 46 5.37 -7.63% sys_cpu 2.59 2.62 0.29 1.39% write_bw_bytes 1.62e+08 1.68e+08 17975843.50 3.23% write_clat_ns_mean 57426.39 54475.95 2292.72 -5.14% write_clat_ns_p50 46950.40 42905.60 2101.35 -8.62% write_clat_ns_p99 148070.40 143769.60 2115.17 -2.90% write_io_kbytes 4194304 4194304 0 0.00% write_iops 2476.15 2556.10 274.29 3.23% write_lat_ns_max 2101667.60 2251129.50 370556.59 7.11% write_lat_ns_mean 59374.91 55682.00 2523.09 -6.22% write_lat_ns_min 17353.10 16250 1646.08 -6.36% There are some mixed improvements/regressions in most metrics along with an elimination of fragmentation in this workload. On the balance, the drastic 1->0 improvement in the happy cases seems worth the mix of regressions and improvements we do observe. 
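The classification itself is small. The following standalone C model
uses the same thresholds the patch introduces (calc_size_class is a
stand-in for btrfs_calc_block_group_size_class, and the SZ_* constants
are defined locally rather than taken from linux/sizes.h):

#include <stdio.h>
#include <stdint.h>

#define SZ_128K (128 * 1024ULL)
#define SZ_8M   (8 * 1024 * 1024ULL)

enum bg_size_class { BG_SZ_NONE, BG_SZ_SMALL, BG_SZ_MEDIUM, BG_SZ_LARGE };

/* Same thresholds as the patch: <= 128K small, <= 8M medium, else large. */
static enum bg_size_class calc_size_class(uint64_t size)
{
	if (size <= SZ_128K)
		return BG_SZ_SMALL;
	if (size <= SZ_8M)
		return BG_SZ_MEDIUM;
	return BG_SZ_LARGE;
}

int main(void)
{
	uint64_t sizes[] = { 4096, SZ_128K, SZ_128K + 1, SZ_8M, SZ_8M + 1 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%llu -> class %d\n",
		       (unsigned long long)sizes[i], calc_size_class(sizes[i]));
	return 0;
}

Note that the boundaries are inclusive on the small side, so an
allocation of exactly 128KiB is "small" and exactly 8MiB is "medium";
this matches the comments the patch adds to the enum.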
Some considerations for future work: - Experimenting with more size classes - More hinting/search ordering work to approximate a best-fit allocator Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 105 +++++++++++++++++++++++++++++------ fs/btrfs/block-group.h | 20 ++++++- fs/btrfs/extent-tree.c | 71 +++++++++++------------ fs/btrfs/extent-tree.h | 3 + include/trace/events/btrfs.h | 9 ++- 5 files changed, 155 insertions(+), 53 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e90800388a413f..6557b1b7f89ae2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include "misc.h" #include "ctree.h" @@ -3379,6 +3380,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3435,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. */ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_is_block_group_data_only(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4230,64 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. 
+ * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. A call to find_free_extent got desperate enough to set + * 'force_wrong_slab'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cffe..2b9d328b589eba 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. 
This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,7 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -346,4 +359,9 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 203c8cbb9983c1..e9be9430faa089 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3385,7 +3385,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3950,24 +3952,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. @@ -4001,31 +3985,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. 
+ */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4045,8 +4026,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4076,6 +4059,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_is_block_group_data_only(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4210,6 +4208,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4346,6 +4345,9 @@ static noinline int find_free_extent(struct btrfs_root *root, if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4380,7 +4382,8 @@ static noinline int find_free_extent(struct btrfs_root *root, ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index daa5e3505886e3..0c958fc1b3b8a1 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -79,6 +79,9 @@ struct find_free_extent_ctl { /* Whether or not the allocator is currently following a hint */ bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; }; enum btrfs_inline_ref_type { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index cc2eaab28d580c..75d7d22c3a276c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1349,28 +1349,33 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) __field( u64, loop ) __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; + __entry->bg_size_class 
= block_group->size_class; __entry->start = ffe_ctl->search_start; __entry->len = ffe_ctl->num_bytes; __entry->loop = ffe_ctl->loop; __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), TP_printk_btrfs( -"root=%llu(%s) block_group=%llu flags=%llu(%s) start=%llu len=%llu loop=%llu hinted=%d", +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len, __entry->loop, __entry->hinted) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, From 50f0e83935788d26b69565e92165f5e8d9b90bb8 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 15 Dec 2022 16:06:34 -0800 Subject: [PATCH 34/48] btrfs: load block group size class when caching Since the size class is an artifact of an arbitrary anti fragmentation strategy, it doesn't really make sense to persist it. Furthermore, most of the size class logic assumes fresh block groups. That is of course not a reasonable assumption -- we will be upgrading kernels with existing filesystems whose block groups are not classified. To work around those issues, implement logic to compute the size class of the block groups as we cache them in. To perfectly assess the state of a block group, we would have to read the entire extent tree (since the free space cache mashes together contiguous extent items) which would be prohibitively expensive for larger file systems with more extents. We can do it relatively cheaply by implementing a simple heuristic of sampling a handful of extents and picking the smallest one we see. In the happy case where the block group was classified, we will only see extents of the correct size. In the unhappy case, we will hopefully find one of the smaller extents, but there is no perfect answer anyway. Autorelocation will eventually churn up the block group if there is significant freeing anyway. The work is done in the caching thread but after marking the block group cached, as we tradeoff classification accuracy vs. slowing down allocations. There was no regression in mount performance at end state of the fsperf test suite. Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 130 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6557b1b7f89ae2..82e2fe33bb6e85 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -540,6 +540,134 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. 
+ */ +static int sample_block_group_extent_item(struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + int ret = 0; + u64 search_offset; + struct btrfs_path *path; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + down_read(&fs_info->commit_root_sem); + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + key->objectid = block_group->start + search_offset; + key->type = BTRFS_EXTENT_ITEM_KEY; + key->offset = 0; + + ret = btrfs_search_slot(NULL, extent_root, key, path, 0, 0); + if (ret != 0) + goto out; + if (key->objectid < block_group->start || + key->objectid > block_group->start + block_group->length) { + ret = 1; + goto out; + } + if (key->type != BTRFS_EXTENT_ITEM_KEY) { + ret = 1; + goto out; + } +out: + btrfs_free_path(path); + up_read(&fs_info->commit_root_sem); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. + * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. + * + * Returns: 0 on success, negative error code on error. 
+ */ +static int load_block_group_size_class(struct btrfs_block_group *block_group) +{ + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_is_block_group_data_only(block_group)) + return 0; + + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } + +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -739,6 +867,8 @@ static noinline void caching_thread(struct btrfs_work *work) wake_up(&caching_ctl->wait); + load_block_group_size_class(block_group); + btrfs_put_caching_control(caching_ctl); btrfs_put_block_group(block_group); } From 8720cb5b6606351fb534d15f86cd9701a0361841 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 15 Dec 2022 16:06:35 -0800 Subject: [PATCH 35/48] btrfs: don't use size classes for zoned file systems When a file system has ZNS devices which are constrained by a maximum number of active block groups, then not being able to use all the block groups for every allocation is not ideal, and could cause us to loop a ton with mixed size allocations. In general, since zoned doesn't write into gaps behind where block groups are writing, it is not susceptible to the same sort of fragmentation that size classes are designed to solve, so we can skip size classes for zoned file systems in general, even though there would probably be no harm for SMR devices.
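Both this change and the previous one key off the size class computed for a requested allocation size. For reference, a minimal sketch of that classifier as introduced earlier in the series; the SZ_128K and SZ_8M thresholds are recalled from that earlier patch rather than restated here, so treat them as assumptions:

	enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
	{
		/* Small: up to 128K, medium: up to 8M, large: everything bigger. */
		if (size <= SZ_128K)
			return BTRFS_BG_SZ_SMALL;
		if (size <= SZ_8M)
			return BTRFS_BG_SZ_MEDIUM;
		return BTRFS_BG_SZ_LARGE;
	}

With that in mind, skipping classification on zoned filesystems simply means every data block group stays BTRFS_BG_SZ_NONE and the allocator never filters on it.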
Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 +++++++++++-- fs/btrfs/block-group.h | 1 + fs/btrfs/extent-tree.c | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 82e2fe33bb6e85..73e1270b3904c3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -646,7 +646,7 @@ static int load_block_group_size_class(struct btrfs_block_group *block_group) enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; int ret; - if (!btrfs_is_block_group_data_only(block_group)) + if (!btrfs_block_group_should_use_size_class(block_group)) return 0; @@ -3579,7 +3579,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, goto out; } - if (btrfs_is_block_group_data_only(cache)) { + if (btrfs_block_group_should_use_size_class(cache)) { size_class = btrfs_calc_block_group_size_class(num_bytes); ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); if (ret) @@ -4421,3 +4421,12 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 2b9d328b589eba..4fdc39f00a4db5 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -363,5 +363,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e9be9430faa089..ad6c66ff9ba83a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4064,7 +4064,7 @@ static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_c { if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) return true; - if (!btrfs_is_block_group_data_only(bg)) + if (!btrfs_block_group_should_use_size_class(bg)) return true; if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) return true; From 062be155cb66f8491e9e7a8678351c498ec81fb3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 17 Dec 2022 10:34:29 +0800 Subject: [PATCH 36/48] btrfs: scrub: improve tree block error reporting [BUG] When debugging a scrub-related metadata error, it turns out that our metadata error reporting is not ideal. The only 3 error messages are: - BTRFS error (device dm-2): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 0, gen 1 Showing we have metadata generation mismatch errors. - BTRFS error (device dm-2): unable to fixup (regular) error at logical 7110656 on dev /dev/mapper/test-scratch1 Showing which tree blocks are corrupted. - BTRFS warning (device dm-2): checksum/header error at logical 24772608 on dev /dev/mapper/test-scratch2, physical 3801088: metadata node (level 1) in tree 5 Showing which physical range the corrupted metadata is at. We have to combine the above 3 to know we have corrupted metadata with a generation mismatch. And this is already the better case; if we have other problems, like an fsid mismatch, we cannot even know the cause. [CAUSE] The problem is caused by the fact that scrub_checksum_tree_block() never outputs any error message.
It just returns two bits for scrub: sblock->header_error and sblock->generation_error. Later we report the error in scrub_print_warning(), but unfortunately with only those two bits there is not much detail we can print. [FIX] This patch will do the following to enhance the error reporting of metadata scrub: - Add an extra warning (ratelimited) for every error we hit This can help us to distinguish the different types of errors. Some errors, like a bytenr mismatch, immediately tell us what is going wrong. - Re-order the checks Currently we check bytenr first, then immediately generation. This can lead to false generation mismatch reports when it is really the fsid that mismatches. Here is the new output for the bug I'm debugging (we forgot to write back tree blocks for commit roots): BTRFS warning (device dm-2): tree block 24117248 mirror 1 has bad fsid, has b77cd862-f150-4c71-90ec-7baf0544d83f want 17df6abf-23cd-445f-b350-5b3e40bfd2fc BTRFS warning (device dm-2): tree block 24117248 mirror 0 has bad fsid, has b77cd862-f150-4c71-90ec-7baf0544d83f want 17df6abf-23cd-445f-b350-5b3e40bfd2fc Now we can immediately see that some tree blocks were never written back, instead of the original confusing generation mismatch. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 49 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b346795f6606..10c26bc8e60e9b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; } - if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + } - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + } shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); @@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if
(sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + } +out: return sblock->header_error || sblock->checksum_error; } From c5e52b7f76adc6d75c1c2481c048077fcd2f7444 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 12 Jan 2023 16:31:08 +0000 Subject: [PATCH 37/48] btrfs: fix race between quota rescan and disable leading to NULL pointer deref If we have one task trying to start the quota rescan worker while another one is trying to disable quotas, we can end up hitting a race that results in the quota rescan worker doing a NULL pointer dereference. The steps for this are the following: 1) Quotas are enabled; 2) Task A calls the quota rescan ioctl and enters btrfs_qgroup_rescan(). It calls qgroup_rescan_init() which returns 0 (success) and then joins a transaction and commits it; 3) Task B calls the quota disable ioctl and enters btrfs_quota_disable(). It clears the bit BTRFS_FS_QUOTA_ENABLED from fs_info->flags and calls btrfs_qgroup_wait_for_completion(), which returns immediately since the rescan worker is not yet running. Then it starts a transaction and locks fs_info->qgroup_ioctl_lock; 4) Task A queues the rescan worker, by calling btrfs_queue_work(); 5) The rescan worker starts, and calls rescan_should_stop() at the start of its while loop, which results in 0 iterations of the loop, since the flag BTRFS_FS_QUOTA_ENABLED was cleared from fs_info->flags by task B at step 3); 6) Task B sets fs_info->quota_root to NULL; 7) The rescan worker tries to start a transaction and uses fs_info->quota_root as the root argument for btrfs_start_transaction(). This results in a NULL pointer dereference down the call chain of btrfs_start_transaction(). The stack trace is something like the one reported in the Link tag below: general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f] CPU: 1 PID: 34 Comm: kworker/u4:2 Not tainted 6.1.0-syzkaller-13872-gb6bb9676f216 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: btrfs-qgroup-rescan btrfs_work_helper RIP: 0010:start_transaction+0x48/0x10f0 fs/btrfs/transaction.c:564 Code: 48 89 fb 48 (...)
RSP: 0018:ffffc90000ab7ab0 EFLAGS: 00010206 RAX: 0000000000000041 RBX: 0000000000000208 RCX: ffff88801779ba80 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: dffffc0000000000 R08: 0000000000000001 R09: fffff52000156f5d R10: fffff52000156f5d R11: 1ffff92000156f5c R12: 0000000000000000 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2bea75b718 CR3: 000000001d0cc000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_qgroup_rescan_worker+0x3bb/0x6a0 fs/btrfs/qgroup.c:3402 btrfs_work_helper+0x312/0x850 fs/btrfs/async-thread.c:280 process_one_work+0x877/0xdb0 kernel/workqueue.c:2289 worker_thread+0xb14/0x1330 kernel/workqueue.c:2436 kthread+0x266/0x300 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 Modules linked in: So fix this by having the rescan worker function not attempt to start a transaction if it didn't do any rescan work. Reported-by: syzbot+96977faa68092ad382c4@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000e5454b05f065a803@google.com/ Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 00851c86aa8aac..af97413abcf43b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3367,6 +3367,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; bool stopped = false; + bool did_leaf_rescans = false; path = btrfs_alloc_path(); if (!path) @@ -3387,6 +3388,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } err = qgroup_rescan_leaf(trans, path); + did_leaf_rescans = true; if (err > 0) btrfs_commit_transaction(trans); @@ -3407,16 +3409,23 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has already updated the - * qgroup info. + * Only update status, since the previous part has already updated the + * qgroup info, and only if we did any actual work. This also prevents + * race with a concurrent quota disable, which has already set + * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at + * btrfs_quota_disable(). */ - trans = btrfs_start_transaction(fs_info->quota_root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (did_leaf_rescans) { + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); + } + } else { trans = NULL; - btrfs_err(fs_info, - "fail to start transaction for status update: %d", - err); } mutex_lock(&fs_info->qgroup_rescan_lock); From afe06a15d7069d606a0e76424bf5c3a2dddc49b3 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 12 Jan 2023 16:05:11 -0800 Subject: [PATCH 38/48] btrfs: hold block group refcount during async discard Async discard does not acquire the block group reference count while it holds a reference on the discard list. 
This is generally OK, as the paths which destroy block groups tend to try to synchronize on cancelling async discard work. However, relying on cancelling work requires careful analysis to be sure it is safe from races with unpinning, which can schedule more work. While I am unable to find a race with unpinning in the current code for either the unused bgs or relocation paths, I believe we have one in an older version of auto relocation in a Meta internal build. This suggests that this is in fact an error-prone model, and could be fragile to future changes to these bg deletion paths. To make this ownership clearer, add a refcount for async discard. If work is queued for a block group, its refcount should be incremented, and when work is completed or canceled, it should be decremented. CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/discard.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ff2e524d993773..6931616fea4749 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -89,6 +89,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); @@ -108,8 +110,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + bool queued; + spin_lock(&discard_ctl->lock); + queued = !list_empty(&block_group->discard_list); + if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; @@ -121,6 +127,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); @@ -131,6 +139,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = false; + bool queued = false; spin_lock(&discard_ctl->lock); @@ -140,7 +149,10 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, } block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); + if (queued && !running) + btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -214,10 +226,12 @@ static struct btrfs_block_group *peek_discard_list( if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { - if (btrfs_is_block_group_data_only(block_group)) + if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); - else + } else { list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { @@ -511,6 +525,8 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time =
now; + if (list_empty(&block_group->discard_list)) + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); @@ -651,8 +667,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. + */ + btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -683,6 +703,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); From 5ac1d4549f350fbf8a1762e47c8fd358d2295db2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 22 Feb 2022 10:20:14 +0900 Subject: [PATCH 39/48] debug dump active bgs --- fs/btrfs/extent-tree.c | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ad6c66ff9ba83a..124b435859dccf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3656,6 +3656,44 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, * fs_info::treelog_bg_lock */ +static void dump_zoned_block_groups(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_device *device; + int n = 0; + + btrfs_info(fs_info, "Dump active block groups"); + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + spin_lock(&block_group->lock); + btrfs_info(fs_info, + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", + block_group->start, block_group->length, block_group->used, block_group->pinned, + block_group->reserved, block_group->zone_unusable, + block_group->ro ? "[readonly]" : ""); + btrfs_info(fs_info, "free space %llu active %d flags %llu", + block_group->zone_capacity - block_group->alloc_offset, + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags), + block_group->flags); + spin_unlock(&block_group->lock); + n += block_group->physical_map->num_stripes; + } + spin_unlock(&fs_info->zone_active_bgs_lock); + btrfs_info(fs_info, "nactive in list = %d", n); + + list_for_each_entry (device, &fs_info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + if (!device->zone_info) + continue; + btrfs_info(fs_info, " dev active zones %d bitmap %d", + atomic_read(&device->zone_info->active_zones_left), + bitmap_weight(device->zone_info->active_zones, + device->zone_info->nr_zones)); + } +} + /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the @@ -3723,6 +3761,12 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, * May need to clear fs_info->{treelog,data_reloc}_bg. * Return the error after taking the locks. 
*/ + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + pr_info("dump max_extent_size %llu min_alloc_size %llu num_bytes %llu\n", + ffe_ctl->max_extent_size, ffe_ctl->min_alloc_size, + ffe_ctl->num_bytes); + dump_zoned_block_groups(fs_info); + } } spin_lock(&space_info->lock); From 0bace7daff2ec7ebd5749985c59887b717365f06 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 7 Mar 2022 11:27:46 +0900 Subject: [PATCH 40/48] btrfs: optimize fully used --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ad6c66ff9ba83a..36d7928820a975 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3679,6 +3679,14 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, ASSERT(btrfs_is_zoned(block_group->fs_info)); + if (block_group->alloc_offset == block_group->zone_capacity) { + if (ffe_ctl->for_treelog) + btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); + return 1; + } + /* * Do not allow non-tree-log blocks in the dedicated tree-log block * group, and vice versa. From 6416be156764fc9f0a28e5b379876a36860037b9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 10 Mar 2022 22:28:22 +0900 Subject: [PATCH 41/48] optimize hint bytes --- fs/btrfs/extent-tree.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 36d7928820a975..2fc19222d1871d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4148,12 +4148,24 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, if (fs_info->treelog_bg) ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { + } else if (ffe_ctl->for_data_reloc) { spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { struct btrfs_block_group *block_group; + + down_read(&space_info->groups_sem); + list_for_each_entry_reverse(block_group, + &space_info->block_groups[ffe_ctl->index], list) { + if (block_group->alloc_offset < block_group->zone_capacity && + block_group->zone_is_active) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + up_read(&space_info->groups_sem); } return 0; default: From f25c3ff492a501ff49aaf131f96528ba27dbc565 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 23 Jun 2022 23:27:29 +0900 Subject: [PATCH 42/48] fixup optimize hint byte --- fs/btrfs/extent-tree.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2fc19222d1871d..11559f53a1470d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4156,16 +4156,23 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { struct btrfs_block_group *block_group; - down_read(&space_info->groups_sem); - list_for_each_entry_reverse(block_group, - &space_info->block_groups[ffe_ctl->index], list) { - if (block_group->alloc_offset < block_group->zone_capacity && - block_group->zone_is_active) { + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + /* + * No lock is OK here because avail is + * monotonically decreasing, and this is + * just a hint.
+ */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; } } - up_read(&space_info->groups_sem); + spin_unlock(&fs_info->zone_active_bgs_lock); } return 0; default: From 9f5cd305c1215bb1776434433d014b2d40223007 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 10 Nov 2022 19:54:56 +0900 Subject: [PATCH 43/48] bucket list --- fs/btrfs/block-group.c | 6 ++++++ fs/btrfs/block-group.h | 1 + fs/btrfs/disk-io.c | 4 ++++ fs/btrfs/extent-tree.c | 37 ++++++++++++++++++++++++------------- fs/btrfs/fs.h | 4 ++++ fs/btrfs/zoned.c | 13 +++++++++++++ fs/btrfs/zoned.h | 17 +++++++++++++++++ 7 files changed, 69 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 73e1270b3904c3..8cfbfcbfbe2fd6 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1090,6 +1090,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&trans->transaction->dirty_bgs_lock); mutex_unlock(&trans->transaction->cache_write_mutex); + spin_lock(&fs_info->zone_alloc_list_lock); + if (!list_empty(&block_group->zoned_alloc_list)) + list_del_init(&block_group->zoned_alloc_list); + spin_unlock(&fs_info->zone_alloc_list_lock); + ret = btrfs_remove_free_space_inode(trans, inode, block_group); if (ret) goto out; @@ -2123,6 +2128,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); INIT_LIST_HEAD(&cache->active_bg_list); + INIT_LIST_HEAD(&cache->zoned_alloc_list); btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 4fdc39f00a4db5..264f7d969bb220 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -245,6 +245,7 @@ struct btrfs_block_group { struct work_struct zone_finish_work; struct extent_buffer *last_eb; enum btrfs_block_group_size_class size_class; + struct list_head zoned_alloc_list; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7586a8e9b71892..20d7780cbc1975 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3062,6 +3062,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { + int i; + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -3109,6 +3111,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); INIT_LIST_HEAD(&fs_info->zone_active_bgs); + for (i = 0; i < BTRFS_NUM_ZONE_ALLOC_LIST; i++) + INIT_LIST_HEAD(&fs_info->zone_alloc_list[i]); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 11559f53a1470d..7f81ad9f03403a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3676,6 +3676,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, u64 data_reloc_bytenr; int ret = 0; bool skip = false; + int cur_bucket, new_bucket; ASSERT(btrfs_is_zoned(block_group->fs_info)); @@ -3801,6 +3802,18 @@ static int do_allocation_zoned(struct btrfs_block_group 
*block_group, ctl->free_space -= num_bytes; spin_unlock(&ctl->tree_lock); + cur_bucket = btrfs_zoned_alloc_bucket(fs_info, avail); + new_bucket = btrfs_zoned_alloc_bucket(fs_info, avail - num_bytes); + if (cur_bucket != new_bucket) { + spin_lock(&fs_info->zone_alloc_list_lock); + if (new_bucket == -1) + list_del_init(&block_group->zoned_alloc_list); + else + list_move_tail(&block_group->zoned_alloc_list, + &fs_info->zone_alloc_list[new_bucket]); + spin_unlock(&fs_info->zone_alloc_list_lock); + } + /* * We do not check if found_offset is aligned to stripesize. The * address is anyway rewritten when using zone append writing. @@ -4155,24 +4168,22 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->relocation_bg_lock); } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { struct btrfs_block_group *block_group; + int bucket = btrfs_zoned_alloc_bucket(fs_info, ffe_ctl->num_bytes); + int i; - spin_lock(&fs_info->zone_active_bgs_lock); - list_for_each_entry(block_group, &fs_info->zone_active_bgs, - active_bg_list) { - /* - * No lock is OK here because avail is - * monotonically decreasing, and this is - * just a hint. - */ - u64 avail = block_group->zone_capacity - block_group->alloc_offset; + spin_lock(&fs_info->zone_alloc_list_lock); + for (i = bucket; i < BTRFS_NUM_ZONE_ALLOC_LIST; i++) { + list_for_each_entry(block_group, &fs_info->zone_alloc_list[i], + zoned_alloc_list) { + if (!block_group_bits(block_group, ffe_ctl->flags)) + continue; - if (block_group_bits(block_group, ffe_ctl->flags) && - avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; - break; + spin_unlock(&fs_info->zone_alloc_list_lock); + return 0; } } - spin_unlock(&fs_info->zone_active_bgs_lock); + spin_unlock(&fs_info->zone_alloc_list_lock); } return 0; default: diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 37b86acfcbcf88..bf48b182b4368c 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -349,6 +349,7 @@ struct btrfs_commit_stats { u64 total_commit_dur; }; +#define BTRFS_NUM_ZONE_ALLOC_LIST 8 struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -761,6 +762,9 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + spinlock_t zone_alloc_list_lock; + struct list_head zone_alloc_list[BTRFS_NUM_ZONE_ALLOC_LIST]; + /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d46701a77b1728..ff393c88657ec9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1551,6 +1551,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (!ret) { + u64 avail = cache->zone_capacity - cache->alloc_offset; + int bucket = btrfs_zoned_alloc_bucket(fs_info, avail); + cache->meta_write_pointer = cache->alloc_offset + cache->start; if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { btrfs_get_block_group(cache); @@ -1559,6 +1562,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); } + if (bucket != -1) { + spin_lock(&fs_info->zone_alloc_list_lock); + list_add_tail(&cache->zoned_alloc_list, + &fs_info->zone_alloc_list[bucket]); + spin_unlock(&fs_info->zone_alloc_list_lock); + } } else { kfree(cache->physical_map); cache->physical_map = NULL; @@ -2041,6 +2050,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); + spin_lock(&fs_info->zone_alloc_list_lock); + if (!list_empty(&block_group->zoned_alloc_list)) + list_del_init(&block_group->zoned_alloc_list); + spin_unlock(&fs_info->zone_alloc_list_lock); spin_unlock(&block_group->lock); map = block_group->physical_map; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d802c..df9cb47a0dd2f2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -418,4 +418,21 @@ static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) return (bg->alloc_offset == bg->zone_capacity); } +static inline int btrfs_zoned_alloc_bucket(struct btrfs_fs_info *fs_info, u64 avail) +{ + u64 bucket_size; + unsigned long bucket_size_shift; + + if (avail == 0) + return -1; + if (avail == fs_info->zone_size) + return BTRFS_NUM_ZONE_ALLOC_LIST - 1; + + // unsigned long max_bit = __ffs64(fs_info->zone_size); + // unsigned long first_bit = __ffs64(avail); + bucket_size = fs_info->zone_size >> const_ilog2(BTRFS_NUM_ZONE_ALLOC_LIST); + bucket_size_shift = ilog2(bucket_size); + return avail >> bucket_size_shift; +} + #endif From e5801efc35239f66dcded16201d1d73898416b72 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 13 Jan 2022 15:55:09 +0900 Subject: [PATCH 44/48] btrfs: zoned: split reclaim --- fs/btrfs/block-group.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 73e1270b3904c3..b0d3377581ecbf 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1830,6 +1830,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) next: btrfs_put_block_group(bg); + + /* let other operations proceed */ + mutex_unlock(&fs_info->reclaim_bgs_lock); + sb_end_write(fs_info->sb); + btrfs_exclop_finish(fs_info); + + cond_resched(); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + sb_start_write(fs_info->sb); + mutex_lock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); From 297bf21ff2270f518a97509dbfd043aea9048688 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 3 Oct 2022 11:12:47 +0900 Subject: [PATCH 45/48] temp-fix --- fs/btrfs/space-info.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 69c09508afb506..653e8183723c8b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -382,6 +382,8 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { + u64 treelog = 0; + /* * On regular filesystem, all total_bytes are always writable. On zoned * filesystem, there may be a limitation imposed by max_active_zones.
@@ -392,7 +394,10 @@ static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) return space_info->total_bytes; - return space_info->active_total_bytes; + if (fs_info->treelog_bg) + treelog = fs_info->zone_size; + + return space_info->active_total_bytes - treelog; } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, From 32aff0df300ba7939fb8eb61890d8317479856e3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 6 Jan 2023 15:49:09 +0900 Subject: [PATCH 46/48] btrfs: zoned: use dedicated callback to populate device->zone_info Currently, btrfs_get_dev_zone_info() copies the zone info into a local "zones" buffer and processes the copied info in a loop. We can replace the loop with a dedicated callback for blkdev_report_zones(), and we can populate device->zone_info on the fly. We can also copy the zone info of the superblock log zones into the zone_info->sb_zones array, reducing the number of REPORT ZONE commands. Signed-off-by: Naohiro Aota --- fs/btrfs/zoned.c | 193 +++++++++++++++++++++-------------------------- 1 file changed, 88 insertions(+), 105 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d46701a77b1728..8d0e6251762c41 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -353,17 +353,90 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) return ret; } +static int populate_zone_info(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct btrfs_device *device = data; + struct btrfs_zoned_device_info *zone_info = device->zone_info; + int i; + + /* Populate bitmaps and track active zones */ + if (zone->type == BLK_ZONE_TYPE_SEQWRITE_REQ) + __set_bit(idx, zone_info->seq_zones); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + __set_bit(idx, zone_info->empty_zones); + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + __set_bit(idx, zone_info->active_zones); + if (zone_info->max_active_zones && + atomic_dec_return(&zone_info->active_zones_left) < 0) { + btrfs_err_in_rcu(device->fs_info, + "zoned: active zones on %s exceeds max_active_zones %u", + rcu_str_deref(device->name), + zone_info->max_active_zones); + return -EIO; + } + /* Overcommit does not work well with active zone tracking. */ + if (zone_info->max_active_zones) + set_bit(BTRFS_FS_NO_OVERCOMMIT, &device->fs_info->flags); + break; + } + + /* Copy and validate superblock zones */ + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + int sb_pos; + u32 sb_zone; + struct blk_zone *sb_zones; + + sb_zone = sb_zone_number(zone_info->zone_size_shift, i); + if (idx < sb_zone || sb_zone + BTRFS_NR_SB_LOG_ZONES <= idx) + continue; + + sb_pos = BTRFS_NR_SB_LOG_ZONES * i; + sb_zones = &zone_info->sb_zones[sb_pos]; + memcpy(&sb_zones[idx - sb_zone], zone, sizeof(*zone)); + + if (idx == sb_zone + BTRFS_NR_SB_LOG_ZONES - 1) { + u64 sb_wp; + int ret; + + /* + * If sb_zones[0] is conventional, always use the + * beginning of the zone to record superblock. No need + * to validate in that case.
+ */ + if (sb_zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) + continue; + + ret = sb_write_pointer(device->bdev, sb_zones, &sb_wp); + if (ret != -ENOENT && ret) { + btrfs_err_in_rcu( + device->fs_info, + "zoned: super block log zone corrupted devid %llu zone %u", + device->devid, sb_zone); + return -EUCLEAN; + } + } + } + + /* Populate zone cache */ + if (zone_info->zone_cache) + memcpy(&zone_info->zone_cache[idx], zone, sizeof(*zone)); + + return 0; +} + int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; unsigned int max_active_zones; - unsigned int nactive; sector_t nr_sectors; - sector_t sector = 0; - struct blk_zone *zones = NULL; - unsigned int i, nreported = 0, nr_zones; sector_t zone_sectors; char *model, *emulated; int ret; @@ -452,6 +525,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } zone_info->max_active_zones = max_active_zones; + atomic_set(&zone_info->active_zones_left, max_active_zones); zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { @@ -471,12 +545,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); - if (!zones) { - ret = -ENOMEM; - goto out; - } - /* * Enable zone cache only for a zoned device. On a non-zoned device, we * fill the zone info with emulated CONVENTIONAL zones, so no need to @@ -494,106 +562,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } } - /* Get zones type */ - nactive = 0; - while (sector < nr_sectors) { - nr_zones = BTRFS_REPORT_NR_ZONES; - ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, - &nr_zones); - if (ret) + if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) { + /* Report all the zones and populate zone_info's structures */ + ret = blkdev_report_zones(device->bdev, 0, BLK_ALL_ZONES, + populate_zone_info, device); + if (ret < 0) goto out; - for (i = 0; i < nr_zones; i++) { - if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) - __set_bit(nreported, zone_info->seq_zones); - switch (zones[i].cond) { - case BLK_ZONE_COND_EMPTY: - __set_bit(nreported, zone_info->empty_zones); - break; - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - __set_bit(nreported, zone_info->active_zones); - nactive++; - break; - } - nreported++; - } - sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; - } - - if (nreported != zone_info->nr_zones) { - btrfs_err_in_rcu(device->fs_info, - "inconsistent number of zones on %s (%u/%u)", - rcu_str_deref(device->name), nreported, - zone_info->nr_zones); - ret = -EIO; - goto out; - } - - if (max_active_zones) { - if (nactive > max_active_zones) { + if (ret != zone_info->nr_zones) { btrfs_err_in_rcu(device->fs_info, - "zoned: %u active zones on %s exceeds max_active_zones %u", - nactive, rcu_str_deref(device->name), - max_active_zones); + "inconsistent number of zones on %s (%u/%u)", + rcu_str_deref(device->name), ret, + zone_info->nr_zones); ret = -EIO; goto out; } - atomic_set(&zone_info->active_zones_left, - max_active_zones - nactive); - /* Overcommit does not work well with active zone tacking.
*/ - set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); } - /* Validate superblock log */ - nr_zones = BTRFS_NR_SB_LOG_ZONES; - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - u32 sb_zone; - u64 sb_wp; - int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; - - sb_zone = sb_zone_number(zone_info->zone_size_shift, i); - if (sb_zone + 1 >= zone_info->nr_zones) - continue; - - ret = btrfs_get_dev_zones(device, - zone_start_physical(sb_zone, zone_info), - &zone_info->sb_zones[sb_pos], - &nr_zones); - if (ret) - goto out; - - if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { - btrfs_err_in_rcu(device->fs_info, - "zoned: failed to read super block log zone info at devid %llu zone %u", - device->devid, sb_zone); - ret = -EUCLEAN; - goto out; - } - - /* - * If zones[0] is conventional, always use the beginning of the - * zone to record superblock. No need to validate in that case. - */ - if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == - BLK_ZONE_TYPE_CONVENTIONAL) - continue; - - ret = sb_write_pointer(device->bdev, - &zone_info->sb_zones[sb_pos], &sb_wp); - if (ret != -ENOENT && ret) { - btrfs_err_in_rcu(device->fs_info, - "zoned: super block log zone corrupted devid %llu zone %u", - device->devid, sb_zone); - ret = -EUCLEAN; - goto out; - } - } - - - kvfree(zones); - switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: model = "host-managed zoned"; @@ -613,7 +598,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) bdev_zoned_model(bdev), rcu_str_deref(device->name)); ret = -EOPNOTSUPP; - goto out_free_zone_info; + goto out; } btrfs_info_in_rcu(fs_info, @@ -624,8 +609,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kvfree(zones); -out_free_zone_info: btrfs_destroy_dev_zone_info(device); return ret; From f4749794b9711de6f652697ff34b7dbd10863aa0 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 20 Dec 2021 11:28:32 +0900 Subject: [PATCH 47/48] btrfs: zoned: trust zone cache if available Now that we populate the zone_cache using the dedicated callback populate_zone_info(), we can remove the cache population code from btrfs_get_dev_zones(). And, as we cache all the zone info at once (with BLK_ALL_ZONES), we can always trust the cache if it is available. 
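Concretely, with a fully populated cache the lookup no longer needs the per-entry validity scan; a minimal sketch of the resulting fast path in btrfs_get_dev_zones(), mirroring the diff below (surrounding declarations abbreviated):

	/* The cache, once allocated, is complete after the mount-time report. */
	if (zinfo->zone_cache) {
		u32 zno = pos >> zinfo->zone_size_shift;

		/* Clamp so we do not copy past the last zone of the device. */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
		memcpy(zones, zinfo->zone_cache + zno,
		       sizeof(*zinfo->zone_cache) * *nr_zones);
		return 0;
	}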
Signed-off-by: Naohiro Aota --- fs/btrfs/zoned.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 8d0e6251762c41..e924101560ad99 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -233,7 +233,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { - unsigned int i; u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); @@ -244,20 +243,9 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, */ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); - for (i = 0; i < *nr_zones; i++) { - struct blk_zone *zone_info; - - zone_info = &zinfo->zone_cache[zno + i]; - if (!zone_info->len) - break; - } - - if (i == *nr_zones) { - /* Cache hit on all the zones */ - memcpy(zones, zinfo->zone_cache + zno, - sizeof(*zinfo->zone_cache) * *nr_zones); - return 0; - } + memcpy(zones, zinfo->zone_cache + zno, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; } ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, @@ -273,14 +261,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!ret) return -EIO; - /* Populate cache */ - if (zinfo->zone_cache) { - u32 zno = pos >> zinfo->zone_size_shift; - - memcpy(zinfo->zone_cache + zno, zones, - sizeof(*zinfo->zone_cache) * *nr_zones); - } - return 0; } From 90ba3167ebcb0b3112951d1c2aa3d921634f2a19 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 24 Nov 2021 15:00:14 +0900 Subject: [PATCH 48/48] btrfs: zoned: reduce the memory footprint of zone cache Currently, btrfs caches the full struct blk_zone, which takes 64 bytes per zone, including 28 reserved bytes. However, we can derive some fields from members of struct btrfs_zoned_device_info. While the cache is only enabled during the mount process, it is still a waste of memory. For example, on a drive with 57,000 zones, it takes 3.5 MB. This commit introduces struct cached_zone to store the necessary information (wp, capacity and cond) from struct blk_zone. This reduces the memory footprint to 24 bytes per zone, bringing the total cache size to 1.3 MB in the above example.
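The arithmetic behind those numbers can be checked with a small standalone userspace sketch; the struct mirrors the cached_zone definition in the diff below, and the sizes assume a typical 64-bit build:

	#include <stdint.h>
	#include <stdio.h>

	/* 8 + 8 + 1 bytes, padded to 24 by the 8-byte alignment of the u64 members */
	struct cached_zone {
		uint64_t wp;
		uint64_t capacity;
		uint8_t cond;
	};

	int main(void)
	{
		const unsigned long long nr_zones = 57000;

		/* sizeof(struct blk_zone) is 64 bytes */
		printf("before: %llu bytes (~3.5 MB)\n", nr_zones * 64);
		printf("after: %llu bytes (~1.3 MB)\n",
		       nr_zones * (unsigned long long)sizeof(struct cached_zone));
		return 0;
	}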
--- fs/btrfs/zoned.c | 50 ++++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/zoned.h | 3 ++- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index e924101560ad99..03172b7ba1082c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -18,6 +18,13 @@ #include "fs.h" #include "accessors.h" +/* Used to cache the necessary information from blk_zone */ +struct cached_zone { + u64 wp; + u64 capacity; + u8 cond; +}; + /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 /* Invalid allocation pointer value for missing devices */ @@ -220,6 +227,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; + struct cached_zone *zone_cache; int ret; if (!*nr_zones) @@ -234,6 +242,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { u32 zno; + const u8 zone_sectors_shift = zinfo->zone_size_shift - SECTOR_SHIFT; + unsigned int i; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -243,8 +253,26 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, */ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); - memcpy(zones, zinfo->zone_cache + zno, - sizeof(*zinfo->zone_cache) * *nr_zones); + for (i = 0; i < *nr_zones; i++) { + zone_cache = &zinfo->zone_cache[zno + i]; + + zones[i].start = (__u64)(zno + i) << zone_sectors_shift; + zones[i].len = (__u64)1 << zone_sectors_shift; + zones[i].wp = zone_cache->wp; + zones[i].cond = zone_cache->cond; + zones[i].capacity = zone_cache->capacity; + + /* + * We don't distinguish SEQWRITE_PREF vs SEQWRITE_REQ in + * zoned btrfs, so we can safely set + * BLK_ZONE_TYPE_SEQWRITE_REQ. + */ + if (test_bit(zno + i, zinfo->seq_zones)) + zones[i].type = BLK_ZONE_TYPE_SEQWRITE_REQ; + else + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; + } + return 0; } @@ -404,8 +432,13 @@ static int populate_zone_info(struct blk_zone *zone, unsigned int idx, } /* Populate zone cache */ - if (zone_info->zone_cache) - memcpy(&zone_info->zone_cache[idx], zone, sizeof(*zone)); + if (zone_info->zone_cache) { + struct cached_zone *cache = &zone_info->zone_cache[idx]; + + cache->wp = zone->wp; + cache->cond = zone->cond; + cache->capacity = zone->capacity; + } return 0; } @@ -531,8 +564,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * use the cache. 
*/ if (populate_cache && bdev_is_zoned(device->bdev)) { - zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * - zone_info->nr_zones); + zone_info->zone_cache = kvmalloc_array(zone_info->nr_zones, + sizeof(struct cached_zone), + GFP_KERNEL); if (!zone_info->zone_cache) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to allocate zone cache for %s", @@ -604,7 +638,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); - vfree(zone_info->zone_cache); + kvfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } @@ -2161,7 +2195,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->zone_info) { - vfree(device->zone_info->zone_cache); + kvfree(device->zone_info->zone_cache); device->zone_info->zone_cache = NULL; } } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d802c..7b24f93097467d 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -13,6 +13,7 @@ #define BTRFS_DEFAULT_RECLAIM_THRESH (75) +struct cached_zone; struct btrfs_zoned_device_info { /* * Number of zones, zone size and types of zones if bdev is a @@ -27,7 +28,7 @@ struct btrfs_zoned_device_info { unsigned long *seq_zones; unsigned long *empty_zones; unsigned long *active_zones; - struct blk_zone *zone_cache; + struct cached_zone *zone_cache; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; };
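Since start and len are no longer stored per zone, btrfs_get_dev_zones() reconstructs them from the zone index and the uniform zone size. The key arithmetic from the diff above, restated as a sketch (names taken from the patch):

	/* zone_size_shift counts bytes; blk_zone fields are in 512-byte sectors. */
	const u8 zone_sectors_shift = zinfo->zone_size_shift - SECTOR_SHIFT;

	/* Zone geometry is uniform across the device, so the index determines it. */
	zones[i].start = (__u64)(zno + i) << zone_sectors_shift;
	zones[i].len = (__u64)1 << zone_sectors_shift;

This is why only wp, capacity and cond need to be cached per zone.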