From 6d0b031e1521ea79508b58d7808bdf419e47966f Mon Sep 17 00:00:00 2001 From: Forza Date: Mon, 7 Apr 2025 15:25:19 +0200 Subject: [PATCH] Btrfs allocator hints: v3 with RAID1 read policy --- Btrfs/Allocator Hints/README.md | 5 +- .../btrfs_allocator_hints-6.12_v3.patch | 1594 +++++++++++++++++ 2 files changed, 1597 insertions(+), 2 deletions(-) create mode 100644 Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v3.patch diff --git a/Btrfs/Allocator Hints/README.md b/Btrfs/Allocator Hints/README.md index 7ae43c2..4c4c01a 100644 --- a/Btrfs/Allocator Hints/README.md +++ b/Btrfs/Allocator Hints/README.md @@ -15,7 +15,8 @@ The patches in this repository are named according to the kernel version they ar - **btrfs_allocator_hints-6.6_v3.patch**: Kernel version 6.6 with further tweaks - **btrfs_allocator_hints-6.6_v4.patch**: Latest patch for kernel version 6.6 - **btrfs_allocator_hints-6.12_v1.patch**: For kernel version 6.12 -- **btrfs_allocator_hints-6.12_v2.patch**: Latest patch for kernel version 6.12 +- **btrfs_allocator_hints-6.12_v2.patch**: Second iteration for kernel version 6.12 +- **btrfs_allocator_hints-6.12_v3.patch**: Latest patch for kernel version 6.12 with RAID1 read balance policy Each patch is tailored to work with its respective kernel version. Ensure you match the patch version to your Linux kernel version to avoid compatibility issues. @@ -36,4 +37,4 @@ The licensing of these patches has not been explicitly stated. However, given th If you have any doubts or require confirmation, it is recommended to contact the original authors of the patches (e.g., Goffredo Baroncelli or Kakra) for clarification. -For more details about the GPLv2 license, see [https://www.gnu.org/licenses/old-licenses/gpl-2.0.html](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html). \ No newline at end of file +For more details about the GPLv2 license, see [https://www.gnu.org/licenses/old-licenses/gpl-2.0.html](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html). 
diff --git a/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v3.patch b/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v3.patch new file mode 100644 index 0000000..ae70ca6 --- /dev/null +++ b/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v3.patch @@ -0,0 +1,1594 @@ +From 5e49c78f38cc7f5b7ec012021c8422c1db98ef7e Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:04 +0200 +Subject: [PATCH 01/18] btrfs: add flags to give an hint to the chunk allocator + +Add the following flags to give a hint about which chunk type should be +allocated on which disk. +The following flags are created: + +- BTRFS_DEV_ALLOCATION_PREFERRED_DATA + preferred data chunk, but metadata chunk allowed +- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA + preferred metadata chunk, but data chunk allowed +- BTRFS_DEV_ALLOCATION_METADATA_ONLY + only metadata chunk allowed +- BTRFS_DEV_ALLOCATION_DATA_ONLY + only data chunk allowed + +Signed-off-by: Goffredo Baroncelli +--- + include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index fc29d273845d84..71c6135dc7cfb2 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -578,6 +578,20 @@ struct btrfs_node { + struct btrfs_key_ptr ptrs[]; + } __attribute__ ((__packed__)); + ++/* dev_item.type */ ++ ++/* btrfs chunk allocation hints */ ++#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3 ++/* preferred data chunk, but metadata chunk allowed */ ++#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL) ++/* preferred metadata chunk, but data chunk allowed */ ++#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL) ++/* only metadata chunk are allowed */ ++#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) ++/* only data chunk allowed */ ++#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) ++/* 5..7 are unused values */ ++ + struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64
devid; + +From 160344ae9ae37b32593adc43716172c37b0a734c Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:05 +0200 +Subject: [PATCH 02/18] btrfs: export dev_item.type in + /sys/fs/btrfs/<UUID>/devinfo/<devid>/type + +Signed-off-by: Goffredo Baroncelli +--- + fs/btrfs/sysfs.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 03926ad467c919..fe07a7cbcf74c4 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1972,6 +1972,16 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, + } + BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); + ++static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, ++ struct kobj_attribute *a, char *buf) ++{ ++ struct btrfs_device *device = container_of(kobj, struct btrfs_device, ++ devid_kobj); ++ ++ return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); ++} ++BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); ++ + /* + * Information about one device.
+ * +@@ -1985,6 +1995,7 @@ static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, scrub_speed_max), + BTRFS_ATTR_PTR(devid, writeable), ++ BTRFS_ATTR_PTR(devid, type), + NULL + }; + ATTRIBUTE_GROUPS(devid); + +From 29637f2e3a69fe77a8097bd772a8a7803b9ec576 Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:06 +0200 +Subject: [PATCH 03/18] btrfs: change the DEV_ITEM 'type' field via sysfs + +Signed-off-by: Kai Krakow +--- + fs/btrfs/sysfs.c | 56 +++++++++++++++++++++++++++++++++++++++++++++- + fs/btrfs/volumes.c | 2 +- + fs/btrfs/volumes.h | 2 ++ + 3 files changed, 58 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index fe07a7cbcf74c4..3675d961b39a2a 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1980,7 +1980,61 @@ static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, + + return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); + } +-BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); ++ ++static ssize_t btrfs_devinfo_type_store(struct kobject *kobj, ++ struct kobj_attribute *a, ++ const char *buf, size_t len) ++{ ++ struct btrfs_fs_info *fs_info; ++ struct btrfs_root *root; ++ struct btrfs_device *device; ++ int ret; ++ struct btrfs_trans_handle *trans; ++ ++ u64 type, prev_type; ++ ++ device = container_of(kobj, struct btrfs_device, devid_kobj); ++ fs_info = device->fs_info; ++ if (!fs_info) ++ return -EPERM; ++ ++ root = fs_info->chunk_root; ++ if (sb_rdonly(fs_info->sb)) ++ return -EROFS; ++ ++ ret = kstrtou64(buf, 0, &type); ++ if (ret < 0) ++ return -EINVAL; ++ ++ /* for now, allow to touch only the 'allocation hint' bits */ ++ if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)) ++ return -EINVAL; ++ ++ trans = btrfs_start_transaction(root, 1); ++ if (IS_ERR(trans)) ++ return PTR_ERR(trans); ++ ++ prev_type = device->type; ++ device->type = type; ++ ++ ret = btrfs_update_device(trans, device); ++ ++ if (ret < 
0) { ++ btrfs_abort_transaction(trans, ret); ++ btrfs_end_transaction(trans); ++ goto abort; ++ } ++ ++ ret = btrfs_commit_transaction(trans); ++ if (ret < 0) ++ goto abort; ++ ++ return len; ++abort: ++ device->type = prev_type; ++ return ret; ++} ++BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store); + + /* + * Information about one device. +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index eb51b609190fb5..620a9ea74e7558 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2882,7 +2882,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path + return ret; + } + +-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, ++noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) + { + int ret; +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 4481575dd70f35..7bb14d51bffc58 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -836,6 +836,8 @@ int btrfs_bg_type_to_factor(u64 flags); + const char *btrfs_bg_type_to_raid_name(u64 flags); + int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); + bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); ++int btrfs_update_device(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device); + + bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); + +From 970b99e160487e9765b6e7db9f8a89a96ce79811 Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:07 +0200 +Subject: [PATCH 04/18] btrfs: add allocator_hint mode + +When this mode is enabled, the chunk allocation policy is modified as +follow. 
+ +Each disk may have a different tag: +- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA +- BTRFS_DEV_ALLOCATION_METADATA_ONLY +- BTRFS_DEV_ALLOCATION_DATA_ONLY +- BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default) + +Where: +- ALLOCATION_PREFERRED_X means that it is preferred to use this disk for +the X chunk type (the other type may be allowed when the space is low) +- ALLOCATION_X_ONLY means that it is used *only* for the X chunk type. +This means also that it is a preferred choice. + +Each time the allocator allocates a chunk of type X , first it takes the +disks tagged as ALLOCATION_X_ONLY or ALLOCATION_PREFERRED_X; if the space +is not enough, it uses also the disks tagged as ALLOCATION_METADATA_ONLY; +if the space is not enough, it uses also the other disks, with the +exception of the one marked as ALLOCATION_PREFERRED_Y, where Y the other +type of chunk (i.e. not X). + +Signed-off-by: Goffredo Baroncelli +--- + fs/btrfs/volumes.c | 97 +++++++++++++++++++++++++++++++++++++++++++++- + fs/btrfs/volumes.h | 1 + + 2 files changed, 97 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 620a9ea74e7558..e66700fc8dcd4e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -184,6 +184,19 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags + return BTRFS_BG_FLAG_TO_INDEX(profile); + } + ++#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \ ++ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1) ++#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \ ++ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) ++ ++static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { ++ [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, ++ [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, ++ [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, ++ [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, ++ /* the other values are set to 0 */ ++}; ++ + const char *btrfs_bg_type_to_raid_name(u64 flags) + { + const int index = btrfs_bg_flags_to_raid_index(flags); +@@ -5022,13 +5035,18 
@@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, + } + + /* +- * sort the devices in descending order by max_avail, total_avail ++ * sort the devices in descending order by alloc_hint, ++ * max_avail, total_avail + */ + static int btrfs_cmp_device_info(const void *a, const void *b) + { + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + ++ if (di_a->alloc_hint > di_b->alloc_hint) ++ return -1; ++ if (di_a->alloc_hint < di_b->alloc_hint) ++ return 1; + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) +@@ -5181,6 +5199,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + int ndevs = 0; + u64 max_avail; + u64 dev_offset; ++ int hint; ++ int i; + + /* + * in the first pass through the devices list, we gather information +@@ -5233,16 +5253,91 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; ++ ++ if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && ++ (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { ++ /* ++ * if mixed bg set all the alloc_hint ++ * fields to the same value, so the sorting ++ * is not affected ++ */ ++ devices_info[ndevs].alloc_hint = 0; ++ } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { ++ hint = device->type & BTRFS_DEV_ALLOCATION_MASK; ++ ++ /* ++ * skip BTRFS_DEV_METADATA_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY) ++ continue; ++ /* ++ * if a data chunk must be allocated, ++ * sort also by hint (data disk ++ * higher priority) ++ */ ++ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; ++ } else { /* BTRFS_BLOCK_GROUP_METADATA */ ++ hint = device->type & BTRFS_DEV_ALLOCATION_MASK; ++ ++ /* ++ * skip BTRFS_DEV_DATA_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY) ++ continue; ++ /* ++ * if a data chunk must be allocated, ++ * sort also by hint 
(metadata hint ++ * higher priority) ++ */ ++ devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; ++ } ++ + ++ndevs; + } + ctl->ndevs = ndevs; + ++ /* ++ * no devices available ++ */ ++ if (!ndevs) ++ return 0; ++ + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + ++ /* ++ * select the minimum set of disks grouped by hint that ++ * can host the chunk ++ */ ++ ndevs = 0; ++ while (ndevs < ctl->ndevs) { ++ hint = devices_info[ndevs++].alloc_hint; ++ while (ndevs < ctl->ndevs && ++ devices_info[ndevs].alloc_hint == hint) ++ ndevs++; ++ if (ndevs >= ctl->devs_min) ++ break; ++ } ++ ++ BUG_ON(ndevs > ctl->ndevs); ++ ctl->ndevs = ndevs; ++ ++ /* ++ * the next layers require the devices_info ordered by ++ * max_avail. If we are returing two (or more) different ++ * group of alloc_hint, this is not always true. So sort ++ * these gain. ++ */ ++ ++ for (i = 0 ; i < ndevs ; i++) ++ devices_info[i].alloc_hint = 0; ++ ++ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), ++ btrfs_cmp_device_info, NULL); ++ + return 0; + } + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 7bb14d51bffc58..f3c5437e270a22 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -565,6 +565,7 @@ struct btrfs_device_info { + u64 dev_offset; + u64 max_avail; + u64 total_avail; ++ int alloc_hint; + }; + + struct btrfs_raid_attr { + +From 1c1f2e27d3055b7721468c6980479a043f48e2b3 Mon Sep 17 00:00:00 2001 +From: Kai Krakow +Date: Thu, 27 Jun 2024 20:05:58 +0200 +Subject: [PATCH 05/18] btrfs: add allocator_hint for no allocation preferred + +This is useful where you want to prevent new allocations of chunks on a +disk which is going to removed from the pool anyways, e.g. due to bad +blocks or because it's slow. 
+ +Signed-off-by: Kai Krakow +--- + fs/btrfs/volumes.c | 6 +++++- + include/uapi/linux/btrfs_tree.h | 2 ++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index e66700fc8dcd4e..c6aa93fae9aa65 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -194,6 +194,7 @@ static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, + [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, + [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, ++ [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99, + /* the other values are set to 0 */ + }; + +@@ -5289,7 +5290,10 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + * sort also by hint (metadata hint + * higher priority) + */ +- devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; ++ if (hint == BTRFS_DEV_ALLOCATION_PREFERRED_NONE) ++ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; ++ else ++ devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + } + + ++ndevs; +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index 71c6135dc7cfb2..92bcc59b129a97 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -590,6 +590,8 @@ struct btrfs_node { + #define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) + /* only data chunk allowed */ + #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) ++/* preferred no chunk, but chunks allowed */ ++#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) + /* 5..7 are unused values */ + + struct btrfs_dev_item { + +From 82553effe6b655f97478b6d13df7ab0ecc192e58 Mon Sep 17 00:00:00 2001 +From: Kai Krakow +Date: Fri, 6 Dec 2024 00:55:31 +0100 +Subject: [PATCH 06/18] btrfs: add allocator_hint to disable allocation + completely + +This is useful where you want to prevent new allocations of chunks to +a set of multiple disks which are going to be removed from the pool. 
+This acts as a multiple `btrfs dev remove` on steroids that can remove +multiple disks in parallel without moving data to disks which would be +removed in the next round. In such cases, it will avoid moving the +same data multiple times, and thus avoid placing it on potentially bad +disks. + +Thanks to @Zygo for the explanation and suggestion. + +Link: https://github.com/kdave/btrfs-progs/issues/907#issuecomment-2520897104 +Signed-off-by: Kai Krakow +--- + fs/btrfs/volumes.c | 11 +++++++++++ + include/uapi/linux/btrfs_tree.h | 4 +++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index c6aa93fae9aa65..99d2c60ac2bf3e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -190,6 +190,7 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) + + static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { ++ [BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99, + [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, + [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, +@@ -5271,6 +5272,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + */ + if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY) + continue; ++ /* ++ * skip BTRFS_DEV_NONE_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY) ++ continue; + /* + * if a data chunk must be allocated, + * sort also by hint (data disk +@@ -5285,6 +5291,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + */ + if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY) + continue; ++ /* ++ * skip BTRFS_DEV_NONE_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY) ++ continue; + /* + * if a data chunk must be allocated, + * sort also by hint (metadata hint +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index 92bcc59b129a97..3db20734aacfc6 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ 
b/include/uapi/linux/btrfs_tree.h +@@ -592,7 +592,9 @@ struct btrfs_node { + #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) + /* preferred no chunk, but chunks allowed */ + #define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) +-/* 5..7 are unused values */ ++/* no chunks allowed */ ++#define BTRFS_DEV_ALLOCATION_NONE_ONLY (5ULL) ++/* 6..7 are unused values */ + + struct btrfs_dev_item { + /* the internal btrfs device id */ + +From 10248db4c682397c83b99daa2de4ee0e587c0be2 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:31 +0800 +Subject: [PATCH 07/18] btrfs: simplify output formatting in + btrfs_read_policy_show + +Refactor the logic in btrfs_read_policy_show() to streamline the +formatting of read policies output. Streamline the space and bracket +handling around the active policy without altering the functional output. +This is in preparation to add more methods. + +Signed-off-by: Anand Jain +--- + fs/btrfs/sysfs.c | 18 ++++++++++-------- + 1 file changed, 10 insertions(+), 8 deletions(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 3675d961b39a2a..cde47f1c11757f 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1316,14 +1316,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { +- if (policy == i) +- ret += sysfs_emit_at(buf, ret, "%s[%s]", +- (ret == 0 ? "" : " "), +- btrfs_read_policy_name[i]); +- else +- ret += sysfs_emit_at(buf, ret, "%s%s", +- (ret == 0 ? 
"" : " "), +- btrfs_read_policy_name[i]); ++ if (ret != 0) ++ ret += sysfs_emit_at(buf, ret, " "); ++ ++ if (i == policy) ++ ret += sysfs_emit_at(buf, ret, "["); ++ ++ ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); ++ ++ if (i == policy) ++ ret += sysfs_emit_at(buf, ret, "]"); + } + + ret += sysfs_emit_at(buf, ret, "\n"); + +From 4a49a279c14d9003fd7d4865706bc78142bf1645 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:30 +0800 +Subject: [PATCH 08/18] btrfs: initialize fs_devices->fs_info earlier + +Currently, fs_devices->fs_info is initialized in btrfs_init_devices_late(), +but this occurs too late for find_live_mirror(), which is invoked by +load_super_root() much earlier than btrfs_init_devices_late(). + +Fix this by moving the initialization to open_ctree(), before load_super_root(). + +Reviewed-by: Naohiro Aota +Signed-off-by: Anand Jain +--- + fs/btrfs/disk-io.c | 1 + + fs/btrfs/volumes.c | 2 -- + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index b11bfe68dd65fb..a4d2c5bcd93c52 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3324,6 +3324,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; + fs_info->stripesize = stripesize; ++ fs_info->fs_devices->fs_info = fs_info; + + /* + * Handle the space caching options appropriately now that we have the +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 99d2c60ac2bf3e..21cc02df8edf06 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -7577,8 +7577,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) + struct btrfs_device *device; + int ret = 0; + +- fs_devices->fs_info = fs_info; +- + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + 
device->fs_info = fs_info; + +From ccb29226710d52abbd737fd0b2f438022c045af4 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:32 +0800 +Subject: [PATCH 09/18] btrfs: add btrfs_read_policy_to_enum helper and + refactor read policy store + +Introduce the `btrfs_read_policy_to_enum` helper function to simplify the +conversion of a string read policy to its corresponding enum value. This +reduces duplication and improves code clarity in `btrfs_read_policy_store`. +The `btrfs_read_policy_store` function has been refactored to use the new +helper. + +The parameter is copied locally to allow modification, enabling the +separation of the method and its value. This prepares for the addition of +more functionality in subsequent patches. + +Signed-off-by: Anand Jain +--- + fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++------------ + 1 file changed, 22 insertions(+), 12 deletions(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index cde47f1c11757f..8540af0807648e 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1307,6 +1307,18 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + + static const char * const btrfs_read_policy_name[] = { "pid" }; + ++static int btrfs_read_policy_to_enum(const char *str) ++{ ++ char param[32] = {'\0'}; ++ ++ if (!str || strlen(str) == 0) ++ return 0; ++ ++ strncpy(param, str, sizeof(param) - 1); ++ ++ return sysfs_match_string(btrfs_read_policy_name, param); ++} ++ + static ssize_t btrfs_read_policy_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) + { +@@ -1338,21 +1350,19 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, + const char *buf, size_t len) + { + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); +- int i; ++ int index; + +- for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { +- if (sysfs_streq(buf, btrfs_read_policy_name[i])) { +- if (i != READ_ONCE(fs_devices->read_policy)) { +- WRITE_ONCE(fs_devices->read_policy, i); +- btrfs_info(fs_devices->fs_info, +- "read policy 
set to '%s'", +- btrfs_read_policy_name[i]); +- } +- return len; +- } ++ index = btrfs_read_policy_to_enum(buf); ++ if (index < 0) ++ return -EINVAL; ++ ++ if (index != READ_ONCE(fs_devices->read_policy)) { ++ WRITE_ONCE(fs_devices->read_policy, index); ++ btrfs_info(fs_devices->fs_info, "read policy set to '%s'", ++ btrfs_read_policy_name[index]); + } + +- return -EINVAL; ++ return len; + } + BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); + + +From 0d64f4e5c07f46183984b5a407032c3fc36e3f3a Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:34 +0800 +Subject: [PATCH 10/18] btrfs: add read count tracking for filesystem stats + +Add fs_devices::read_cnt_blocks to track read blocks, initialize it in +open_fs_devices() and clean it up in close_fs_devices(). +btrfs_submit_dev_bio() increments it for reads when stats tracking is +enabled. Stats tracking is disabled by default and is enabled through +fs_devices::fs_stats when required. + +The code is not under the EXPERIMENTAL define, as stats can be expanded +to include write counts and other performance counters, with the user +interface independent of its internal use. + +This is an in-memory-only feature, different to the dev error stats. + +Signed-off-by: Anand Jain +--- + fs/btrfs/bio.c | 8 ++++++++ + fs/btrfs/volumes.c | 8 +++++++- + fs/btrfs/volumes.h | 7 ++++++- + 3 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c +index 7e0f9600b80c43..24f2c77983faf4 100644 +--- a/fs/btrfs/bio.c ++++ b/fs/btrfs/bio.c +@@ -450,6 +450,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), + dev->devid, bio->bi_iter.bi_size); + ++ /* ++ * Track reads if tracking is enabled; ignore I/O operations before ++ * fully initialized. 
++ */ ++ if (dev->fs_devices->fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) ++ percpu_counter_add(&dev->fs_devices->read_cnt_blocks, ++ bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); ++ + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) + blkcg_punt_bio_submit(bio); + else +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 21cc02df8edf06..a241e0684741a0 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1161,6 +1161,7 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) + btrfs_close_one_device(device); + ++ percpu_counter_destroy(&fs_devices->read_cnt_blocks); + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; +@@ -1207,6 +1208,11 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *tmp_device; + int ret = 0; + ++ /* Initialize the in-memory record of filesystem read count */ ++ ret = percpu_counter_init(&fs_devices->read_cnt_blocks, 0, GFP_KERNEL); ++ if (ret) ++ return ret; ++ + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret2; +@@ -7678,7 +7684,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) +- goto out; ++ return ret; + } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index f3c5437e270a22..d479647af94f73 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -185,7 +185,7 @@ struct btrfs_device { + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; + +- /* Counter to record the change of device stats */ ++ /* Counter to record of the change of device stats */ + atomic_t dev_stats_ccnt; + atomic_t 
dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; + +@@ -417,6 +417,8 @@ struct btrfs_fs_devices { + bool seeding; + /* The mount needs to use a randomly generated fsid. */ + bool temp_fsid; ++ /* Enable/disable the filesystem stats tracking */ ++ bool fs_stats; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ +@@ -427,6 +429,9 @@ struct btrfs_fs_devices { + + enum btrfs_chunk_allocation_policy chunk_alloc_policy; + ++ /* Tracks the number of blocks (sectors) read from the filesystem. */ ++ struct percpu_counter read_cnt_blocks; ++ + /* Policy used to read the mirrored stripes. */ + enum btrfs_read_policy read_policy; + + +From 9574a9b5e70fabb7e1f255ec023a91c464a07f99 Mon Sep 17 00:00:00 2001 +From: Kai Krakow +Date: Mon, 16 Sep 2024 18:18:25 +0930 +Subject: [PATCH 11/18] btrfs: introduce CONFIG_BTRFS_EXPERIMENTAL from 6.13 + +CONFIG_BTRFS_EXPERIMENTAL is needed by the RAID1 balancing patches but +we don't want to use the full scope of the 6.13 patch because it also +affects features currently masked via CONFIG_BTRFS_DEBUG. + +TODO: Drop during rebase to 6.13 or later. +Original-author: Qu Wenruo +Signed-off-by: Kai Krakow +--- + fs/btrfs/Kconfig | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig +index 4fb925e8c981d8..ead317f1eeb859 100644 +--- a/fs/btrfs/Kconfig ++++ b/fs/btrfs/Kconfig +@@ -78,6 +78,15 @@ config BTRFS_ASSERT + + If unsure, say N. + ++config BTRFS_EXPERIMENTAL ++ bool "Btrfs experimental features" ++ depends on BTRFS_FS ++ help ++ Enable experimental features. These features may not be stable enough ++ for end users. This is meant for btrfs developers only. ++ ++ If unsure, say N. 
++ + config BTRFS_FS_REF_VERIFY + bool "Btrfs with the ref verify tool compiled in" + depends on BTRFS_FS + +From 12d99a1aad06ab2193ba051142cf5b96fef90e57 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:33 +0800 +Subject: [PATCH 12/18] btrfs: handle value associated with raid1 balancing + parameter + +This change enables specifying additional configuration values alongside +the raid1 balancing / read policy in a single input string. + +Update btrfs_read_policy_to_enum() to parse and handle a value associated +with the policy in the format `policy:value`; the value part, if present, is +converted to a 64-bit integer. Update btrfs_read_policy_store() to accommodate +the new parameter. + +Signed-off-by: Anand Jain +--- + fs/btrfs/sysfs.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 8540af0807648e..b0e624c0598f48 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1307,15 +1307,26 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + + static const char * const btrfs_read_policy_name[] = { "pid" }; + +-static int btrfs_read_policy_to_enum(const char *str) ++static int btrfs_read_policy_to_enum(const char *str, s64 *value) + { + char param[32] = {'\0'}; ++ char *__maybe_unused value_str; + + if (!str || strlen(str) == 0) + return 0; + + strncpy(param, str, sizeof(param) - 1); + ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ /* Separate value from input in policy:value format.
*/ ++ if ((value_str = strchr(param, ':'))) { ++ *value_str = '\0'; ++ value_str++; ++ if (value && kstrtou64(value_str, 10, value) != 0) ++ return -EINVAL; ++ } ++#endif ++ + return sysfs_match_string(btrfs_read_policy_name, param); + } + +@@ -1351,8 +1362,9 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, + { + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int index; ++ s64 value = -1; + +- index = btrfs_read_policy_to_enum(buf); ++ index = btrfs_read_policy_to_enum(buf, &value); + if (index < 0) + return -EINVAL; + + +From f8cb6bc96502ae95523385c28078789b0c6ad90c Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:35 +0800 +Subject: [PATCH 13/18] btrfs: introduce RAID1 round-robin read balancing + +This feature balances I/O across the striped devices when reading from +RAID1 blocks. + + echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<UUID>/read_policy + +The min_contiguous_read parameter defines the minimum read size before +switching to the next mirrored device. This setting is optional, with a +default value of 192KiB.
+ +Signed-off-by: Anand Jain +--- + fs/btrfs/sysfs.c | 49 ++++++++++++++++++++++++++++++- + fs/btrfs/volumes.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/volumes.h | 11 +++++++ + 3 files changed, 131 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index b0e624c0598f48..25bbbbc56e3fdc 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, + } + BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + +-static const char * const btrfs_read_policy_name[] = { "pid" }; ++static const char *btrfs_read_policy_name[] = { ++ "pid", ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ "round-robin", ++#endif ++}; + + static int btrfs_read_policy_to_enum(const char *str, s64 *value) + { +@@ -1347,6 +1352,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ if (i == BTRFS_READ_POLICY_RR) ++ ret += sysfs_emit_at(buf, ret, ":%d", ++ READ_ONCE(fs_devices->rr_min_contiguous_read)); ++#endif ++ + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); + } +@@ -1368,6 +1379,42 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, + if (index < 0) + return -EINVAL; + ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ /* If moving out of RR then disable fs_stats */ ++ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && ++ index != BTRFS_READ_POLICY_RR) ++ fs_devices->fs_stats = false; ++ ++ if (index == BTRFS_READ_POLICY_RR) { ++ if (value != -1) { ++ u32 sectorsize = fs_devices->fs_info->sectorsize; ++ ++ if (!IS_ALIGNED(value, sectorsize)) { ++ u64 temp_value = round_up(value, sectorsize); ++ ++ btrfs_warn(fs_devices->fs_info, ++"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu", ++ value, sectorsize, temp_value); ++ value = temp_value; ++ } ++ } else { ++ value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; ++ } 
++ ++ if (index != READ_ONCE(fs_devices->read_policy) || ++ value != READ_ONCE(fs_devices->rr_min_contiguous_read)) { ++ WRITE_ONCE(fs_devices->read_policy, index); ++ WRITE_ONCE(fs_devices->rr_min_contiguous_read, value); ++ ++ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", ++ btrfs_read_policy_name[index], value); ++ } ++ ++ fs_devices->fs_stats = true; ++ ++ return len; ++ } ++#endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index a241e0684741a0..e19eb176c0a362 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1241,6 +1241,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, + fs_devices->total_rw_bytes = 0; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; + fs_devices->read_policy = BTRFS_READ_POLICY_PID; ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; ++#endif + + return 0; + } +@@ -5976,6 +5979,70 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) + return ret; + } + ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++struct stripe_mirror { ++ u64 devid; ++ int num; ++}; ++ ++static int btrfs_cmp_devid(const void *a, const void *b) ++{ ++ const struct stripe_mirror *s1 = (struct stripe_mirror *)a; ++ const struct stripe_mirror *s2 = (struct stripe_mirror *)b; ++ ++ if (s1->devid < s2->devid) ++ return -1; ++ if (s1->devid > s2->devid) ++ return 1; ++ return 0; ++} ++ ++/* ++ * btrfs_read_rr. ++ * ++ * Select a stripe for reading using a round-robin algorithm: ++ * ++ * 1. Compute the read cycle as the total sectors read divided by the minimum ++ * sectors per device. ++ * 2. 
Determine the stripe number for the current read by taking the modulus ++ * of the read cycle with the total number of stripes: ++ * ++ * stripe index = (total sectors / min sectors per dev) % num stripes ++ * ++ * The calculated stripe index is then used to select the corresponding device ++ * from the list of devices, which is ordered by devid. ++ */ ++static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe) ++{ ++ struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0}; ++ struct btrfs_device *device = map->stripes[first].dev; ++ struct btrfs_fs_devices *fs_devices = device->fs_devices; ++ int read_cycle; ++ int index; ++ int ret_stripe; ++ int total_reads; ++ int min_reads_per_dev; ++ ++ total_reads = percpu_counter_sum(&fs_devices->read_cnt_blocks); ++ min_reads_per_dev = READ_ONCE(fs_devices->rr_min_contiguous_read) >> ++ fs_devices->fs_info->sectorsize_bits; ++ ++ index = 0; ++ for (int i = first; i < first + num_stripe; i++) { ++ stripes[index].devid = map->stripes[i].dev->devid; ++ stripes[index].num = i; ++ index++; ++ } ++ sort(stripes, num_stripe, sizeof(struct stripe_mirror), ++ btrfs_cmp_devid, NULL); ++ ++ read_cycle = total_reads / min_reads_per_dev; ++ ret_stripe = stripes[read_cycle % num_stripe].num; ++ ++ return ret_stripe; ++} ++#endif ++ + static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, int first, + int dev_replace_is_ongoing) +@@ -6005,6 +6072,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, + case BTRFS_READ_POLICY_PID: + preferred_mirror = first + (current->pid % num_stripes); + break; ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ case BTRFS_READ_POLICY_RR: ++ preferred_mirror = btrfs_read_rr(map, first, num_stripes); ++ break; ++#endif + } + + if (dev_replace_is_ongoing && +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index d479647af94f73..a7c18b804f02bd 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -296,6 +296,8 @@ enum 
btrfs_chunk_allocation_policy { + BTRFS_CHUNK_ALLOC_ZONED, + }; + ++#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ (SZ_256K) ++#define BTRFS_RAID1_MAX_MIRRORS (4) + /* + * Read policies for mirrored block group profiles, read picks the stripe based + * on these policies. +@@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy { + enum btrfs_read_policy { + /* Use process PID to choose the stripe */ + BTRFS_READ_POLICY_PID, ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ /* Balancing raid1 reads across all striped devices (round-robin) */ ++ BTRFS_READ_POLICY_RR, ++#endif + BTRFS_NR_READ_POLICY, + }; + +@@ -435,6 +441,11 @@ struct btrfs_fs_devices { + /* Policy used to read the mirrored stripes. */ + enum btrfs_read_policy read_policy; + ++ #ifdef CONFIG_BTRFS_EXPERIMENTAL ++ /* Min contiguous reads before switching to next device. */ ++ int rr_min_contiguous_read; ++#endif ++ + #ifdef CONFIG_BTRFS_DEBUG + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; + +From 9a762b6f63f367856bbd521c184ceb3a0260def0 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:36 +0800 +Subject: [PATCH 14/18] btrfs: add RAID1 preferred read device + +When there's stale data on a mirrored device, this feature lets you choose +which device to read from. Mainly used for testing. 
+ +echo "devid:<devid>" > /sys/fs/btrfs/<UUID>/read_policy + +Signed-off-by: Anand Jain +--- + fs/btrfs/sysfs.c | 33 ++++++++++++++++++++++++++++++++- + fs/btrfs/volumes.c | 21 +++++++++++++++++++++ + fs/btrfs/volumes.h | 5 +++++ + 3 files changed, 58 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 25bbbbc56e3fdc..fb0bb7d830b8e8 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = { + "pid", + #ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", ++ "devid", + #endif + }; + +@@ -1356,8 +1357,11 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%d", + READ_ONCE(fs_devices->rr_min_contiguous_read)); +-#endif + ++ if (i == BTRFS_READ_POLICY_DEVID) ++ ret += sysfs_emit_at(buf, ret, ":%llu", ++ READ_ONCE(fs_devices->read_devid)); ++#endif + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); + } +@@ -1414,6 +1418,33 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, + + return len; + } ++ ++ if (index == BTRFS_READ_POLICY_DEVID) { ++ ++ if (value != -1) { ++ BTRFS_DEV_LOOKUP_ARGS(args); ++ ++ /* Validate input devid */ ++ args.devid = value; ++ if (btrfs_find_device(fs_devices, &args) == NULL) ++ return -EINVAL; ++ } else { ++ /* Set default devid to the devid of the latest device */ ++ value = fs_devices->latest_dev->devid; ++ } ++ ++ if (index != READ_ONCE(fs_devices->read_policy) || ++ (value != READ_ONCE(fs_devices->read_devid))) { ++ WRITE_ONCE(fs_devices->read_policy, index); ++ WRITE_ONCE(fs_devices->read_devid, value); ++ ++ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", ++ btrfs_read_policy_name[index], value); ++ ++ } ++ ++ return len; ++ } + #endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index e19eb176c0a362..4037cd98c453eb 100644 +---
a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1243,6 +1243,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, + fs_devices->read_policy = BTRFS_READ_POLICY_PID; + #ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; ++ fs_devices->read_devid = latest_dev->devid; + #endif + + return 0; +@@ -5980,6 +5981,23 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) + } + + #ifdef CONFIG_BTRFS_EXPERIMENTAL ++static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, ++ int num_stripe) ++{ ++ int last = first + num_stripe; ++ int stripe_index; ++ ++ for (stripe_index = first; stripe_index < last; stripe_index++) { ++ struct btrfs_device *device = map->stripes[stripe_index].dev; ++ ++ if (device->devid == READ_ONCE(device->fs_devices->read_devid)) ++ return stripe_index; ++ } ++ ++ /* If no read-preferred device, use first stripe */ ++ return first; ++} ++ + struct stripe_mirror { + u64 devid; + int num; +@@ -6076,6 +6094,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; ++ case BTRFS_READ_POLICY_DEVID: ++ preferred_mirror = btrfs_read_preferred(map, first, num_stripes); ++ break; + #endif + } + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index a7c18b804f02bd..4a8ae242ad6feb 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -308,6 +308,8 @@ enum btrfs_read_policy { + #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing raid1 reads across all striped devices (round-robin) */ + BTRFS_READ_POLICY_RR, ++ /* Read from the specific device */ ++ BTRFS_READ_POLICY_DEVID, + #endif + BTRFS_NR_READ_POLICY, + }; +@@ -444,6 +446,9 @@ struct btrfs_fs_devices { + #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Min contiguous reads before switching to next device. 
*/ + int rr_min_contiguous_read; ++ ++ /* Device to be used for reading in case of RAID1. */ ++ u64 read_devid; + #endif + + #ifdef CONFIG_BTRFS_DEBUG + +From 5b3086c2eef1045362a9bf1790653c6ea69ffa72 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:37 +0800 +Subject: [PATCH 15/18] btrfs: expose experimental mode in module information + +Commit c9c49e8f157e ("btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from +CONFIG_BTRFS_DEBUG") introduces a way to enable or disable experimental +features, print its status during module load, like so: + + Btrfs loaded, experimental=on, debug=on, assert=on, zoned=yes, fsverity=yes + +Signed-off-by: Anand Jain +--- + fs/btrfs/super.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index c64d0713412231..4742bb2af601a7 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -2468,6 +2468,9 @@ static __cold void btrfs_interface_exit(void) + static int __init btrfs_print_mod_info(void) + { + static const char options[] = "" ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ ", experimental=on" ++#endif + #ifdef CONFIG_BTRFS_DEBUG + ", debug=on" + #endif + +From f1522b88884c49c686f0a1ff80a852c3791d197c Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:38 +0800 +Subject: [PATCH 16/18] btrfs: enable RAID1 balancing configuration via + modprobe parameter + +This update allows configuring the `raid1-balancing` methods using a +modprobe parameter when experimental mode CONFIG_BTRFS_EXPERIMENTAL +is enabled. 
+ +Examples: + +- Set the RAID1 balancing method to round-robin with a custom +`min_contiguous_read` of 4k: + $ modprobe btrfs raid1-balancing=round-robin:4096 + +- Set the round-robin balancing method with the default +`min_contiguous_read`: + $ modprobe btrfs raid1-balancing=round-robin + +- Set the `devid` balancing method, defaulting to the latest +device: + $ modprobe btrfs raid1-balancing=devid + +Signed-off-by: Anand Jain +--- + fs/btrfs/super.c | 5 +++++ + fs/btrfs/sysfs.c | 30 +++++++++++++++++++++++++++++- + fs/btrfs/sysfs.h | 5 +++++ + fs/btrfs/volumes.c | 14 +++++++++++++- + 4 files changed, 52 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 4742bb2af601a7..ae0fe3ed33fbce 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -2549,6 +2549,11 @@ static const struct init_sequence mod_init_seq[] = { + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ }, { ++ .init_func = btrfs_raid1_balancing_init, ++ .exit_func = NULL, ++#endif + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index fb0bb7d830b8e8..c8f2d625568b5d 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1313,7 +1313,21 @@ static const char *btrfs_read_policy_name[] = { + #endif + }; + +-static int btrfs_read_policy_to_enum(const char *str, s64 *value) ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++/* Global module configuration parameters */ ++static char *raid1_balancing; ++char *btrfs_get_raid1_balancing(void) ++{ ++ return raid1_balancing; ++} ++ ++/* Set perm 0, disable sys/module/btrfs/parameter/raid1_balancing interface */ ++module_param(raid1_balancing, charp, 0); ++MODULE_PARM_DESC(raid1_balancing, ++"Global read policy; pid (default), round-robin[:min_contiguous_read], devid[[:devid]|[:latest-gen]|[:oldest-gen]]"); ++#endif ++ ++int btrfs_read_policy_to_enum(const char *str, s64 *value) + { + char param[32] = 
{'\0'}; + char *__maybe_unused value_str; +@@ -1336,6 +1350,20 @@ static int btrfs_read_policy_to_enum(const char *str, s64 *value) + return sysfs_match_string(btrfs_read_policy_name, param); + } + ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++int __init btrfs_raid1_balancing_init(void) ++{ ++ s64 value; ++ ++ if (btrfs_read_policy_to_enum(raid1_balancing, &value) == -EINVAL) { ++ btrfs_err(NULL, "Invalid raid1_balancing %s", raid1_balancing); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ + static ssize_t btrfs_read_policy_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) + { +diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h +index e6a284c59809c9..e97d383b9ffcd4 100644 +--- a/fs/btrfs/sysfs.h ++++ b/fs/btrfs/sysfs.h +@@ -47,5 +47,10 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); + int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); + void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); ++int btrfs_read_policy_to_enum(const char *str, s64 *value); ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++int __init btrfs_raid1_balancing_init(void); ++char *btrfs_get_raid1_balancing(void); ++#endif + + #endif +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 4037cd98c453eb..cbd763d2104c01 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1206,6 +1206,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device; + struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; ++ s64 __maybe_unused value = 0; + int ret = 0; + + /* Initialize the in-memory record of filesystem read count */ +@@ -1240,10 +1241,21 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, + fs_devices->latest_dev = latest_dev; + fs_devices->total_rw_bytes = 0; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +- fs_devices->read_policy = BTRFS_READ_POLICY_PID; + #ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contiguous_read = 
BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; + fs_devices->read_devid = latest_dev->devid; ++ fs_devices->read_policy = ++ btrfs_read_policy_to_enum(btrfs_get_raid1_balancing(), &value); ++ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) ++ fs_devices->fs_stats = true; ++ if (value) { ++ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) ++ fs_devices->rr_min_contiguous_read = value; ++ if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) ++ fs_devices->read_devid = value; ++ } ++#else ++ fs_devices->read_policy = BTRFS_READ_POLICY_PID; + #endif + + return 0; + +From 74b431b2bcc285ae3fda9676588ba767e191a71c Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 2 Jan 2025 02:06:39 +0800 +Subject: [PATCH 17/18] btrfs: modload to print RAID1 balancing status + +Modified the Btrfs loading message to include the RAID1 balancing status +if the experimental feature is enabled. + +Signed-off-by: Anand Jain +--- + fs/btrfs/super.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index ae0fe3ed33fbce..4e73613a1b00c1 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -2491,7 +2491,17 @@ static int __init btrfs_print_mod_info(void) + ", fsverity=no" + #endif + ; ++ ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++ if (btrfs_get_raid1_balancing() == NULL) ++ pr_info("Btrfs loaded%s\n", options); ++ else ++ pr_info("Btrfs loaded%s, raid1_balancing=%s\n", ++ options, btrfs_get_raid1_balancing()); ++#else + pr_info("Btrfs loaded%s\n", options); ++#endif ++ + return 0; + } + + +From 419062795fa680b30b727a0dc3338874b81711ed Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Fri, 11 Oct 2024 10:49:17 +0800 +Subject: [PATCH 18/18] btrfs: use the path with the lowest latency for RAID1 + reads + +This feature aims to direct the read I/O to the device with the lowest +known latency for reading RAID1 blocks. 
+ +echo "latency" > /sys/fs/btrfs/<UUID>/read_policy + +Co-authored-by: Kai Krakow +Signed-off-by: Anand Jain +--- + fs/btrfs/sysfs.c | 3 ++- + fs/btrfs/volumes.c | 36 ++++++++++++++++++++++++++++++++++++ + fs/btrfs/volumes.h | 2 ++ + 3 files changed, 40 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index c8f2d625568b5d..0e616078ae8107 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = { + "pid", + #ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", ++ "latency", + "devid", + #endif + }; +@@ -1324,7 +1325,7 @@ char *btrfs_get_raid1_balancing(void) + /* Set perm 0, disable sys/module/btrfs/parameter/raid1_balancing interface */ + module_param(raid1_balancing, charp, 0); + MODULE_PARM_DESC(raid1_balancing, +-"Global read policy; pid (default), round-robin[:min_contiguous_read], devid[[:devid]|[:latest-gen]|[:oldest-gen]]"); ++"Global read policy; pid (default), round-robin[:min_contiguous_read], latency, devid[[:devid]|[:latest-gen]|[:oldest-gen]]"); + #endif + + int btrfs_read_policy_to_enum(const char *str, s64 *value) +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index cbd763d2104c01..ab6c15952a9d47 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -12,6 +12,9 @@ + #include + #include + #include ++#ifdef CONFIG_BTRFS_EXPERIMENTAL ++#include ++#endif + #include "misc.h" + #include "ctree.h" + #include "disk-io.h" +@@ -6010,6 +6013,35 @@ static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, + return first; + } + ++static int btrfs_best_stripe(struct btrfs_fs_info *fs_info, ++ struct btrfs_chunk_map *map, int first, ++ int num_stripe) ++{ ++ u64 best_wait = U64_MAX; ++ int best_stripe = 0; ++ int index; ++ ++ for (index = first; index < first + num_stripe; index++) { ++ u64 read_wait; ++ u64 avg_wait = 0; ++ unsigned long read_ios; ++ struct btrfs_device *device = map->stripes[index].dev; ++ ++ read_wait =
part_stat_read(device->bdev, nsecs[READ]); ++ read_ios = part_stat_read(device->bdev, ios[READ]); ++ ++ if (read_wait && read_ios && read_wait >= read_ios) ++ avg_wait = div_u64(read_wait, read_ios); ++ ++ if (best_wait > avg_wait) { ++ best_wait = avg_wait; ++ best_stripe = index; ++ } ++ } ++ ++ return best_stripe; ++} ++ + struct stripe_mirror { + u64 devid; + int num; +@@ -6109,6 +6141,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; ++ case BTRFS_READ_POLICY_LATENCY: ++ preferred_mirror = btrfs_best_stripe(fs_info, map, first, ++ num_stripes); ++ break; + #endif + } + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 4a8ae242ad6feb..cbd951d7f1dab6 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -308,6 +308,8 @@ enum btrfs_read_policy { + #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing raid1 reads across all striped devices (round-robin) */ + BTRFS_READ_POLICY_RR, ++ /* Use the lowest-latency device dynamically */ ++ BTRFS_READ_POLICY_LATENCY, + /* Read from the specific device */ + BTRFS_READ_POLICY_DEVID, + #endif