2071 lines
66 KiB
Diff
2071 lines
66 KiB
Diff
From 5e49c78f38cc7f5b7ec012021c8422c1db98ef7e Mon Sep 17 00:00:00 2001
|
|
From: Goffredo Baroncelli <kreijack@inwind.it>
|
|
Date: Sun, 24 Oct 2021 17:31:04 +0200
|
|
Subject: [PATCH 01/25] btrfs: add flags to give a hint to the chunk allocator
|
|
|
|
Add the following flags to give a hint about which chunk should be
|
|
allocated on which disk.
|
|
The following flags are created:
|
|
|
|
- BTRFS_DEV_ALLOCATION_PREFERRED_DATA
|
|
preferred data chunk, but metadata chunk allowed
|
|
- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA
|
|
preferred metadata chunk, but data chunk allowed
|
|
- BTRFS_DEV_ALLOCATION_METADATA_ONLY
|
|
only metadata chunk allowed
|
|
- BTRFS_DEV_ALLOCATION_DATA_ONLY
|
|
only data chunk allowed
|
|
|
|
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
|
|
---
|
|
include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++
|
|
1 file changed, 14 insertions(+)
|
|
|
|
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
|
|
index fc29d273845d84..71c6135dc7cfb2 100644
|
|
--- a/include/uapi/linux/btrfs_tree.h
|
|
+++ b/include/uapi/linux/btrfs_tree.h
|
|
@@ -578,6 +578,20 @@ struct btrfs_node {
|
|
struct btrfs_key_ptr ptrs[];
|
|
} __attribute__ ((__packed__));
|
|
|
|
+/* dev_item.type */
|
|
+
|
|
+/* btrfs chunk allocation hints */
|
|
+#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3
|
|
+/* preferred data chunk, but metadata chunk allowed */
|
|
+#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL)
|
|
+/* preferred metadata chunk, but data chunk allowed */
|
|
+#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL)
|
|
+/* only metadata chunk are allowed */
|
|
+#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL)
|
|
+/* only data chunk allowed */
|
|
+#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL)
|
|
+/* 5..7 are unused values */
|
|
+
|
|
struct btrfs_dev_item {
|
|
/* the internal btrfs device id */
|
|
__le64 devid;
|
|
|
|
From 160344ae9ae37b32593adc43716172c37b0a734c Mon Sep 17 00:00:00 2001
|
|
From: Goffredo Baroncelli <kreijack@inwind.it>
|
|
Date: Sun, 24 Oct 2021 17:31:05 +0200
|
|
Subject: [PATCH 02/25] btrfs: export dev_item.type in
|
|
/sys/fs/btrfs/<uuid>/devinfo/<devid>/type
|
|
|
|
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
|
|
---
|
|
fs/btrfs/sysfs.c | 11 +++++++++++
|
|
1 file changed, 11 insertions(+)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index 03926ad467c919..fe07a7cbcf74c4 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1972,6 +1972,16 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
|
|
}
|
|
BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
|
|
|
|
+static ssize_t btrfs_devinfo_type_show(struct kobject *kobj,
|
|
+ struct kobj_attribute *a, char *buf)
|
|
+{
|
|
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
|
|
+ devid_kobj);
|
|
+
|
|
+ return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type);
|
|
+}
|
|
+BTRFS_ATTR(devid, type, btrfs_devinfo_type_show);
|
|
+
|
|
/*
|
|
* Information about one device.
|
|
*
|
|
@@ -1985,6 +1995,7 @@ static struct attribute *devid_attrs[] = {
|
|
BTRFS_ATTR_PTR(devid, replace_target),
|
|
BTRFS_ATTR_PTR(devid, scrub_speed_max),
|
|
BTRFS_ATTR_PTR(devid, writeable),
|
|
+ BTRFS_ATTR_PTR(devid, type),
|
|
NULL
|
|
};
|
|
ATTRIBUTE_GROUPS(devid);
|
|
|
|
From 29637f2e3a69fe77a8097bd772a8a7803b9ec576 Mon Sep 17 00:00:00 2001
|
|
From: Goffredo Baroncelli <kreijack@inwind.it>
|
|
Date: Sun, 24 Oct 2021 17:31:06 +0200
|
|
Subject: [PATCH 03/25] btrfs: change the DEV_ITEM 'type' field via sysfs
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/sysfs.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-
|
|
fs/btrfs/volumes.c | 2 +-
|
|
fs/btrfs/volumes.h | 2 ++
|
|
3 files changed, 58 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index fe07a7cbcf74c4..3675d961b39a2a 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1980,7 +1980,61 @@ static ssize_t btrfs_devinfo_type_show(struct kobject *kobj,
|
|
|
|
return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type);
|
|
}
|
|
-BTRFS_ATTR(devid, type, btrfs_devinfo_type_show);
|
|
+
|
|
+static ssize_t btrfs_devinfo_type_store(struct kobject *kobj,
|
|
+ struct kobj_attribute *a,
|
|
+ const char *buf, size_t len)
|
|
+{
|
|
+ struct btrfs_fs_info *fs_info;
|
|
+ struct btrfs_root *root;
|
|
+ struct btrfs_device *device;
|
|
+ int ret;
|
|
+ struct btrfs_trans_handle *trans;
|
|
+
|
|
+ u64 type, prev_type;
|
|
+
|
|
+ device = container_of(kobj, struct btrfs_device, devid_kobj);
|
|
+ fs_info = device->fs_info;
|
|
+ if (!fs_info)
|
|
+ return -EPERM;
|
|
+
|
|
+ root = fs_info->chunk_root;
|
|
+ if (sb_rdonly(fs_info->sb))
|
|
+ return -EROFS;
|
|
+
|
|
+ ret = kstrtou64(buf, 0, &type);
|
|
+ if (ret < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ /* for now, allow to touch only the 'allocation hint' bits */
|
|
+ if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1))
|
|
+ return -EINVAL;
|
|
+
|
|
+ trans = btrfs_start_transaction(root, 1);
|
|
+ if (IS_ERR(trans))
|
|
+ return PTR_ERR(trans);
|
|
+
|
|
+ prev_type = device->type;
|
|
+ device->type = type;
|
|
+
|
|
+ ret = btrfs_update_device(trans, device);
|
|
+
|
|
+ if (ret < 0) {
|
|
+ btrfs_abort_transaction(trans, ret);
|
|
+ btrfs_end_transaction(trans);
|
|
+ goto abort;
|
|
+ }
|
|
+
|
|
+ ret = btrfs_commit_transaction(trans);
|
|
+ if (ret < 0)
|
|
+ goto abort;
|
|
+
|
|
+ return len;
|
|
+abort:
|
|
+ device->type = prev_type;
|
|
+ return ret;
|
|
+}
|
|
+BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store);
|
|
|
|
/*
|
|
* Information about one device.
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index eb51b609190fb5..620a9ea74e7558 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -2882,7 +2882,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
|
|
return ret;
|
|
}
|
|
|
|
-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
|
|
+noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
|
|
struct btrfs_device *device)
|
|
{
|
|
int ret;
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index 4481575dd70f35..7bb14d51bffc58 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -836,6 +836,8 @@ int btrfs_bg_type_to_factor(u64 flags);
|
|
const char *btrfs_bg_type_to_raid_name(u64 flags);
|
|
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
|
|
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
|
|
+int btrfs_update_device(struct btrfs_trans_handle *trans,
|
|
+ struct btrfs_device *device);
|
|
|
|
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
|
|
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
|
|
|
|
From 970b99e160487e9765b6e7db9f8a89a96ce79811 Mon Sep 17 00:00:00 2001
|
|
From: Goffredo Baroncelli <kreijack@inwind.it>
|
|
Date: Sun, 24 Oct 2021 17:31:07 +0200
|
|
Subject: [PATCH 04/25] btrfs: add allocator_hint mode
|
|
|
|
When this mode is enabled, the chunk allocation policy is modified as
|
|
follows.
|
|
|
|
Each disk may have a different tag:
|
|
- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA
|
|
- BTRFS_DEV_ALLOCATION_METADATA_ONLY
|
|
- BTRFS_DEV_ALLOCATION_DATA_ONLY
|
|
- BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default)
|
|
|
|
Where:
|
|
- ALLOCATION_PREFERRED_X means that it is preferred to use this disk for
|
|
the X chunk type (the other type may be allowed when the space is low)
|
|
- ALLOCATION_X_ONLY means that it is used *only* for the X chunk type.
|
|
This means also that it is a preferred choice.
|
|
|
|
Each time the allocator allocates a chunk of type X , first it takes the
|
|
disks tagged as ALLOCATION_X_ONLY or ALLOCATION_PREFERRED_X; if the space
|
|
is not enough, it uses also the disks tagged as ALLOCATION_METADATA_ONLY;
|
|
if the space is not enough, it uses also the other disks, with the
|
|
exception of the one marked as ALLOCATION_PREFERRED_Y, where Y is the other
|
|
type of chunk (i.e. not X).
|
|
|
|
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
|
|
---
|
|
fs/btrfs/volumes.c | 97 +++++++++++++++++++++++++++++++++++++++++++++-
|
|
fs/btrfs/volumes.h | 1 +
|
|
2 files changed, 97 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index 620a9ea74e7558..e66700fc8dcd4e 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -184,6 +184,19 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
|
|
return BTRFS_BG_FLAG_TO_INDEX(profile);
|
|
}
|
|
|
|
+#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \
|
|
+ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)
|
|
+#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \
|
|
+ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT)
|
|
+
|
|
+static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
|
|
+ [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1,
|
|
+ [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
|
|
+ [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
|
|
+ [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2,
|
|
+ /* the other values are set to 0 */
|
|
+};
|
|
+
|
|
const char *btrfs_bg_type_to_raid_name(u64 flags)
|
|
{
|
|
const int index = btrfs_bg_flags_to_raid_index(flags);
|
|
@@ -5022,13 +5035,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
|
|
}
|
|
|
|
/*
|
|
- * sort the devices in descending order by max_avail, total_avail
|
|
+ * sort the devices in descending order by alloc_hint,
|
|
+ * max_avail, total_avail
|
|
*/
|
|
static int btrfs_cmp_device_info(const void *a, const void *b)
|
|
{
|
|
const struct btrfs_device_info *di_a = a;
|
|
const struct btrfs_device_info *di_b = b;
|
|
|
|
+ if (di_a->alloc_hint > di_b->alloc_hint)
|
|
+ return -1;
|
|
+ if (di_a->alloc_hint < di_b->alloc_hint)
|
|
+ return 1;
|
|
if (di_a->max_avail > di_b->max_avail)
|
|
return -1;
|
|
if (di_a->max_avail < di_b->max_avail)
|
|
@@ -5181,6 +5199,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
|
|
int ndevs = 0;
|
|
u64 max_avail;
|
|
u64 dev_offset;
|
|
+ int hint;
|
|
+ int i;
|
|
|
|
/*
|
|
* in the first pass through the devices list, we gather information
|
|
@@ -5233,16 +5253,91 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
|
|
devices_info[ndevs].max_avail = max_avail;
|
|
devices_info[ndevs].total_avail = total_avail;
|
|
devices_info[ndevs].dev = device;
|
|
+
|
|
+ if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
|
|
+ (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) {
|
|
+ /*
|
|
+ * if mixed bg set all the alloc_hint
|
|
+ * fields to the same value, so the sorting
|
|
+ * is not affected
|
|
+ */
|
|
+ devices_info[ndevs].alloc_hint = 0;
|
|
+ } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
|
|
+ hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
|
|
+
|
|
+ /*
|
|
+ * skip BTRFS_DEV_METADATA_ONLY disks
|
|
+ */
|
|
+ if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY)
|
|
+ continue;
|
|
+ /*
|
|
+ * if a data chunk must be allocated,
|
|
+ * sort also by hint (data disk
|
|
+ * higher priority)
|
|
+ */
|
|
+ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
|
|
+ } else { /* BTRFS_BLOCK_GROUP_METADATA */
|
|
+ hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
|
|
+
|
|
+ /*
|
|
+ * skip BTRFS_DEV_DATA_ONLY disks
|
|
+ */
|
|
+ if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY)
|
|
+ continue;
|
|
+ /*
|
|
+ * if a data chunk must be allocated,
|
|
+ * sort also by hint (metadata hint
|
|
+ * higher priority)
|
|
+ */
|
|
+ devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
|
|
+ }
|
|
+
|
|
++ndevs;
|
|
}
|
|
ctl->ndevs = ndevs;
|
|
|
|
+ /*
|
|
+ * no devices available
|
|
+ */
|
|
+ if (!ndevs)
|
|
+ return 0;
|
|
+
|
|
/*
|
|
* now sort the devices by hole size / available space
|
|
*/
|
|
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
|
|
btrfs_cmp_device_info, NULL);
|
|
|
|
+ /*
|
|
+ * select the minimum set of disks grouped by hint that
|
|
+ * can host the chunk
|
|
+ */
|
|
+ ndevs = 0;
|
|
+ while (ndevs < ctl->ndevs) {
|
|
+ hint = devices_info[ndevs++].alloc_hint;
|
|
+ while (ndevs < ctl->ndevs &&
|
|
+ devices_info[ndevs].alloc_hint == hint)
|
|
+ ndevs++;
|
|
+ if (ndevs >= ctl->devs_min)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ BUG_ON(ndevs > ctl->ndevs);
|
|
+ ctl->ndevs = ndevs;
|
|
+
|
|
+ /*
|
|
+ * the next layers require the devices_info ordered by
|
|
+ * max_avail. If we are returing two (or more) different
|
|
+ * group of alloc_hint, this is not always true. So sort
|
|
+ * these gain.
|
|
+ */
|
|
+
|
|
+ for (i = 0 ; i < ndevs ; i++)
|
|
+ devices_info[i].alloc_hint = 0;
|
|
+
|
|
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
|
|
+ btrfs_cmp_device_info, NULL);
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index 7bb14d51bffc58..f3c5437e270a22 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -565,6 +565,7 @@ struct btrfs_device_info {
|
|
u64 dev_offset;
|
|
u64 max_avail;
|
|
u64 total_avail;
|
|
+ int alloc_hint;
|
|
};
|
|
|
|
struct btrfs_raid_attr {
|
|
|
|
From 1c1f2e27d3055b7721468c6980479a043f48e2b3 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kk@netactive.de>
|
|
Date: Thu, 27 Jun 2024 20:05:58 +0200
|
|
Subject: [PATCH 05/25] btrfs: add allocator_hint for no allocation preferred
|
|
|
|
This is useful where you want to prevent new allocations of chunks on a
|
|
disk which is going to be removed from the pool anyways, e.g. due to bad
|
|
blocks or because it's slow.
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 6 +++++-
|
|
include/uapi/linux/btrfs_tree.h | 2 ++
|
|
2 files changed, 7 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index e66700fc8dcd4e..c6aa93fae9aa65 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -194,6 +194,7 @@ static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
|
|
[BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
|
|
[BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
|
|
[BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2,
|
|
+ [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99,
|
|
/* the other values are set to 0 */
|
|
};
|
|
|
|
@@ -5289,7 +5290,10 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
|
|
* sort also by hint (metadata hint
|
|
* higher priority)
|
|
*/
|
|
- devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
|
|
+ if (hint == BTRFS_DEV_ALLOCATION_PREFERRED_NONE)
|
|
+ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
|
|
+ else
|
|
+ devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
|
|
}
|
|
|
|
++ndevs;
|
|
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
|
|
index 71c6135dc7cfb2..92bcc59b129a97 100644
|
|
--- a/include/uapi/linux/btrfs_tree.h
|
|
+++ b/include/uapi/linux/btrfs_tree.h
|
|
@@ -590,6 +590,8 @@ struct btrfs_node {
|
|
#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL)
|
|
/* only data chunk allowed */
|
|
#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL)
|
|
+/* preferred no chunk, but chunks allowed */
|
|
+#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL)
|
|
/* 5..7 are unused values */
|
|
|
|
struct btrfs_dev_item {
|
|
|
|
From 82553effe6b655f97478b6d13df7ab0ecc192e58 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Fri, 6 Dec 2024 00:55:31 +0100
|
|
Subject: [PATCH 06/25] btrfs: add allocator_hint to disable allocation
|
|
completely
|
|
|
|
This is useful where you want to prevent new allocations of chunks to
|
|
a set of multiple disks which are going to be removed from the pool.
|
|
This acts as a multiple `btrfs dev remove` on steroids that can remove
|
|
multiple disks in parallel without moving data to disks which would be
|
|
removed in the next round. In such cases, it will avoid moving the
|
|
same data multiple times, and thus avoid placing it on potentially bad
|
|
disks.
|
|
|
|
Thanks to @Zygo for the explanation and suggestion.
|
|
|
|
Link: https://github.com/kdave/btrfs-progs/issues/907#issuecomment-2520897104
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 11 +++++++++++
|
|
include/uapi/linux/btrfs_tree.h | 4 +++-
|
|
2 files changed, 14 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index c6aa93fae9aa65..99d2c60ac2bf3e 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -190,6 +190,7 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
|
|
BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT)
|
|
|
|
static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
|
|
+ [BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99,
|
|
[BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1,
|
|
[BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
|
|
[BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
|
|
@@ -5271,6 +5272,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
|
|
*/
|
|
if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY)
|
|
continue;
|
|
+ /*
|
|
+ * skip BTRFS_DEV_NONE_ONLY disks
|
|
+ */
|
|
+ if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY)
|
|
+ continue;
|
|
/*
|
|
* if a data chunk must be allocated,
|
|
* sort also by hint (data disk
|
|
@@ -5285,6 +5291,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
|
|
*/
|
|
if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY)
|
|
continue;
|
|
+ /*
|
|
+ * skip BTRFS_DEV_NONE_ONLY disks
|
|
+ */
|
|
+ if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY)
|
|
+ continue;
|
|
/*
|
|
* if a data chunk must be allocated,
|
|
* sort also by hint (metadata hint
|
|
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
|
|
index 92bcc59b129a97..3db20734aacfc6 100644
|
|
--- a/include/uapi/linux/btrfs_tree.h
|
|
+++ b/include/uapi/linux/btrfs_tree.h
|
|
@@ -592,7 +592,9 @@ struct btrfs_node {
|
|
#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL)
|
|
/* preferred no chunk, but chunks allowed */
|
|
#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL)
|
|
-/* 5..7 are unused values */
|
|
+/* no chunks allowed */
|
|
+#define BTRFS_DEV_ALLOCATION_NONE_ONLY (5ULL)
|
|
+/* 6..7 are unused values */
|
|
|
|
struct btrfs_dev_item {
|
|
/* the internal btrfs device id */
|
|
|
|
From 10248db4c682397c83b99daa2de4ee0e587c0be2 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:31 +0800
|
|
Subject: [PATCH 07/25] btrfs: simplify output formatting in
|
|
btrfs_read_policy_show
|
|
|
|
Refactor the logic in btrfs_read_policy_show() to streamline the
|
|
formatting of read policies output. Streamline the space and bracket
|
|
handling around the active policy without altering the functional output.
|
|
This is in preparation to add more methods.
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/sysfs.c | 18 ++++++++++--------
|
|
1 file changed, 10 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index 3675d961b39a2a..cde47f1c11757f 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1316,14 +1316,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
|
int i;
|
|
|
|
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
|
|
- if (policy == i)
|
|
- ret += sysfs_emit_at(buf, ret, "%s[%s]",
|
|
- (ret == 0 ? "" : " "),
|
|
- btrfs_read_policy_name[i]);
|
|
- else
|
|
- ret += sysfs_emit_at(buf, ret, "%s%s",
|
|
- (ret == 0 ? "" : " "),
|
|
- btrfs_read_policy_name[i]);
|
|
+ if (ret != 0)
|
|
+ ret += sysfs_emit_at(buf, ret, " ");
|
|
+
|
|
+ if (i == policy)
|
|
+ ret += sysfs_emit_at(buf, ret, "[");
|
|
+
|
|
+ ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
|
|
+
|
|
+ if (i == policy)
|
|
+ ret += sysfs_emit_at(buf, ret, "]");
|
|
}
|
|
|
|
ret += sysfs_emit_at(buf, ret, "\n");
|
|
|
|
From 4a49a279c14d9003fd7d4865706bc78142bf1645 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:30 +0800
|
|
Subject: [PATCH 08/25] btrfs: initialize fs_devices->fs_info earlier
|
|
|
|
Currently, fs_devices->fs_info is initialized in btrfs_init_devices_late(),
|
|
but this occurs too late for find_live_mirror(), which is invoked by
|
|
load_super_root() much earlier than btrfs_init_devices_late().
|
|
|
|
Fix this by moving the initialization to open_ctree(), before load_super_root().
|
|
|
|
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/disk-io.c | 1 +
|
|
fs/btrfs/volumes.c | 2 --
|
|
2 files changed, 1 insertion(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
|
|
index b11bfe68dd65fb..a4d2c5bcd93c52 100644
|
|
--- a/fs/btrfs/disk-io.c
|
|
+++ b/fs/btrfs/disk-io.c
|
|
@@ -3324,6 +3324,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
|
|
fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
|
|
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
|
|
fs_info->stripesize = stripesize;
|
|
+ fs_info->fs_devices->fs_info = fs_info;
|
|
|
|
/*
|
|
* Handle the space caching options appropriately now that we have the
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index 99d2c60ac2bf3e..21cc02df8edf06 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -7577,8 +7577,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
|
|
struct btrfs_device *device;
|
|
int ret = 0;
|
|
|
|
- fs_devices->fs_info = fs_info;
|
|
-
|
|
mutex_lock(&fs_devices->device_list_mutex);
|
|
list_for_each_entry(device, &fs_devices->devices, dev_list)
|
|
device->fs_info = fs_info;
|
|
|
|
From ccb29226710d52abbd737fd0b2f438022c045af4 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:32 +0800
|
|
Subject: [PATCH 09/25] btrfs: add btrfs_read_policy_to_enum helper and
|
|
refactor read policy store
|
|
|
|
Introduce the `btrfs_read_policy_to_enum` helper function to simplify the
|
|
conversion of a string read policy to its corresponding enum value. This
|
|
reduces duplication and improves code clarity in `btrfs_read_policy_store`.
|
|
The `btrfs_read_policy_store` function has been refactored to use the new
|
|
helper.
|
|
|
|
The parameter is copied locally to allow modification, enabling the
|
|
separation of the method and its value. This prepares for the addition of
|
|
more functionality in subsequent patches.
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++------------
|
|
1 file changed, 22 insertions(+), 12 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index cde47f1c11757f..8540af0807648e 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1307,6 +1307,18 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
|
|
|
|
static const char * const btrfs_read_policy_name[] = { "pid" };
|
|
|
|
+static int btrfs_read_policy_to_enum(const char *str)
|
|
+{
|
|
+ char param[32] = {'\0'};
|
|
+
|
|
+ if (!str || strlen(str) == 0)
|
|
+ return 0;
|
|
+
|
|
+ strncpy(param, str, sizeof(param) - 1);
|
|
+
|
|
+ return sysfs_match_string(btrfs_read_policy_name, param);
|
|
+}
|
|
+
|
|
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
|
struct kobj_attribute *a, char *buf)
|
|
{
|
|
@@ -1338,21 +1350,19 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
|
const char *buf, size_t len)
|
|
{
|
|
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
|
|
- int i;
|
|
+ int index;
|
|
|
|
- for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
|
|
- if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
|
|
- if (i != READ_ONCE(fs_devices->read_policy)) {
|
|
- WRITE_ONCE(fs_devices->read_policy, i);
|
|
- btrfs_info(fs_devices->fs_info,
|
|
- "read policy set to '%s'",
|
|
- btrfs_read_policy_name[i]);
|
|
- }
|
|
- return len;
|
|
- }
|
|
+ index = btrfs_read_policy_to_enum(buf);
|
|
+ if (index < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (index != READ_ONCE(fs_devices->read_policy)) {
|
|
+ WRITE_ONCE(fs_devices->read_policy, index);
|
|
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
|
|
+ btrfs_read_policy_name[index]);
|
|
}
|
|
|
|
- return -EINVAL;
|
|
+ return len;
|
|
}
|
|
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
|
|
|
|
|
|
From cf73e9084375ab73182d3a2d510e878a137a9664 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:34 +0800
|
|
Subject: [PATCH 10/25] btrfs: add tracking of read blocks for read policy
|
|
|
|
Add fs_devices::read_cnt_blocks to track read blocks, initialize it in
|
|
open_fs_devices() and clean it up in close_fs_devices().
|
|
btrfs_submit_dev_bio() increments it for reads when stats tracking is
|
|
enabled. Stats tracking is disabled by default and is enabled through
|
|
fs_devices::fs_stats when required.
|
|
|
|
The code is not under the EXPERIMENTAL define, as stats can be expanded
|
|
to include write counts and other performance counters, with the user
|
|
interface independent of its internal use.
|
|
|
|
This is an in-memory-only feature, different to the dev error stats.
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/bio.c | 8 ++++++++
|
|
fs/btrfs/disk-io.c | 5 +++++
|
|
fs/btrfs/fs.h | 3 +++
|
|
fs/btrfs/volumes.c | 2 +-
|
|
fs/btrfs/volumes.h | 4 +++-
|
|
5 files changed, 20 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
|
|
index 7e0f9600b80c43..7583a9b74e22b1 100644
|
|
--- a/fs/btrfs/bio.c
|
|
+++ b/fs/btrfs/bio.c
|
|
@@ -450,6 +450,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
|
|
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
|
|
dev->devid, bio->bi_iter.bi_size);
|
|
|
|
+ /*
|
|
+ * Track reads if tracking is enabled; ignore I/O operations before
|
|
+ * fully initialized.
|
|
+ */
|
|
+ if (dev->fs_devices->fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
|
|
+ percpu_counter_add(&dev->fs_info->stats_read_blocks,
|
|
+ bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
|
|
+
|
|
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
|
|
blkcg_punt_bio_submit(bio);
|
|
else
|
|
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
|
|
index a4d2c5bcd93c52..277490cc5ae24d 100644
|
|
--- a/fs/btrfs/disk-io.c
|
|
+++ b/fs/btrfs/disk-io.c
|
|
@@ -1259,6 +1259,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
|
|
{
|
|
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
|
|
|
|
+ percpu_counter_destroy(&fs_info->stats_read_blocks);
|
|
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
|
|
percpu_counter_destroy(&fs_info->delalloc_bytes);
|
|
percpu_counter_destroy(&fs_info->ordered_bytes);
|
|
@@ -2858,6 +2859,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
|
|
if (ret)
|
|
return ret;
|
|
|
|
+ ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
fs_info->dirty_metadata_batch = PAGE_SIZE *
|
|
(1 + ilog2(nr_cpu_ids));
|
|
|
|
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
|
|
index 79f64e383eddf8..8960e141886b3e 100644
|
|
--- a/fs/btrfs/fs.h
|
|
+++ b/fs/btrfs/fs.h
|
|
@@ -625,6 +625,9 @@ struct btrfs_fs_info {
|
|
struct kobject *qgroups_kobj;
|
|
struct kobject *discard_kobj;
|
|
|
|
+ /* Track the number of blocks (sectors) read by the filesystem. */
|
|
+ struct percpu_counter stats_read_blocks;
|
|
+
|
|
/* Used to keep from writing metadata until there is a nice batch */
|
|
struct percpu_counter dirty_metadata_bytes;
|
|
struct percpu_counter delalloc_bytes;
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index 21cc02df8edf06..df4dfdfce22a52 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -7678,7 +7678,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
|
|
list_for_each_entry(device, &fs_devices->devices, dev_list) {
|
|
ret = btrfs_device_init_dev_stats(device, path);
|
|
if (ret)
|
|
- goto out;
|
|
+ return ret;
|
|
}
|
|
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
|
|
list_for_each_entry(device, &seed_devs->devices, dev_list) {
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index f3c5437e270a22..91a2358b74c91f 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -185,7 +185,7 @@ struct btrfs_device {
|
|
* enum btrfs_dev_stat_values in ioctl.h */
|
|
int dev_stats_valid;
|
|
|
|
- /* Counter to record the change of device stats */
|
|
+ /* Counter to record of the change of device stats */
|
|
atomic_t dev_stats_ccnt;
|
|
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
|
|
|
|
@@ -417,6 +417,8 @@ struct btrfs_fs_devices {
|
|
bool seeding;
|
|
/* The mount needs to use a randomly generated fsid. */
|
|
bool temp_fsid;
|
|
+ /* Enable/disable the filesystem stats tracking */
|
|
+ bool fs_stats;
|
|
|
|
struct btrfs_fs_info *fs_info;
|
|
/* sysfs kobjects */
|
|
|
|
From 7070070e90e889d165590aa05f02e671d041d12c Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Mon, 16 Sep 2024 18:18:25 +0930
|
|
Subject: [PATCH 11/25] btrfs: introduce CONFIG_BTRFS_EXPERIMENTAL from 6.13
|
|
|
|
CONFIG_BTRFS_EXPERIMENTAL is needed by the RAID1 balancing patches but
|
|
we don't want to use the full scope of the 6.13 patch because it also
|
|
affects features currently masked via CONFIG_BTRFS_DEBUG.
|
|
|
|
TODO: Drop during rebase to 6.13 or later.
|
|
Original-author: Qu Wenruo <wqu@suse.com>
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/Kconfig | 9 +++++++++
|
|
1 file changed, 9 insertions(+)
|
|
|
|
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
|
|
index 4fb925e8c981d8..ead317f1eeb859 100644
|
|
--- a/fs/btrfs/Kconfig
|
|
+++ b/fs/btrfs/Kconfig
|
|
@@ -78,6 +78,15 @@ config BTRFS_ASSERT
|
|
|
|
If unsure, say N.
|
|
|
|
+config BTRFS_EXPERIMENTAL
|
|
+ bool "Btrfs experimental features"
|
|
+ depends on BTRFS_FS
|
|
+ help
|
|
+ Enable experimental features. These features may not be stable enough
|
|
+ for end users. This is meant for btrfs developers only.
|
|
+
|
|
+ If unsure, say N.
|
|
+
|
|
config BTRFS_FS_REF_VERIFY
|
|
bool "Btrfs with the ref verify tool compiled in"
|
|
depends on BTRFS_FS
|
|
|
|
From 3efa6c755e4ae0dc36f606b329b10587f24dcab3 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:33 +0800
|
|
Subject: [PATCH 12/25] btrfs: handle value associated with read policy
|
|
parameter
|
|
|
|
This change enables specifying additional configuration values alongside
|
|
the read policy in a single input string.
|
|
|
|
Updated btrfs_read_policy_to_enum() to parse and handle a value associated
|
|
with the policy in the format `policy:value`, the value part if present is
|
|
converted to a 64-bit integer. Update btrfs_read_policy_store() to accommodate
|
|
the new parameter.
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/sysfs.c | 16 ++++++++++++++--
|
|
1 file changed, 14 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index 8540af0807648e..b0e624c0598f48 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1307,15 +1307,26 @@ BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
|
|
|
|
static const char * const btrfs_read_policy_name[] = { "pid" };
|
|
|
|
-static int btrfs_read_policy_to_enum(const char *str)
|
|
+static int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
{
|
|
char param[32] = {'\0'};
|
|
+ char *__maybe_unused value_str;
|
|
|
|
if (!str || strlen(str) == 0)
|
|
return 0;
|
|
|
|
strncpy(param, str, sizeof(param) - 1);
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ /* Separate value from input in policy:value format. */
|
|
+ if ((value_str = strchr(param, ':'))) {
|
|
+ *value_str = '\0';
|
|
+ value_str++;
|
|
+ if (value && kstrtou64(value_str, 10, value) != 0)
|
|
+ return -EINVAL;
|
|
+ }
|
|
+#endif
|
|
+
|
|
return sysfs_match_string(btrfs_read_policy_name, param);
|
|
}
|
|
|
|
@@ -1351,8 +1362,9 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
|
{
|
|
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
|
|
int index;
|
|
+ s64 value = -1;
|
|
|
|
- index = btrfs_read_policy_to_enum(buf);
|
|
+ index = btrfs_read_policy_to_enum(buf, &value);
|
|
if (index < 0)
|
|
return -EINVAL;
|
|
|
|
|
|
From 687cdc03a694afb2236c7c87de458c519be771ea Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:35 +0800
|
|
Subject: [PATCH 13/25] btrfs: introduce round-robin read policy
|
|
|
|
This feature balances I/O across the striped devices when reading from
|
|
mirrored blocks.
|
|
|
|
echo round-robin[:min_contig_read] > /sys/fs/btrfs/<uuid>/read_policy
|
|
|
|
The min_contig_read parameter defines the minimum read size before
|
|
switching to the next mirrored device. This setting is optional, with a
|
|
default value of 256KiB.
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/sysfs.c | 49 ++++++++++++++++++++++++++++++-
|
|
fs/btrfs/volumes.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++
|
|
fs/btrfs/volumes.h | 11 +++++++
|
|
3 files changed, 131 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index b0e624c0598f48..f3a696ad122965 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
|
|
}
|
|
BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
|
|
|
|
-static const char * const btrfs_read_policy_name[] = { "pid" };
|
|
+static const char *btrfs_read_policy_name[] = {
|
|
+ "pid",
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ "round-robin",
|
|
+#endif
|
|
+};
|
|
|
|
static int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
{
|
|
@@ -1347,6 +1352,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
|
|
|
ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ if (i == BTRFS_READ_POLICY_RR)
|
|
+ ret += sysfs_emit_at(buf, ret, ":%d",
|
|
+ READ_ONCE(fs_devices->rr_min_contig_read));
|
|
+#endif
|
|
+
|
|
if (i == policy)
|
|
ret += sysfs_emit_at(buf, ret, "]");
|
|
}
|
|
@@ -1368,6 +1379,42 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
|
if (index < 0)
|
|
return -EINVAL;
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ /* If moving out of RR then disable fs_stats */
|
|
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR &&
|
|
+ index != BTRFS_READ_POLICY_RR)
|
|
+ fs_devices->fs_stats = false;
|
|
+
|
|
+ if (index == BTRFS_READ_POLICY_RR) {
|
|
+ if (value != -1) {
|
|
+ u32 sectorsize = fs_devices->fs_info->sectorsize;
|
|
+
|
|
+ if (!IS_ALIGNED(value, sectorsize)) {
|
|
+ u64 temp_value = round_up(value, sectorsize);
|
|
+
|
|
+ btrfs_warn(fs_devices->fs_info,
|
|
+"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
|
|
+ value, sectorsize, temp_value);
|
|
+ value = temp_value;
|
|
+ }
|
|
+ } else {
|
|
+ value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
|
|
+ }
|
|
+
|
|
+ if (index != READ_ONCE(fs_devices->read_policy) ||
|
|
+ value != READ_ONCE(fs_devices->rr_min_contig_read)) {
|
|
+ WRITE_ONCE(fs_devices->read_policy, index);
|
|
+ WRITE_ONCE(fs_devices->rr_min_contig_read, value);
|
|
+
|
|
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
|
|
+ btrfs_read_policy_name[index], value);
|
|
+ }
|
|
+
|
|
+ fs_devices->fs_stats = true;
|
|
+
|
|
+ return len;
|
|
+ }
|
|
+#endif
|
|
if (index != READ_ONCE(fs_devices->read_policy)) {
|
|
WRITE_ONCE(fs_devices->read_policy, index);
|
|
btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index df4dfdfce22a52..e5527ee145c2af 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -1235,6 +1235,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
|
|
fs_devices->total_rw_bytes = 0;
|
|
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
|
|
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
|
|
+#endif
|
|
|
|
return 0;
|
|
}
|
|
@@ -5970,6 +5973,70 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
|
|
return ret;
|
|
}
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+struct stripe_mirror {
|
|
+ u64 devid;
|
|
+ int num;
|
|
+};
|
|
+
|
|
+static int btrfs_cmp_devid(const void *a, const void *b)
|
|
+{
|
|
+ const struct stripe_mirror *s1 = (struct stripe_mirror *)a;
|
|
+ const struct stripe_mirror *s2 = (struct stripe_mirror *)b;
|
|
+
|
|
+ if (s1->devid < s2->devid)
|
|
+ return -1;
|
|
+ if (s1->devid > s2->devid)
|
|
+ return 1;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * btrfs_read_rr.
|
|
+ *
|
|
+ * Select a stripe for reading using a round-robin algorithm:
|
|
+ *
|
|
+ * 1. Compute the read cycle as the total sectors read divided by the minimum
|
|
+ * sectors per device.
|
|
+ * 2. Determine the stripe number for the current read by taking the modulus
|
|
+ * of the read cycle with the total number of stripes:
|
|
+ *
|
|
+ * stripe index = (total sectors / min sectors per dev) % num stripes
|
|
+ *
|
|
+ * The calculated stripe index is then used to select the corresponding device
|
|
+ * from the list of devices, which is ordered by devid.
|
|
+ */
|
|
+static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
|
|
+{
|
|
+ struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
|
|
+ struct btrfs_device *device = map->stripes[first].dev;
|
|
+ struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
|
|
+ int read_cycle;
|
|
+ int index;
|
|
+ int ret_stripe;
|
|
+ int total_reads;
|
|
+ int min_reads_per_dev;
|
|
+
|
|
+ total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
|
|
+ min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
|
|
+ fs_info->sectorsize_bits;
|
|
+
|
|
+ index = 0;
|
|
+ for (int i = first; i < first + num_stripe; i++) {
|
|
+ stripes[index].devid = map->stripes[i].dev->devid;
|
|
+ stripes[index].num = i;
|
|
+ index++;
|
|
+ }
|
|
+ sort(stripes, num_stripe, sizeof(struct stripe_mirror),
|
|
+ btrfs_cmp_devid, NULL);
|
|
+
|
|
+ read_cycle = total_reads / min_reads_per_dev;
|
|
+ ret_stripe = stripes[read_cycle % num_stripe].num;
|
|
+
|
|
+ return ret_stripe;
|
|
+}
|
|
+#endif
|
|
+
|
|
static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
struct btrfs_chunk_map *map, int first,
|
|
int dev_replace_is_ongoing)
|
|
@@ -5999,6 +6066,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
case BTRFS_READ_POLICY_PID:
|
|
preferred_mirror = first + (current->pid % num_stripes);
|
|
break;
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ case BTRFS_READ_POLICY_RR:
|
|
+ preferred_mirror = btrfs_read_rr(map, first, num_stripes);
|
|
+ break;
|
|
+#endif
|
|
}
|
|
|
|
if (dev_replace_is_ongoing &&
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index 91a2358b74c91f..65d56bffc6ef8b 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy {
|
|
BTRFS_CHUNK_ALLOC_ZONED,
|
|
};
|
|
|
|
+#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K)
|
|
+#define BTRFS_RAID1_MAX_MIRRORS (4)
|
|
/*
|
|
* Read policies for mirrored block group profiles, read picks the stripe based
|
|
* on these policies.
|
|
@@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy {
|
|
enum btrfs_read_policy {
|
|
/* Use process PID to choose the stripe */
|
|
BTRFS_READ_POLICY_PID,
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ /* Balancing raid1 reads across all striped devices (round-robin) */
|
|
+ BTRFS_READ_POLICY_RR,
|
|
+#endif
|
|
BTRFS_NR_READ_POLICY,
|
|
};
|
|
|
|
@@ -432,6 +438,11 @@ struct btrfs_fs_devices {
|
|
/* Policy used to read the mirrored stripes. */
|
|
enum btrfs_read_policy read_policy;
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ /* Min contiguous reads before switching to next device. */
|
|
+ int rr_min_contig_read;
|
|
+#endif
|
|
+
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
/* Checksum mode - offload it or do it synchronously. */
|
|
enum btrfs_offload_csum_mode offload_csum_mode;
|
|
|
|
From 328002ad27e90dc8ff6b7c2022711b6f0df74a01 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:36 +0800
|
|
Subject: [PATCH 14/25] btrfs: add RAID1 preferred read device
|
|
|
|
When there's stale data on a mirrored device, this feature lets you choose
|
|
which device to read from. Mainly used for testing.
|
|
|
|
echo "devid:<devid-value>" > /sys/fs/btrfs/<UUID>/read_policy
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/sysfs.c | 33 ++++++++++++++++++++++++++++++++-
|
|
fs/btrfs/volumes.c | 21 +++++++++++++++++++++
|
|
fs/btrfs/volumes.h | 5 +++++
|
|
3 files changed, 58 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index f3a696ad122965..1a21a123c88d2d 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = {
|
|
"pid",
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
"round-robin",
|
|
+ "devid",
|
|
#endif
|
|
};
|
|
|
|
@@ -1356,8 +1357,11 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
|
if (i == BTRFS_READ_POLICY_RR)
|
|
ret += sysfs_emit_at(buf, ret, ":%d",
|
|
READ_ONCE(fs_devices->rr_min_contig_read));
|
|
-#endif
|
|
|
|
+ if (i == BTRFS_READ_POLICY_DEVID)
|
|
+ ret += sysfs_emit_at(buf, ret, ":%llu",
|
|
+ READ_ONCE(fs_devices->read_devid));
|
|
+#endif
|
|
if (i == policy)
|
|
ret += sysfs_emit_at(buf, ret, "]");
|
|
}
|
|
@@ -1414,6 +1418,33 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
|
|
|
return len;
|
|
}
|
|
+
|
|
+ if (index == BTRFS_READ_POLICY_DEVID) {
|
|
+
|
|
+ if (value != -1) {
|
|
+ BTRFS_DEV_LOOKUP_ARGS(args);
|
|
+
|
|
+ /* Validate input devid */
|
|
+ args.devid = value;
|
|
+ if (btrfs_find_device(fs_devices, &args) == NULL)
|
|
+ return -EINVAL;
|
|
+ } else {
|
|
+ /* Set default devid to the devid of the latest device */
|
|
+ value = fs_devices->latest_dev->devid;
|
|
+ }
|
|
+
|
|
+ if (index != READ_ONCE(fs_devices->read_policy) ||
|
|
+ (value != READ_ONCE(fs_devices->read_devid))) {
|
|
+ WRITE_ONCE(fs_devices->read_policy, index);
|
|
+ WRITE_ONCE(fs_devices->read_devid, value);
|
|
+
|
|
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'",
|
|
+ btrfs_read_policy_name[index], value);
|
|
+
|
|
+ }
|
|
+
|
|
+ return len;
|
|
+ }
|
|
#endif
|
|
if (index != READ_ONCE(fs_devices->read_policy)) {
|
|
WRITE_ONCE(fs_devices->read_policy, index);
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index e5527ee145c2af..a2a0af8f6a9f94 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -1237,6 +1237,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
|
|
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
|
|
+ fs_devices->read_devid = latest_dev->devid;
|
|
#endif
|
|
|
|
return 0;
|
|
@@ -5974,6 +5975,23 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
|
|
}
|
|
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
|
|
+ int num_stripe)
|
|
+{
|
|
+ int last = first + num_stripe;
|
|
+ int stripe_index;
|
|
+
|
|
+ for (stripe_index = first; stripe_index < last; stripe_index++) {
|
|
+ struct btrfs_device *device = map->stripes[stripe_index].dev;
|
|
+
|
|
+ if (device->devid == READ_ONCE(device->fs_devices->read_devid))
|
|
+ return stripe_index;
|
|
+ }
|
|
+
|
|
+ /* If no read-preferred device, use first stripe */
|
|
+ return first;
|
|
+}
|
|
+
|
|
struct stripe_mirror {
|
|
u64 devid;
|
|
int num;
|
|
@@ -6070,6 +6088,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
case BTRFS_READ_POLICY_RR:
|
|
preferred_mirror = btrfs_read_rr(map, first, num_stripes);
|
|
break;
|
|
+ case BTRFS_READ_POLICY_DEVID:
|
|
+ preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
|
|
+ break;
|
|
#endif
|
|
}
|
|
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index 65d56bffc6ef8b..d8075ad17a6d3a 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -308,6 +308,8 @@ enum btrfs_read_policy {
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
/* Balancing raid1 reads across all striped devices (round-robin) */
|
|
BTRFS_READ_POLICY_RR,
|
|
+ /* Read from the specific device */
|
|
+ BTRFS_READ_POLICY_DEVID,
|
|
#endif
|
|
BTRFS_NR_READ_POLICY,
|
|
};
|
|
@@ -441,6 +443,9 @@ struct btrfs_fs_devices {
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
/* Min contiguous reads before switching to next device. */
|
|
int rr_min_contig_read;
|
|
+
|
|
+ /* Device to be used for reading in case of RAID1. */
|
|
+ u64 read_devid;
|
|
#endif
|
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
From 5084cf69a0e706dfcae5e594d915e46a124fb25c Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:37 +0800
|
|
Subject: [PATCH 15/25] btrfs: expose experimental mode in module information
|
|
|
|
Commit c9c49e8f157e ("btrfs: split out CONFIG_BTRFS_EXPERIMENTAL from
|
|
CONFIG_BTRFS_DEBUG") introduces a way to enable or disable experimental
|
|
features. Print its status during module load, like so:
|
|
|
|
Btrfs loaded, experimental=on, debug=on, assert=on, zoned=yes, fsverity=yes
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/super.c | 3 +++
|
|
1 file changed, 3 insertions(+)
|
|
|
|
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
|
|
index c64d0713412231..4742bb2af601a7 100644
|
|
--- a/fs/btrfs/super.c
|
|
+++ b/fs/btrfs/super.c
|
|
@@ -2468,6 +2468,9 @@ static __cold void btrfs_interface_exit(void)
|
|
static int __init btrfs_print_mod_info(void)
|
|
{
|
|
static const char options[] = ""
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ ", experimental=on"
|
|
+#endif
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
", debug=on"
|
|
#endif
|
|
|
|
From fd9d23cf84c07baec0ba5d4bbd9ecd4c0e671e47 Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:38 +0800
|
|
Subject: [PATCH 16/25] btrfs: enable read policy configuration via modprobe
|
|
parameter
|
|
|
|
This update allows configuring the `read_policy` methods using a
|
|
modprobe parameter when experimental mode CONFIG_BTRFS_EXPERIMENTAL
|
|
is enabled.
|
|
|
|
Examples:
|
|
|
|
- Set the RAID1 balancing method to round-robin with a custom
|
|
`min_contig_read` of 4k:
|
|
$ modprobe btrfs read_policy=round-robin:4096
|
|
|
|
- Set the round-robin balancing method with the default
|
|
`min_contig_read`:
|
|
$ modprobe btrfs read_policy=round-robin
|
|
|
|
- Set the `devid` balancing method, defaulting to the latest
|
|
device:
|
|
$ modprobe btrfs read_policy=devid
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/super.c | 5 +++++
|
|
fs/btrfs/sysfs.c | 30 +++++++++++++++++++++++++++++-
|
|
fs/btrfs/sysfs.h | 5 +++++
|
|
fs/btrfs/volumes.c | 14 +++++++++++++-
|
|
4 files changed, 52 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
|
|
index 4742bb2af601a7..448db8974cda70 100644
|
|
--- a/fs/btrfs/super.c
|
|
+++ b/fs/btrfs/super.c
|
|
@@ -2549,6 +2549,11 @@ static const struct init_sequence mod_init_seq[] = {
|
|
}, {
|
|
.init_func = extent_map_init,
|
|
.exit_func = extent_map_exit,
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ }, {
|
|
+ .init_func = btrfs_read_policy_init,
|
|
+ .exit_func = NULL,
|
|
+#endif
|
|
}, {
|
|
.init_func = ordered_data_init,
|
|
.exit_func = ordered_data_exit,
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index 1a21a123c88d2d..3054e3378d394d 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1313,7 +1313,21 @@ static const char *btrfs_read_policy_name[] = {
|
|
#endif
|
|
};
|
|
|
|
-static int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+/* Global module configuration parameters */
|
|
+static char *read_policy;
|
|
+char *btrfs_get_mod_read_policy(void)
|
|
+{
|
|
+ return read_policy;
|
|
+}
|
|
+
|
|
+/* Set perm 0, disable sys/module/btrfs/parameter/read_policy interface */
|
|
+module_param(read_policy, charp, 0);
|
|
+MODULE_PARM_DESC(read_policy,
|
|
+"Global read policy; pid (default), round-robin[:min_contig_read], devid[:devid]");
|
|
+#endif
|
|
+
|
|
+int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
{
|
|
char param[32] = {'\0'};
|
|
char *__maybe_unused value_str;
|
|
@@ -1336,6 +1350,20 @@ static int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
return sysfs_match_string(btrfs_read_policy_name, param);
|
|
}
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+int __init btrfs_read_policy_init(void)
|
|
+{
|
|
+ s64 value;
|
|
+
|
|
+ if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) {
|
|
+ btrfs_err(NULL, "invalid read policy or value %s", read_policy);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+#endif
|
|
+
|
|
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
|
struct kobj_attribute *a, char *buf)
|
|
{
|
|
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
|
|
index e6a284c59809c9..e83efc44e30071 100644
|
|
--- a/fs/btrfs/sysfs.h
|
|
+++ b/fs/btrfs/sysfs.h
|
|
@@ -47,5 +47,10 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
|
|
int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
|
|
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
|
|
struct btrfs_qgroup *qgroup);
|
|
+int btrfs_read_policy_to_enum(const char *str, s64 *value);
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+int __init btrfs_read_policy_init(void);
|
|
+char *btrfs_get_mod_read_policy(void);
|
|
+#endif
|
|
|
|
#endif
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index a2a0af8f6a9f94..f61844fc2da9ab 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -1205,6 +1205,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
|
|
struct btrfs_device *device;
|
|
struct btrfs_device *latest_dev = NULL;
|
|
struct btrfs_device *tmp_device;
|
|
+ s64 __maybe_unused value = 0;
|
|
int ret = 0;
|
|
|
|
list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
|
|
@@ -1234,10 +1235,21 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
|
|
fs_devices->latest_dev = latest_dev;
|
|
fs_devices->total_rw_bytes = 0;
|
|
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
|
|
- fs_devices->read_policy = BTRFS_READ_POLICY_PID;
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
|
|
fs_devices->read_devid = latest_dev->devid;
|
|
+ fs_devices->read_policy =
|
|
+ btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), &value);
|
|
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
|
|
+ fs_devices->fs_stats = true;
|
|
+ if (value) {
|
|
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
|
|
+ fs_devices->rr_min_contig_read = value;
|
|
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID)
|
|
+ fs_devices->read_devid = value;
|
|
+ }
|
|
+#else
|
|
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
|
|
#endif
|
|
|
|
return 0;
|
|
|
|
From 77f79e1f0d91253b9a2aa0ff975bf34ecf3d243e Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Thu, 2 Jan 2025 02:06:39 +0800
|
|
Subject: [PATCH 17/25] btrfs: modload to print read policy status
|
|
|
|
Modified the Btrfs loading message to include the read policy status
|
|
if the experimental feature is enabled.
|
|
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/super.c | 10 ++++++++++
|
|
1 file changed, 10 insertions(+)
|
|
|
|
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
|
|
index 448db8974cda70..ea5ff01881d706 100644
|
|
--- a/fs/btrfs/super.c
|
|
+++ b/fs/btrfs/super.c
|
|
@@ -2491,7 +2491,17 @@ static int __init btrfs_print_mod_info(void)
|
|
", fsverity=no"
|
|
#endif
|
|
;
|
|
+
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ if (btrfs_get_mod_read_policy() == NULL)
|
|
+ pr_info("Btrfs loaded%s\n", options);
|
|
+ else
|
|
+ pr_info("Btrfs loaded%s, read_policy=%s\n",
|
|
+ options, btrfs_get_mod_read_policy());
|
|
+#else
|
|
pr_info("Btrfs loaded%s\n", options);
|
|
+#endif
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
|
|
From ea9e632401927e9c38ae4b3e505fff377535f58b Mon Sep 17 00:00:00 2001
|
|
From: Anand Jain <anand.jain@oracle.com>
|
|
Date: Fri, 11 Oct 2024 10:49:17 +0800
|
|
Subject: [PATCH 18/25] btrfs: use the path with the lowest latency for RAID1
|
|
reads
|
|
|
|
This feature aims to direct the read I/O to the device with the lowest
|
|
known latency for reading RAID1 blocks.
|
|
|
|
echo "latency" > /sys/fs/btrfs/<UUID>/read_policy
|
|
|
|
Co-authored-by: Kai Krakow <kai@kaishome.de>
|
|
Signed-off-by: Anand Jain <anand.jain@oracle.com>
|
|
---
|
|
fs/btrfs/sysfs.c | 3 ++-
|
|
fs/btrfs/volumes.c | 36 ++++++++++++++++++++++++++++++++++++
|
|
fs/btrfs/volumes.h | 2 ++
|
|
3 files changed, 40 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index 3054e3378d394d..fd096b83bb6c45 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1309,6 +1309,7 @@ static const char *btrfs_read_policy_name[] = {
|
|
"pid",
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
"round-robin",
|
|
+ "latency",
|
|
"devid",
|
|
#endif
|
|
};
|
|
@@ -1324,7 +1325,7 @@ char *btrfs_get_mod_read_policy(void)
|
|
/* Set perm 0, disable sys/module/btrfs/parameter/read_policy interface */
|
|
module_param(read_policy, charp, 0);
|
|
MODULE_PARM_DESC(read_policy,
|
|
-"Global read policy; pid (default), round-robin[:min_contig_read], devid[:devid]");
|
|
+"Global read policy; pid (default), round-robin[:min_contig_read], latency, devid[:devid]");
|
|
#endif
|
|
|
|
int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index f61844fc2da9ab..a36c2bfa339785 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -12,6 +12,9 @@
|
|
#include <linux/uuid.h>
|
|
#include <linux/list_sort.h>
|
|
#include <linux/namei.h>
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+#include <linux/part_stat.h>
|
|
+#endif
|
|
#include "misc.h"
|
|
#include "ctree.h"
|
|
#include "disk-io.h"
|
|
@@ -6004,6 +6007,35 @@ static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
|
|
return first;
|
|
}
|
|
|
|
+static int btrfs_best_stripe(struct btrfs_fs_info *fs_info,
|
|
+ struct btrfs_chunk_map *map, int first,
|
|
+ int num_stripe)
|
|
+{
|
|
+ u64 best_wait = U64_MAX;
|
|
+ int best_stripe = 0;
|
|
+ int index;
|
|
+
|
|
+ for (index = first; index < first + num_stripe; index++) {
|
|
+ u64 read_wait;
|
|
+ u64 avg_wait = 0;
|
|
+ unsigned long read_ios;
|
|
+ struct btrfs_device *device = map->stripes[index].dev;
|
|
+
|
|
+ read_wait = part_stat_read(device->bdev, nsecs[READ]);
|
|
+ read_ios = part_stat_read(device->bdev, ios[READ]);
|
|
+
|
|
+ if (read_wait && read_ios && read_wait >= read_ios)
|
|
+ avg_wait = div_u64(read_wait, read_ios);
|
|
+
|
|
+ if (best_wait > avg_wait) {
|
|
+ best_wait = avg_wait;
|
|
+ best_stripe = index;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return best_stripe;
|
|
+}
|
|
+
|
|
struct stripe_mirror {
|
|
u64 devid;
|
|
int num;
|
|
@@ -6103,6 +6135,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
case BTRFS_READ_POLICY_DEVID:
|
|
preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
|
|
break;
|
|
+ case BTRFS_READ_POLICY_LATENCY:
|
|
+ preferred_mirror = btrfs_best_stripe(fs_info, map, first,
|
|
+ num_stripes);
|
|
+ break;
|
|
#endif
|
|
}
|
|
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index d8075ad17a6d3a..6c1f219f83b388 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -308,6 +308,8 @@ enum btrfs_read_policy {
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
/* Balancing raid1 reads across all striped devices (round-robin) */
|
|
BTRFS_READ_POLICY_RR,
|
|
+ /* Use the lowest-latency device dynamically */
|
|
+ BTRFS_READ_POLICY_LATENCY,
|
|
/* Read from the specific device */
|
|
BTRFS_READ_POLICY_DEVID,
|
|
#endif
|
|
|
|
From 680350c9732c58e321968974868836bf13ec5c96 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Wed, 9 Apr 2025 14:07:18 +0200
|
|
Subject: [PATCH 19/25] btrfs: move latency-based selection into helper
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 42 ++++++++++++++++++++++++++++++++----------
|
|
1 file changed, 32 insertions(+), 10 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index a36c2bfa339785..c2f235a02a79ea 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -6007,15 +6007,26 @@ static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
|
|
return first;
|
|
}
|
|
|
|
-static int btrfs_best_stripe(struct btrfs_fs_info *fs_info,
|
|
- struct btrfs_chunk_map *map, int first,
|
|
- int num_stripe)
|
|
+/*
|
|
+ * btrfs_best_stripe
|
|
+ *
|
|
+ * Select a stripe for reading using the average latency:
|
|
+ *
|
|
+ * 1. Compute the average latency of the device by dividing total latency
|
|
+ * by number of IOs.
|
|
+ * 2. Store minimum latency and selected stripe in best_wait / best_stripe.
|
|
+ *
|
|
+ * Will always find at least one stripe.
|
|
+ */
|
|
+static void btrfs_best_stripe(struct btrfs_fs_info *fs_info,
|
|
+ struct btrfs_chunk_map *map, int first,
|
|
+ int num_stripes, u64 *best_wait, int *best_stripe)
|
|
{
|
|
- u64 best_wait = U64_MAX;
|
|
- int best_stripe = 0;
|
|
int index;
|
|
+ *best_wait = U64_MAX;
|
|
+ *best_stripe = 0;
|
|
|
|
- for (index = first; index < first + num_stripe; index++) {
|
|
+ for (index = first; index < first + num_stripes; index++) {
|
|
u64 read_wait;
|
|
u64 avg_wait = 0;
|
|
unsigned long read_ios;
|
|
@@ -6027,11 +6038,22 @@ static int btrfs_best_stripe(struct btrfs_fs_info *fs_info,
|
|
if (read_wait && read_ios && read_wait >= read_ios)
|
|
avg_wait = div_u64(read_wait, read_ios);
|
|
|
|
- if (best_wait > avg_wait) {
|
|
- best_wait = avg_wait;
|
|
- best_stripe = index;
|
|
+ if (*best_wait > avg_wait) {
|
|
+ *best_wait = avg_wait;
|
|
+ *best_stripe = index;
|
|
}
|
|
}
|
|
+}
|
|
+
|
|
+static int btrfs_read_fastest(struct btrfs_fs_info *fs_info,
|
|
+ struct btrfs_chunk_map *map, int first,
|
|
+ int num_stripes)
|
|
+{
|
|
+ u64 best_wait;
|
|
+ int best_stripe;
|
|
+
|
|
+ btrfs_best_stripe(fs_info, map, first, num_stripes, &best_wait,
|
|
+ &best_stripe);
|
|
|
|
return best_stripe;
|
|
}
|
|
@@ -6136,7 +6158,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
|
|
break;
|
|
case BTRFS_READ_POLICY_LATENCY:
|
|
- preferred_mirror = btrfs_best_stripe(fs_info, map, first,
|
|
+ preferred_mirror = btrfs_read_fastest(fs_info, map, first,
|
|
num_stripes);
|
|
break;
|
|
#endif
|
|
|
|
From 1f255624630f889fbd9e268b8d7a77f5ed68fa8c Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Wed, 9 Apr 2025 15:21:14 +0200
|
|
Subject: [PATCH 20/25] btrfs: fix btrfs_read_rr to use the actual number of
|
|
stripes
|
|
|
|
While num_stripe is identical to index at the end of the loop, index
|
|
is really the correct number of indexed stripes for sorting. This
|
|
prepares the function to work with filtered sets of stripes.
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 4 ++--
|
|
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index c2f235a02a79ea..63384cd731ded2 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -6111,11 +6111,11 @@ static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
|
|
stripes[index].num = i;
|
|
index++;
|
|
}
|
|
- sort(stripes, num_stripe, sizeof(struct stripe_mirror),
|
|
+ sort(stripes, index, sizeof(struct stripe_mirror),
|
|
btrfs_cmp_devid, NULL);
|
|
|
|
read_cycle = total_reads / min_reads_per_dev;
|
|
- ret_stripe = stripes[read_cycle % num_stripe].num;
|
|
+ ret_stripe = stripes[read_cycle % index].num;
|
|
|
|
return ret_stripe;
|
|
}
|
|
|
|
From cbe1e71a4bb32092f0fe1cc251c2455bb8a37a78 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Tue, 15 Apr 2025 09:04:57 +0200
|
|
Subject: [PATCH 21/25] btrfs: create a helper instead of open coding device
|
|
latency calculation
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 27 ++++++++++++++-------------
|
|
1 file changed, 14 insertions(+), 13 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index 63384cd731ded2..7d47cb2e0b0411 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -6007,6 +6007,18 @@ static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
|
|
return first;
|
|
}
|
|
|
|
+static u64 btrfs_device_read_latency(struct btrfs_device *device)
|
|
+{
|
|
+ u64 read_wait = part_stat_read(device->bdev, nsecs[READ]);
|
|
+ unsigned long read_ios = part_stat_read(device->bdev, ios[READ]);
|
|
+ u64 avg_wait = 0;
|
|
+
|
|
+ if (read_wait && read_ios && read_wait >= read_ios)
|
|
+ avg_wait = div_u64(read_wait, read_ios);
|
|
+
|
|
+ return avg_wait;
|
|
+}
|
|
+
|
|
/*
|
|
* btrfs_best_stripe
|
|
*
|
|
@@ -6022,22 +6034,11 @@ static void btrfs_best_stripe(struct btrfs_fs_info *fs_info,
|
|
struct btrfs_chunk_map *map, int first,
|
|
int num_stripes, u64 *best_wait, int *best_stripe)
|
|
{
|
|
- int index;
|
|
*best_wait = U64_MAX;
|
|
*best_stripe = 0;
|
|
|
|
- for (index = first; index < first + num_stripes; index++) {
|
|
- u64 read_wait;
|
|
- u64 avg_wait = 0;
|
|
- unsigned long read_ios;
|
|
- struct btrfs_device *device = map->stripes[index].dev;
|
|
-
|
|
- read_wait = part_stat_read(device->bdev, nsecs[READ]);
|
|
- read_ios = part_stat_read(device->bdev, ios[READ]);
|
|
-
|
|
- if (read_wait && read_ios && read_wait >= read_ios)
|
|
- avg_wait = div_u64(read_wait, read_ios);
|
|
-
|
|
+ for (int index = first; index < first + num_stripes; index++) {
|
|
+ u64 avg_wait = btrfs_device_read_latency(map->stripes[index].dev);
|
|
if (*best_wait > avg_wait) {
|
|
*best_wait = avg_wait;
|
|
*best_stripe = index;
|
|
|
|
From 61994a4b9cb1e5cdaaba1276f95317a71a26a755 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Tue, 15 Apr 2025 01:28:06 +0200
|
|
Subject: [PATCH 22/25] btrfs: add filtering by latency to btrfs_read_rr
|
|
|
|
This introduces a new parameter to btrfs_read_rr to select whether we
|
|
filter for latency. In case the caller passes a non-zero min_latency, we return -1 if
|
|
no stripe qualified.
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 20 +++++++++++++++++---
|
|
1 file changed, 17 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index 7d47cb2e0b0411..2e2d7059895d9a 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -6091,7 +6091,8 @@ static int btrfs_cmp_devid(const void *a, const void *b)
|
|
* The calculated stripe index is then used to select the corresponding device
|
|
* from the list of devices, which is ordered by devid.
|
|
*/
|
|
-static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
|
|
+static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripes,
|
|
+ u64 min_latency)
|
|
{
|
|
struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
|
|
struct btrfs_device *device = map->stripes[first].dev;
|
|
@@ -6107,11 +6108,24 @@ static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
|
|
fs_info->sectorsize_bits;
|
|
|
|
index = 0;
|
|
- for (int i = first; i < first + num_stripe; i++) {
|
|
+ for (int i = first; i < first + num_stripes; i++) {
|
|
+ if (min_latency > 0) {
|
|
+ u64 avg_wait = btrfs_device_read_latency(map->stripes[i].dev);
|
|
+ if (min_latency < avg_wait)
|
|
+ continue;
|
|
+ }
|
|
+
|
|
stripes[index].devid = map->stripes[i].dev->devid;
|
|
stripes[index].num = i;
|
|
index++;
|
|
}
|
|
+
|
|
+ /* if the caller passed a minimum latency and we filtered for no
|
|
+ * stripes, return -1 to indicate that no stripe qualified.
|
|
+ */
|
|
+ if (unlikely(min_latency && !index))
|
|
+ return -1;
|
|
+
|
|
sort(stripes, index, sizeof(struct stripe_mirror),
|
|
btrfs_cmp_devid, NULL);
|
|
|
|
@@ -6153,7 +6167,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
break;
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
case BTRFS_READ_POLICY_RR:
|
|
- preferred_mirror = btrfs_read_rr(map, first, num_stripes);
|
|
+ preferred_mirror = btrfs_read_rr(map, first, num_stripes, 0);
|
|
break;
|
|
case BTRFS_READ_POLICY_DEVID:
|
|
preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
|
|
|
|
From bd9761f9f70215bea4dd45789cbca084848da935 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Wed, 9 Apr 2025 15:59:59 +0200
|
|
Subject: [PATCH 23/25] btrfs: add hybrid latency-rr read policy
|
|
|
|
This mode combines latency and round-robin modes by considering all
|
|
stripes within 120% of the minimum latency. It falls back to round-robin
|
|
if all stripes have no latency recorded yet.
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/sysfs.c | 13 +++++++++++--
|
|
fs/btrfs/volumes.c | 38 ++++++++++++++++++++++++++++++++++++++
|
|
fs/btrfs/volumes.h | 2 ++
|
|
3 files changed, 51 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index fd096b83bb6c45..2014475af9716e 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -1310,6 +1310,7 @@ static const char *btrfs_read_policy_name[] = {
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
"round-robin",
|
|
"latency",
|
|
+ "latency-rr",
|
|
"devid",
|
|
#endif
|
|
};
|
|
@@ -1325,7 +1326,7 @@ char *btrfs_get_mod_read_policy(void)
|
|
/* Set perm 0, disable sys/module/btrfs/parameter/read_policy interface */
|
|
module_param(read_policy, charp, 0);
|
|
MODULE_PARM_DESC(read_policy,
|
|
-"Global read policy; pid (default), round-robin[:min_contig_read], latency, devid[:devid]");
|
|
+"Global read policy; pid (default), round-robin[:min_contig_read], latency, latency-rr[:min_contig_read], devid[:devid]");
|
|
#endif
|
|
|
|
int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
|
@@ -1383,6 +1384,10 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
|
ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
|
|
|
|
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ if (i == BTRFS_READ_POLICY_LATENCY_RR)
|
|
+ ret += sysfs_emit_at(buf, ret, ":%d",
|
|
+ READ_ONCE(fs_devices->rr_min_contig_read));
|
|
+
|
|
if (i == BTRFS_READ_POLICY_RR)
|
|
ret += sysfs_emit_at(buf, ret, ":%d",
|
|
READ_ONCE(fs_devices->rr_min_contig_read));
|
|
@@ -1418,7 +1423,11 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
|
index != BTRFS_READ_POLICY_RR)
|
|
fs_devices->fs_stats = false;
|
|
|
|
- if (index == BTRFS_READ_POLICY_RR) {
|
|
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_LATENCY_RR &&
|
|
+ index != BTRFS_READ_POLICY_LATENCY_RR)
|
|
+ fs_devices->fs_stats = false;
|
|
+
|
|
+ if ((index == BTRFS_READ_POLICY_RR) || (index == BTRFS_READ_POLICY_LATENCY_RR)) {
|
|
if (value != -1) {
|
|
u32 sectorsize = fs_devices->fs_info->sectorsize;
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index 2e2d7059895d9a..d3ab0e62c96689 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -6134,6 +6134,40 @@ static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripes
|
|
|
|
return ret_stripe;
|
|
}
|
|
+
|
|
+/*
|
|
+ * btrfs_read_fastest_rr.
|
|
+ *
|
|
+ * Select a stripe for reading using a hybrid algorithm:
|
|
+ *
|
|
+ * 1. Determine the fastest stripe using btrfs_best_stripe.
|
|
+ * 2. Add 20% headroom to the selected latency.
|
|
+ * 3. Select a stripe using btrfs_read_rr filtered by latency.
|
|
+ */
|
|
+static int btrfs_read_fastest_rr(struct btrfs_fs_info *fs_info,
|
|
+ struct btrfs_chunk_map *map, int first,
|
|
+ int num_stripes)
|
|
+{
|
|
+ u64 min_latency;
|
|
+ int ret_stripe = -1;
|
|
+
|
|
+ btrfs_best_stripe(fs_info, map, first, num_stripes, &min_latency,
|
|
+ &ret_stripe);
|
|
+
|
|
+ /* min_latency will be 0 if no latency has been recorded yet,
|
|
+ * add 20% headroom otherwise.
|
|
+ */
|
|
+ if (likely(min_latency)) {
|
|
+ min_latency = min_latency * 6;
|
|
+ min_latency = div_u64(min_latency, 5);
|
|
+ ret_stripe = btrfs_read_rr(map, first, num_stripes, min_latency);
|
|
+ }
|
|
+
|
|
+ if (unlikely(ret_stripe < 0))
|
|
+ ret_stripe = btrfs_read_rr(map, first, num_stripes, 0);
|
|
+
|
|
+ return ret_stripe;
|
|
+}
|
|
#endif
|
|
|
|
static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
@@ -6176,6 +6210,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
|
preferred_mirror = btrfs_read_fastest(fs_info, map, first,
|
|
num_stripes);
|
|
break;
|
|
+ case BTRFS_READ_POLICY_LATENCY_RR:
|
|
+ preferred_mirror = btrfs_read_fastest_rr(fs_info, map, first,
|
|
+ num_stripes);
|
|
+ break;
|
|
#endif
|
|
}
|
|
|
|
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
|
|
index 6c1f219f83b388..a6e8a722d9c742 100644
|
|
--- a/fs/btrfs/volumes.h
|
|
+++ b/fs/btrfs/volumes.h
|
|
@@ -310,6 +310,8 @@ enum btrfs_read_policy {
|
|
BTRFS_READ_POLICY_RR,
|
|
/* Use the lowest-latency device dynamically */
|
|
BTRFS_READ_POLICY_LATENCY,
|
|
+ /* Use hybrid approach of lowest-latency and round-robin */
|
|
+ BTRFS_READ_POLICY_LATENCY_RR,
|
|
/* Read from the specific device */
|
|
BTRFS_READ_POLICY_DEVID,
|
|
#endif
|
|
|
|
From ec0168f2a941c8c995f828a281f9b4eabd891466 Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Tue, 15 Apr 2025 00:32:06 +0200
|
|
Subject: [PATCH 24/25] btrfs: add devinfo avg cumulative read latency to sysfs
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/sysfs.c | 24 ++++++++++++++++++++++++
|
|
1 file changed, 24 insertions(+)
|
|
|
|
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
|
|
index 2014475af9716e..adebb1324c9b1e 100644
|
|
--- a/fs/btrfs/sysfs.c
|
|
+++ b/fs/btrfs/sysfs.c
|
|
@@ -10,6 +10,9 @@
|
|
#include <linux/completion.h>
|
|
#include <linux/bug.h>
|
|
#include <linux/list.h>
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+#include <linux/part_stat.h>
|
|
+#endif
|
|
#include <crypto/hash.h>
|
|
#include "messages.h"
|
|
#include "ctree.h"
|
|
@@ -2176,12 +2179,33 @@ static ssize_t btrfs_devinfo_type_store(struct kobject *kobj,
|
|
}
|
|
BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store);
|
|
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+static ssize_t btrfs_devinfo_avg_read_latency_show(struct kobject *kobj,
|
|
+ struct kobj_attribute *a, char *buf)
|
|
+{
|
|
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
|
|
+ devid_kobj);
|
|
+ u64 read_wait = part_stat_read(device->bdev, nsecs[READ]);
|
|
+ unsigned long read_ios = part_stat_read(device->bdev, ios[READ]);
|
|
+
|
|
+ u64 avg_wait = 0;
|
|
+ if (read_wait && read_ios && read_wait >= read_ios)
|
|
+ avg_wait = div_u64(read_wait, read_ios);
|
|
+
|
|
+ return scnprintf(buf, PAGE_SIZE, "cum %llu\n", avg_wait);
|
|
+}
|
|
+BTRFS_ATTR(devid, avg_read_latency, btrfs_devinfo_avg_read_latency_show);
|
|
+#endif
|
|
+
|
|
/*
|
|
* Information about one device.
|
|
*
|
|
* Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
|
|
*/
|
|
static struct attribute *devid_attrs[] = {
|
|
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
|
+ BTRFS_ATTR_PTR(devid, avg_read_latency),
|
|
+#endif
|
|
BTRFS_ATTR_PTR(devid, error_stats),
|
|
BTRFS_ATTR_PTR(devid, fsid),
|
|
BTRFS_ATTR_PTR(devid, in_fs_metadata),
|
|
|
|
From 6535b1149f58a0b2da7df22743e1eedfbc03b87f Mon Sep 17 00:00:00 2001
|
|
From: Kai Krakow <kai@kaishome.de>
|
|
Date: Tue, 15 Apr 2025 04:42:16 +0200
|
|
Subject: [PATCH 25/25] btrfs: ignore latency early during the first IOs
|
|
|
|
Devices may be slow in this early phase and create spikes which most
|
|
likely disqualify them for reading for the rest of the system
|
|
lifetime.
|
|
|
|
Signed-off-by: Kai Krakow <kai@kaishome.de>
|
|
---
|
|
fs/btrfs/volumes.c | 4 ++++
|
|
1 file changed, 4 insertions(+)
|
|
|
|
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
|
|
index d3ab0e62c96689..72fd14c170393f 100644
|
|
--- a/fs/btrfs/volumes.c
|
|
+++ b/fs/btrfs/volumes.c
|
|
@@ -6007,12 +6007,16 @@ static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
|
|
return first;
|
|
}
|
|
|
|
+#define BTRFS_MIN_READ_IOS_FOR_VALID_LATENCY 100
|
|
static u64 btrfs_device_read_latency(struct btrfs_device *device)
|
|
{
|
|
u64 read_wait = part_stat_read(device->bdev, nsecs[READ]);
|
|
unsigned long read_ios = part_stat_read(device->bdev, ios[READ]);
|
|
u64 avg_wait = 0;
|
|
|
|
+ if (read_ios < BTRFS_MIN_READ_IOS_FOR_VALID_LATENCY)
|
|
+ return 0;
|
|
+
|
|
if (read_wait && read_ios && read_wait >= read_ios)
|
|
avg_wait = div_u64(read_wait, read_ios);
|
|
|