misc/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v1.patch

440 lines
14 KiB
Diff
Raw Normal View History

2024-12-01 09:56:36 +01:00
From 5e49c78f38cc7f5b7ec012021c8422c1db98ef7e Mon Sep 17 00:00:00 2001
From: Goffredo Baroncelli <kreijack@inwind.it>
Date: Sun, 24 Oct 2021 17:31:04 +0200
Subject: [PATCH 1/5] btrfs: add flags to give a hint to the chunk allocator
Add the following flags to give a hint about which chunk type should be
allocated on which disk.
The following flags are created:
- BTRFS_DEV_ALLOCATION_PREFERRED_DATA
preferred data chunk, but metadata chunk allowed
- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA
preferred metadata chunk, but data chunk allowed
- BTRFS_DEV_ALLOCATION_METADATA_ONLY
only metadata chunk allowed
- BTRFS_DEV_ALLOCATION_DATA_ONLY
only data chunk allowed
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
---
include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc29d273845d84..71c6135dc7cfb2 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -578,6 +578,20 @@ struct btrfs_node {
struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));
+/* dev_item.type */
+
+/* btrfs chunk allocation hints */
+#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3
+/* preferred data chunk, but metadata chunk allowed */
+#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL)
+/* preferred metadata chunk, but data chunk allowed */
+#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL)
+/* only metadata chunk are allowed */
+#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL)
+/* only data chunk allowed */
+#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL)
+/* 5..7 are unused values */
+
struct btrfs_dev_item {
/* the internal btrfs device id */
__le64 devid;
From 160344ae9ae37b32593adc43716172c37b0a734c Mon Sep 17 00:00:00 2001
From: Goffredo Baroncelli <kreijack@inwind.it>
Date: Sun, 24 Oct 2021 17:31:05 +0200
Subject: [PATCH 2/5] btrfs: export dev_item.type in
/sys/fs/btrfs/<uuid>/devinfo/<devid>/type
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
---
fs/btrfs/sysfs.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 03926ad467c919..fe07a7cbcf74c4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1972,6 +1972,16 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+static ssize_t btrfs_devinfo_type_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type);
+}
+BTRFS_ATTR(devid, type, btrfs_devinfo_type_show);
+
/*
* Information about one device.
*
@@ -1985,6 +1995,7 @@ static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, replace_target),
BTRFS_ATTR_PTR(devid, scrub_speed_max),
BTRFS_ATTR_PTR(devid, writeable),
+ BTRFS_ATTR_PTR(devid, type),
NULL
};
ATTRIBUTE_GROUPS(devid);
From 29637f2e3a69fe77a8097bd772a8a7803b9ec576 Mon Sep 17 00:00:00 2001
From: Goffredo Baroncelli <kreijack@inwind.it>
Date: Sun, 24 Oct 2021 17:31:06 +0200
Subject: [PATCH 3/5] btrfs: change the DEV_ITEM 'type' field via sysfs
Signed-off-by: Kai Krakow <kai@kaishome.de>
---
fs/btrfs/sysfs.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-
fs/btrfs/volumes.c | 2 +-
fs/btrfs/volumes.h | 2 ++
3 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index fe07a7cbcf74c4..3675d961b39a2a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1980,7 +1980,61 @@ static ssize_t btrfs_devinfo_type_show(struct kobject *kobj,
return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type);
}
-BTRFS_ATTR(devid, type, btrfs_devinfo_type_show);
+
+static ssize_t btrfs_devinfo_type_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_device *device;
+ int ret;
+ struct btrfs_trans_handle *trans;
+
+ u64 type, prev_type;
+
+ device = container_of(kobj, struct btrfs_device, devid_kobj);
+ fs_info = device->fs_info;
+ if (!fs_info)
+ return -EPERM;
+
+ root = fs_info->chunk_root;
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtou64(buf, 0, &type);
+ if (ret < 0)
+ return -EINVAL;
+
+ /* for now, allow to touch only the 'allocation hint' bits */
+ if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1))
+ return -EINVAL;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ prev_type = device->type;
+ device->type = type;
+
+ ret = btrfs_update_device(trans, device);
+
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto abort;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ goto abort;
+
+ return len;
+abort:
+ device->type = prev_type;
+ return ret;
+}
+BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store);
/*
* Information about one device.
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb51b609190fb5..620a9ea74e7558 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2882,7 +2882,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
}
-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4481575dd70f35..7bb14d51bffc58 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -836,6 +836,8 @@ int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
From 970b99e160487e9765b6e7db9f8a89a96ce79811 Mon Sep 17 00:00:00 2001
From: Goffredo Baroncelli <kreijack@inwind.it>
Date: Sun, 24 Oct 2021 17:31:07 +0200
Subject: [PATCH 4/5] btrfs: add allocator_hint mode
When this mode is enabled, the chunk allocation policy is modified as
follows.
Each disk may have a different tag:
- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA
- BTRFS_DEV_ALLOCATION_METADATA_ONLY
- BTRFS_DEV_ALLOCATION_DATA_ONLY
- BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default)
Where:
- ALLOCATION_PREFERRED_X means that it is preferred to use this disk for
the X chunk type (the other type may be allowed when the space is low)
- ALLOCATION_X_ONLY means that it is used *only* for the X chunk type.
This means also that it is a preferred choice.
Each time the allocator allocates a chunk of type X, it first takes the
disks tagged as ALLOCATION_X_ONLY or ALLOCATION_PREFERRED_X; if the space
is not enough, it also uses the disks tagged as ALLOCATION_METADATA_ONLY;
if the space is still not enough, it also uses the other disks, with the
exception of the ones marked as ALLOCATION_PREFERRED_Y, where Y is the
other type of chunk (i.e. not X).
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
---
fs/btrfs/volumes.c | 97 +++++++++++++++++++++++++++++++++++++++++++++-
fs/btrfs/volumes.h | 1 +
2 files changed, 97 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 620a9ea74e7558..e66700fc8dcd4e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -184,6 +184,19 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
return BTRFS_BG_FLAG_TO_INDEX(profile);
}
+#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \
+ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)
+#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \
+ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT)
+
+static const int alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
+ [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1,
+ [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
+ [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
+ [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2,
+ /* the other values are set to 0 */
+};
+
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -5022,13 +5035,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
}
/*
- * sort the devices in descending order by max_avail, total_avail
+ * sort the devices in descending order by alloc_hint,
+ * max_avail, total_avail
*/
static int btrfs_cmp_device_info(const void *a, const void *b)
{
const struct btrfs_device_info *di_a = a;
const struct btrfs_device_info *di_b = b;
+ if (di_a->alloc_hint > di_b->alloc_hint)
+ return -1;
+ if (di_a->alloc_hint < di_b->alloc_hint)
+ return 1;
if (di_a->max_avail > di_b->max_avail)
return -1;
if (di_a->max_avail < di_b->max_avail)
@@ -5181,6 +5199,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
int ndevs = 0;
u64 max_avail;
u64 dev_offset;
+ int hint;
+ int i;
/*
* in the first pass through the devices list, we gather information
@@ -5233,16 +5253,91 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
+
+ if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) {
+ /*
+ * if mixed bg set all the alloc_hint
+ * fields to the same value, so the sorting
+ * is not affected
+ */
+ devices_info[ndevs].alloc_hint = 0;
+ } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
+ hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
+
+ /*
+ * skip BTRFS_DEV_ALLOCATION_METADATA_ONLY disks
+ */
+ if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY)
+ continue;
+ /*
+ * if a data chunk must be allocated,
+ * sort also by hint (data disk
+ * higher priority)
+ */
+ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
+ } else { /* BTRFS_BLOCK_GROUP_METADATA */
+ hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
+
+ /*
+ * skip BTRFS_DEV_ALLOCATION_DATA_ONLY disks
+ */
+ if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY)
+ continue;
+ /*
+ * if a metadata chunk must be allocated,
+ * sort also by hint (metadata hint
+ * higher priority)
+ */
+ devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
+ }
+
++ndevs;
}
ctl->ndevs = ndevs;
+ /*
+ * no devices available
+ */
+ if (!ndevs)
+ return 0;
+
/*
* now sort the devices by hole size / available space
*/
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
+ /*
+ * select the minimum set of disks grouped by hint that
+ * can host the chunk
+ */
+ ndevs = 0;
+ while (ndevs < ctl->ndevs) {
+ hint = devices_info[ndevs++].alloc_hint;
+ while (ndevs < ctl->ndevs &&
+ devices_info[ndevs].alloc_hint == hint)
+ ndevs++;
+ if (ndevs >= ctl->devs_min)
+ break;
+ }
+
+ BUG_ON(ndevs > ctl->ndevs);
+ ctl->ndevs = ndevs;
+
+ /*
+ * the next layers require the devices_info ordered by
+ * max_avail. If we are returning two (or more) different
+ * groups of alloc_hint, this is not always true. So sort
+ * these again.
+ */
+
+ for (i = 0 ; i < ndevs ; i++)
+ devices_info[i].alloc_hint = 0;
+
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
+
return 0;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7bb14d51bffc58..f3c5437e270a22 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -565,6 +565,7 @@ struct btrfs_device_info {
u64 dev_offset;
u64 max_avail;
u64 total_avail;
+ int alloc_hint;
};
struct btrfs_raid_attr {
From 1c1f2e27d3055b7721468c6980479a043f48e2b3 Mon Sep 17 00:00:00 2001
From: Kai Krakow <kk@netactive.de>
Date: Thu, 27 Jun 2024 20:05:58 +0200
Subject: [PATCH 5/5] btrfs: add allocator_hint for no allocation preferred
This is useful where you want to prevent new allocations of chunks on a
disk which is going to be removed from the pool anyway, e.g. due to bad
blocks or because it's slow.
Signed-off-by: Kai Krakow <kai@kaishome.de>
---
fs/btrfs/volumes.c | 6 +++++-
include/uapi/linux/btrfs_tree.h | 2 ++
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e66700fc8dcd4e..c6aa93fae9aa65 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -194,6 +194,7 @@ static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
[BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
[BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
[BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2,
+ [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99,
/* the other values are set to 0 */
};
@@ -5289,7 +5290,10 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
* sort also by hint (metadata hint
* higher priority)
*/
- devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
+ if (hint == BTRFS_DEV_ALLOCATION_PREFERRED_NONE)
+ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
+ else
+ devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
}
++ndevs;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 71c6135dc7cfb2..92bcc59b129a97 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -590,6 +590,8 @@ struct btrfs_node {
#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL)
/* only data chunk allowed */
#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL)
+/* preferred no chunk, but chunks allowed */
+#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL)
/* 5..7 are unused values */
struct btrfs_dev_item {