diff --git a/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v2.patch b/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v2.patch new file mode 100644 index 0000000..acb4d95 --- /dev/null +++ b/Btrfs/Allocator Hints/btrfs_allocator_hints-6.12_v2.patch @@ -0,0 +1,514 @@ +From 5e49c78f38cc7f5b7ec012021c8422c1db98ef7e Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:04 +0200 +Subject: [PATCH 1/6] btrfs: add flags to give an hint to the chunk allocator + +Add the following flags to give a hint about which chunk should be +allocated on which disk. +The following flags are created: + +- BTRFS_DEV_ALLOCATION_PREFERRED_DATA + preferred data chunk, but metadata chunk allowed +- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA + preferred metadata chunk, but data chunk allowed +- BTRFS_DEV_ALLOCATION_METADATA_ONLY + only metadata chunk allowed +- BTRFS_DEV_ALLOCATION_DATA_ONLY + only data chunk allowed + +Signed-off-by: Goffredo Baroncelli +--- + include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index fc29d273845d84..71c6135dc7cfb2 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -578,6 +578,20 @@ struct btrfs_node { + struct btrfs_key_ptr ptrs[]; + } __attribute__ ((__packed__)); + ++/* dev_item.type */ ++ ++/* btrfs chunk allocation hints */ ++#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3 ++/* preferred data chunk, but metadata chunk allowed */ ++#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL) ++/* preferred metadata chunk, but data chunk allowed */ ++#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL) ++/* only metadata chunk are allowed */ ++#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) ++/* only data chunk allowed */ ++#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) ++/* 5..7 are unused values */ ++ + struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid;
+ +From 160344ae9ae37b32593adc43716172c37b0a734c Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:05 +0200 +Subject: [PATCH 2/6] btrfs: export dev_item.type in + /sys/fs/btrfs//devinfo//type + +Signed-off-by: Goffredo Baroncelli +--- + fs/btrfs/sysfs.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 03926ad467c919..fe07a7cbcf74c4 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1972,6 +1972,16 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, + } + BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); + ++static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, ++ struct kobj_attribute *a, char *buf) ++{ ++ struct btrfs_device *device = container_of(kobj, struct btrfs_device, ++ devid_kobj); ++ ++ return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); ++} ++BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); ++ + /* + * Information about one device. 
+ * +@@ -1985,6 +1995,7 @@ static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, scrub_speed_max), + BTRFS_ATTR_PTR(devid, writeable), ++ BTRFS_ATTR_PTR(devid, type), + NULL + }; + ATTRIBUTE_GROUPS(devid); + +From 29637f2e3a69fe77a8097bd772a8a7803b9ec576 Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:06 +0200 +Subject: [PATCH 3/6] btrfs: change the DEV_ITEM 'type' field via sysfs + +Signed-off-by: Kai Krakow +--- + fs/btrfs/sysfs.c | 56 +++++++++++++++++++++++++++++++++++++++++++++- + fs/btrfs/volumes.c | 2 +- + fs/btrfs/volumes.h | 2 ++ + 3 files changed, 58 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index fe07a7cbcf74c4..3675d961b39a2a 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1980,7 +1980,61 @@ static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, + + return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); + } +-BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); ++ ++static ssize_t btrfs_devinfo_type_store(struct kobject *kobj, ++ struct kobj_attribute *a, ++ const char *buf, size_t len) ++{ ++ struct btrfs_fs_info *fs_info; ++ struct btrfs_root *root; ++ struct btrfs_device *device; ++ int ret; ++ struct btrfs_trans_handle *trans; ++ ++ u64 type, prev_type; ++ ++ device = container_of(kobj, struct btrfs_device, devid_kobj); ++ fs_info = device->fs_info; ++ if (!fs_info) ++ return -EPERM; ++ ++ root = fs_info->chunk_root; ++ if (sb_rdonly(fs_info->sb)) ++ return -EROFS; ++ ++ ret = kstrtou64(buf, 0, &type); ++ if (ret < 0) ++ return -EINVAL; ++ ++ /* for now, allow to touch only the 'allocation hint' bits */ ++ if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)) ++ return -EINVAL; ++ ++ trans = btrfs_start_transaction(root, 1); ++ if (IS_ERR(trans)) ++ return PTR_ERR(trans); ++ ++ prev_type = device->type; ++ device->type = type; ++ ++ ret = btrfs_update_device(trans, device); ++ ++ if (ret < 0) 
{ ++ btrfs_abort_transaction(trans, ret); ++ btrfs_end_transaction(trans); ++ goto abort; ++ } ++ ++ ret = btrfs_commit_transaction(trans); ++ if (ret < 0) ++ goto abort; ++ ++ return len; ++abort: ++ device->type = prev_type; ++ return ret; ++} ++BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store); + + /* + * Information about one device. +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index eb51b609190fb5..620a9ea74e7558 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2882,7 +2882,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path + return ret; + } + +-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, ++noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) + { + int ret; +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 4481575dd70f35..7bb14d51bffc58 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -836,6 +836,8 @@ int btrfs_bg_type_to_factor(u64 flags); + const char *btrfs_bg_type_to_raid_name(u64 flags); + int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); + bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); ++int btrfs_update_device(struct btrfs_trans_handle *trans, ++ struct btrfs_device *device); + + bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); + +From 970b99e160487e9765b6e7db9f8a89a96ce79811 Mon Sep 17 00:00:00 2001 +From: Goffredo Baroncelli +Date: Sun, 24 Oct 2021 17:31:07 +0200 +Subject: [PATCH 4/6] btrfs: add allocator_hint mode + +When this mode is enabled, the chunk allocation policy is modified as +follow. 
+ +Each disk may have a different tag: +- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA +- BTRFS_DEV_ALLOCATION_METADATA_ONLY +- BTRFS_DEV_ALLOCATION_DATA_ONLY +- BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default) + +Where: +- ALLOCATION_PREFERRED_X means that it is preferred to use this disk for +the X chunk type (the other type may be allowed when the space is low) +- ALLOCATION_X_ONLY means that it is used *only* for the X chunk type. +This means also that it is a preferred choice. + +Each time the allocator allocates a chunk of type X, first it takes the +disks tagged as ALLOCATION_X_ONLY; if these are not enough, it also uses +the disks tagged as ALLOCATION_PREFERRED_X; if these are still not +enough, it also uses the disks tagged as ALLOCATION_PREFERRED_Y, where Y +is the other type of chunk (i.e. not X). Disks tagged as +ALLOCATION_Y_ONLY are never used for chunks of type X. + +Signed-off-by: Goffredo Baroncelli +--- + fs/btrfs/volumes.c | 97 +++++++++++++++++++++++++++++++++++++++++++++- + fs/btrfs/volumes.h | 1 + + 2 files changed, 97 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 620a9ea74e7558..e66700fc8dcd4e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -184,6 +184,19 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags + return BTRFS_BG_FLAG_TO_INDEX(profile); + } + ++#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \ ++ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1) ++#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \ ++ BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) ++ ++static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { ++ [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, ++ [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, ++ [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, ++ [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, ++ /* the other values are set to 0 */ ++}; ++ + const char *btrfs_bg_type_to_raid_name(u64 flags) + { + const int index = btrfs_bg_flags_to_raid_index(flags); +@@ -5022,13 +5035,18
@@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, + } + + /* +- * sort the devices in descending order by max_avail, total_avail ++ * sort the devices in descending order by alloc_hint, ++ * max_avail, total_avail + */ + static int btrfs_cmp_device_info(const void *a, const void *b) + { + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + ++ if (di_a->alloc_hint > di_b->alloc_hint) ++ return -1; ++ if (di_a->alloc_hint < di_b->alloc_hint) ++ return 1; + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) +@@ -5181,6 +5199,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + int ndevs = 0; + u64 max_avail; + u64 dev_offset; ++ int hint; ++ int i; + + /* + * in the first pass through the devices list, we gather information +@@ -5233,16 +5253,91 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; ++ ++ if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && ++ (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { ++ /* ++ * if mixed bg set all the alloc_hint ++ * fields to the same value, so the sorting ++ * is not affected ++ */ ++ devices_info[ndevs].alloc_hint = 0; ++ } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { ++ hint = device->type & BTRFS_DEV_ALLOCATION_MASK; ++ ++ /* ++ * skip BTRFS_DEV_METADATA_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY) ++ continue; ++ /* ++ * if a data chunk must be allocated, ++ * sort also by hint (data disk ++ * higher priority) ++ */ ++ devices_info[ndevs].alloc_hint = -(s8)alloc_hint_map[hint]; ++ } else { /* BTRFS_BLOCK_GROUP_METADATA */ ++ hint = device->type & BTRFS_DEV_ALLOCATION_MASK; ++ ++ /* ++ * skip BTRFS_DEV_DATA_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY) ++ continue; ++ /* ++ * if a data chunk must be allocated, ++ * sort also by hint
(metadata hint ++ * higher priority) ++ */ ++ devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; ++ } ++ + ++ndevs; + } + ctl->ndevs = ndevs; + ++ /* ++ * no devices available ++ */ ++ if (!ndevs) ++ return 0; ++ + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + ++ /* ++ * select the minimum set of disks grouped by hint that ++ * can host the chunk ++ */ ++ ndevs = 0; ++ while (ndevs < ctl->ndevs) { ++ hint = devices_info[ndevs++].alloc_hint; ++ while (ndevs < ctl->ndevs && ++ devices_info[ndevs].alloc_hint == hint) ++ ndevs++; ++ if (ndevs >= ctl->devs_min) ++ break; ++ } ++ ++ BUG_ON(ndevs > ctl->ndevs); ++ ctl->ndevs = ndevs; ++ ++ /* ++ * the next layers require the devices_info ordered by ++ * max_avail. If we are returning two (or more) different ++ * groups of alloc_hint, this is not always true. So sort ++ * them again. ++ */ ++ ++ for (i = 0 ; i < ndevs ; i++) ++ devices_info[i].alloc_hint = 0; ++ ++ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), ++ btrfs_cmp_device_info, NULL); ++ + return 0; + } + +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 7bb14d51bffc58..f3c5437e270a22 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -565,6 +565,7 @@ struct btrfs_device_info { + u64 dev_offset; + u64 max_avail; + u64 total_avail; ++ int alloc_hint; + }; + + struct btrfs_raid_attr { + +From 1c1f2e27d3055b7721468c6980479a043f48e2b3 Mon Sep 17 00:00:00 2001 +From: Kai Krakow +Date: Thu, 27 Jun 2024 20:05:58 +0200 +Subject: [PATCH 5/6] btrfs: add allocator_hint for no allocation preferred + +This is useful where you want to prevent new allocations of chunks on a +disk which is going to be removed from the pool anyway, e.g. due to bad +blocks or because it's slow.
+ +Signed-off-by: Kai Krakow +--- + fs/btrfs/volumes.c | 6 +++++- + include/uapi/linux/btrfs_tree.h | 2 ++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index e66700fc8dcd4e..c6aa93fae9aa65 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -194,6 +194,7 @@ static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, + [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, + [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, ++ [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99, + /* the other values are set to 0 */ + }; + +@@ -5289,7 +5290,10 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + * sort also by hint (metadata hint + * higher priority) + */ +- devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; ++ if (hint == BTRFS_DEV_ALLOCATION_PREFERRED_NONE) ++ devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; ++ else ++ devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + } + + ++ndevs; +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index 71c6135dc7cfb2..92bcc59b129a97 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -590,6 +590,8 @@ struct btrfs_node { + #define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) + /* only data chunk allowed */ + #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) ++/* preferred no chunk, but chunks allowed */ ++#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) + /* 5..7 are unused values */ + + struct btrfs_dev_item { + +From 82553effe6b655f97478b6d13df7ab0ecc192e58 Mon Sep 17 00:00:00 2001 +From: Kai Krakow +Date: Fri, 6 Dec 2024 00:55:31 +0100 +Subject: [PATCH 6/6] btrfs: add allocator_hint to disable allocation + completely + +This is useful where you want to prevent new allocations of chunks to +a set of multiple disks which are going to be removed from the pool. 
+This acts as a multiple `btrfs dev remove` on steroids that can remove +multiple disks in parallel without moving data to disks which would be +removed in the next round. In such cases, it will avoid moving the +same data multiple times, and thus avoid placing it on potentially bad +disks. + +Thanks to @Zygo for the explanation and suggestion. + +Link: https://github.com/kdave/btrfs-progs/issues/907#issuecomment-2520897104 +Signed-off-by: Kai Krakow +--- + fs/btrfs/volumes.c | 11 +++++++++++ + include/uapi/linux/btrfs_tree.h | 4 +++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index c6aa93fae9aa65..99d2c60ac2bf3e 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -190,6 +190,7 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) + + static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { ++ [BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99, + [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, + [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, +@@ -5271,6 +5272,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + */ + if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY) + continue; ++ /* ++ * skip BTRFS_DEV_NONE_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY) ++ continue; + /* + * if a data chunk must be allocated, + * sort also by hint (data disk +@@ -5285,6 +5291,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, + */ + if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY) + continue; ++ /* ++ * skip BTRFS_DEV_NONE_ONLY disks ++ */ ++ if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY) ++ continue; + /* + * if a data chunk must be allocated, + * sort also by hint (metadata hint +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index 92bcc59b129a97..3db20734aacfc6 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ 
b/include/uapi/linux/btrfs_tree.h +@@ -592,7 +592,9 @@ struct btrfs_node { + #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) + /* preferred no chunk, but chunks allowed */ + #define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) +-/* 5..7 are unused values */ ++/* no chunks allowed */ ++#define BTRFS_DEV_ALLOCATION_NONE_ONLY (5ULL) ++/* 6..7 are unused values */ + + struct btrfs_dev_item { + /* the internal btrfs device id */