From 5e49c78f38cc7f5b7ec012021c8422c1db98ef7e Mon Sep 17 00:00:00 2001 From: Goffredo Baroncelli Date: Sun, 24 Oct 2021 17:31:04 +0200 Subject: [PATCH 1/6] btrfs: add flags to give an hint to the chunk allocator Add the following flags to give a hint about which chunk type should be allocated on which disk. The following flags are created: - BTRFS_DEV_ALLOCATION_PREFERRED_DATA preferred data chunk, but metadata chunk allowed - BTRFS_DEV_ALLOCATION_PREFERRED_METADATA preferred metadata chunk, but data chunk allowed - BTRFS_DEV_ALLOCATION_METADATA_ONLY only metadata chunk allowed - BTRFS_DEV_ALLOCATION_DATA_ONLY only data chunk allowed Signed-off-by: Goffredo Baroncelli --- include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d84..71c6135dc7cfb2 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -578,6 +578,20 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); +/* dev_item.type */ + +/* btrfs chunk allocation hints */ +#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3 +/* preferred data chunk, but metadata chunk allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA (0ULL) +/* preferred metadata chunk, but data chunk allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA (1ULL) +/* only metadata chunk are allowed */ +#define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) +/* only data chunk allowed */ +#define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) +/* 5..7 are unused values */ + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; From 160344ae9ae37b32593adc43716172c37b0a734c Mon Sep 17 00:00:00 2001 From: Goffredo Baroncelli Date: Sun, 24 Oct 2021 17:31:05 +0200 Subject: [PATCH 2/6] btrfs: export dev_item.type in /sys/fs/btrfs//devinfo//type Signed-off-by: Goffredo Baroncelli --- fs/btrfs/sysfs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c919..fe07a7cbcf74c4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1972,6 +1972,16 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); +static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); +} +BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); + /* * Information about one device. * @@ -1985,6 +1995,7 @@ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, replace_target), BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), + BTRFS_ATTR_PTR(devid, type), NULL }; ATTRIBUTE_GROUPS(devid); From 29637f2e3a69fe77a8097bd772a8a7803b9ec576 Mon Sep 17 00:00:00 2001 From: Goffredo Baroncelli Date: Sun, 24 Oct 2021 17:31:06 +0200 Subject: [PATCH 3/6] btrfs: change the DEV_ITEM 'type' field via sysfs Signed-off-by: Kai Krakow --- fs/btrfs/sysfs.c | 56 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 ++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fe07a7cbcf74c4..3675d961b39a2a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1980,7 +1980,61 @@ static ssize_t btrfs_devinfo_type_show(struct kobject *kobj, return scnprintf(buf, PAGE_SIZE, "0x%08llx\n", device->type); } -BTRFS_ATTR(devid, type, btrfs_devinfo_type_show); + +static ssize_t btrfs_devinfo_type_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_device *device; + int ret; + struct btrfs_trans_handle *trans; + + u64 type, prev_type; + + device = container_of(kobj, struct btrfs_device, devid_kobj); + fs_info = 
device->fs_info; + if (!fs_info) + return -EPERM; + + root = fs_info->chunk_root; + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtou64(buf, 0, &type); + if (ret < 0) + return -EINVAL; + + /* for now, allow to touch only the 'allocation hint' bits */ + if (type & ~((1 << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)) + return -EINVAL; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + prev_type = device->type; + device->type = type; + + ret = btrfs_update_device(trans, device); + + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto abort; + } + + ret = btrfs_commit_transaction(trans); + if (ret < 0) + goto abort; + + return len; +abort: + device->type = prev_type; + return ret; +} +BTRFS_ATTR_RW(devid, type, btrfs_devinfo_type_show, btrfs_devinfo_type_store); /* * Information about one device. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eb51b609190fb5..620a9ea74e7558 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2882,7 +2882,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; } -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, +noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4481575dd70f35..7bb14d51bffc58 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -836,6 +836,8 @@ int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); From 
970b99e160487e9765b6e7db9f8a89a96ce79811 Mon Sep 17 00:00:00 2001 From: Goffredo Baroncelli Date: Sun, 24 Oct 2021 17:31:07 +0200 Subject: [PATCH 4/6] btrfs: add allocator_hint mode When this mode is enabled, the chunk allocation policy is modified as follows. Each disk may have a different tag: - BTRFS_DEV_ALLOCATION_PREFERRED_METADATA - BTRFS_DEV_ALLOCATION_METADATA_ONLY - BTRFS_DEV_ALLOCATION_DATA_ONLY - BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default) Where: - ALLOCATION_PREFERRED_X means that it is preferred to use this disk for the X chunk type (the other type may be allowed when the space is low) - ALLOCATION_X_ONLY means that it is used *only* for the X chunk type. This means also that it is a preferred choice. Each time the allocator allocates a chunk of type X, it first takes the disks tagged as ALLOCATION_X_ONLY; if these are not enough to host the chunk, it also uses the disks tagged as ALLOCATION_PREFERRED_X; if these are still not enough, it also uses the disks tagged as ALLOCATION_PREFERRED_Y, where Y is the other type of chunk (i.e. not X). The disks tagged as ALLOCATION_Y_ONLY are never used for a chunk of type X. 
Signed-off-by: Goffredo Baroncelli --- fs/btrfs/volumes.c | 97 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 1 + 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 620a9ea74e7558..e66700fc8dcd4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -184,6 +184,19 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags return BTRFS_BG_FLAG_TO_INDEX(profile); } +#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \ + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1) +#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \ + BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) + +static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { + [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, + [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, + [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, + [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, + /* the other values are set to 0 */ +}; + const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); @@ -5022,13 +5035,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, } /* - * sort the devices in descending order by max_avail, total_avail + * sort the devices in descending order by alloc_hint, + * max_avail, total_avail */ static int btrfs_cmp_device_info(const void *a, const void *b) { const struct btrfs_device_info *di_a = a; const struct btrfs_device_info *di_b = b; + if (di_a->alloc_hint > di_b->alloc_hint) + return -1; + if (di_a->alloc_hint < di_b->alloc_hint) + return 1; if (di_a->max_avail > di_b->max_avail) return -1; if (di_a->max_avail < di_b->max_avail) @@ -5181,6 +5199,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, int ndevs = 0; u64 max_avail; u64 dev_offset; + int hint; + int i; /* * in the first pass through the devices list, we gather information @@ -5233,16 +5253,91 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, 
devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; + + if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && + (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { + /* + * if mixed bg set all the alloc_hint + * fields to the same value, so the sorting + * is not affected + */ + devices_info[ndevs].alloc_hint = 0; + } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { + hint = device->type & BTRFS_DEV_ALLOCATION_MASK; + + /* + * skip BTRFS_DEV_METADATA_ONLY disks + */ + if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY) + continue; + /* + * if a data chunk must be allocated, + * sort also by hint (data disk + * higher priority) + */ + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + } else { /* BTRFS_BLOCK_GROUP_METADATA */ + hint = device->type & BTRFS_DEV_ALLOCATION_MASK; + + /* + * skip BTRFS_DEV_DATA_ONLY disks + */ + if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY) + continue; + /* + * if a data chunk must be allocated, + * sort also by hint (metadata hint + * higher priority) + */ + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + } + ++ndevs; } ctl->ndevs = ndevs; + /* + * no devices available + */ + if (!ndevs) + return 0; + /* * now sort the devices by hole size / available space */ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); + /* + * select the minimum set of disks grouped by hint that + * can host the chunk + */ + ndevs = 0; + while (ndevs < ctl->ndevs) { + hint = devices_info[ndevs++].alloc_hint; + while (ndevs < ctl->ndevs && + devices_info[ndevs].alloc_hint == hint) + ndevs++; + if (ndevs >= ctl->devs_min) + break; + } + + BUG_ON(ndevs > ctl->ndevs); + ctl->ndevs = ndevs; + + /* + * the next layers require the devices_info ordered by + * max_avail. If we are returning two (or more) different + * groups of alloc_hint, this is not always true. So sort + * them again. 
+ */ + + for (i = 0 ; i < ndevs ; i++) + devices_info[i].alloc_hint = 0; + + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7bb14d51bffc58..f3c5437e270a22 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -565,6 +565,7 @@ struct btrfs_device_info { u64 dev_offset; u64 max_avail; u64 total_avail; + int alloc_hint; }; struct btrfs_raid_attr { From 1c1f2e27d3055b7721468c6980479a043f48e2b3 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Thu, 27 Jun 2024 20:05:58 +0200 Subject: [PATCH 5/6] btrfs: add allocator_hint for no allocation preferred This is useful where you want to prevent new allocations of chunks on a disk which is going to be removed from the pool anyway, e.g. due to bad blocks or because it's slow. Signed-off-by: Kai Krakow --- fs/btrfs/volumes.c | 6 +++++- include/uapi/linux/btrfs_tree.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e66700fc8dcd4e..c6aa93fae9aa65 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -194,6 +194,7 @@ static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, [BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2, + [BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99, /* the other values are set to 0 */ }; @@ -5289,7 +5290,10 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, * sort also by hint (metadata hint * higher priority) */ - devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + if (hint == BTRFS_DEV_ALLOCATION_PREFERRED_NONE) + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + else + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; } ++ndevs; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 71c6135dc7cfb2..92bcc59b129a97 100644 --- a/include/uapi/linux/btrfs_tree.h +++ 
b/include/uapi/linux/btrfs_tree.h @@ -590,6 +590,8 @@ struct btrfs_node { #define BTRFS_DEV_ALLOCATION_METADATA_ONLY (2ULL) /* only data chunk allowed */ #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) +/* preferred no chunk, but chunks allowed */ +#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) /* 5..7 are unused values */ struct btrfs_dev_item { From 82553effe6b655f97478b6d13df7ab0ecc192e58 Mon Sep 17 00:00:00 2001 From: Kai Krakow Date: Fri, 6 Dec 2024 00:55:31 +0100 Subject: [PATCH 6/6] btrfs: add allocator_hint to disable allocation completely This is useful where you want to prevent new allocations of chunks to a set of multiple disks which are going to be removed from the pool. This acts as a multiple `btrfs dev remove` on steroids that can remove multiple disks in parallel without moving data to disks which would be removed in the next round. In such cases, it will avoid moving the same data multiple times, and thus avoid placing it on potentially bad disks. Thanks to @Zygo for the explanation and suggestion. 
While at it, declare alloc_hint_map as s8 instead of plain char: the kernel is built with -funsigned-char (since v6.2), so the negative priorities (-1, -99) would otherwise be stored as 255 and 157, and after negation the DATA_ONLY disks would sort last instead of first when a data chunk is allocated. Link: https://github.com/kdave/btrfs-progs/issues/907#issuecomment-2520897104 Signed-off-by: Kai Krakow --- fs/btrfs/volumes.c | 13 ++++++++++++- include/uapi/linux/btrfs_tree.h | 4 +++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6aa93fae9aa65..99d2c60ac2bf3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -190,6 +190,7 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) -static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { +static const s8 alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = { + [BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99, [BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1, [BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0, [BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1, @@ -5271,6 +5272,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, */ if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY) continue; + /* + * skip BTRFS_DEV_NONE_ONLY disks + */ + if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY) + continue; /* * if a data chunk must be allocated, * sort also by hint (data disk @@ -5285,6 +5291,11 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, */ if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY) continue; + /* + * skip BTRFS_DEV_NONE_ONLY disks + */ + if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY) + continue; /* * if a data chunk must be allocated, * sort also by hint (metadata hint diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 92bcc59b129a97..3db20734aacfc6 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -592,7 +592,9 @@ struct btrfs_node { #define BTRFS_DEV_ALLOCATION_DATA_ONLY (3ULL) /* preferred no chunk, but chunks allowed */ #define BTRFS_DEV_ALLOCATION_PREFERRED_NONE (4ULL) -/* 5..7 are unused values */ +/* no chunks allowed */ +#define BTRFS_DEV_ALLOCATION_NONE_ONLY (5ULL) +/* 6..7 are unused values */ struct btrfs_dev_item { /* the internal btrfs device id */