* Common code for Soft Domain Scheduling
*
* Copyright (C) 2025-2025 Huawei Technologies Co., Ltd
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
*/
#include "sched.h"
#include <linux/sort.h>
static DEFINE_STATIC_KEY_TRUE(__soft_domain_switch);
static int __init soft_domain_switch_setup(char *str)
{
int val = 0;
if (kstrtoint(str, 0, &val))
pr_warn("sched_soft_domain parameter is error: %s\n", str);
else {
if (val == 1)
static_branch_enable(&__soft_domain_switch);
else if (val == 0)
static_branch_disable(&__soft_domain_switch);
}
return 1;
}
__setup("sched_soft_domain=", soft_domain_switch_setup);
static bool soft_domain_enabled(void)
{
return static_branch_likely(&__soft_domain_switch);
}
static DEFINE_PER_CPU(struct soft_domain *, g_sf_d);
static void free_sub_soft_domain(struct soft_domain *sf_d);
static int build_soft_sub_domain(int nid, struct cpumask *cpus)
{
const struct cpumask *span = cpumask_of_node(nid);
struct soft_domain *sf_d = NULL;
int i;
sf_d = kzalloc_node(sizeof(struct soft_domain) + cpumask_size(),
GFP_KERNEL, nid);
if (!sf_d)
return -ENOMEM;
INIT_LIST_HEAD(&sf_d->child_domain);
sf_d->nr_available_cpus = cpumask_weight(span);
cpumask_copy(to_cpumask(sf_d->span), span);
for_each_cpu_and(i, span, cpus) {
struct soft_subdomain *sub_d = NULL;
sub_d = kzalloc_node(sizeof(struct soft_subdomain) + cpumask_size(),
GFP_KERNEL, nid);
if (!sub_d) {
free_sub_soft_domain(sf_d);
return -ENOMEM;
}
list_add_tail(&sub_d->node, &sf_d->child_domain);
cpumask_and(soft_domain_span(sub_d->span), span, topology_cluster_cpumask(i));
cpumask_andnot(cpus, cpus, topology_cluster_cpumask(i));
}
for_each_cpu(i, span) {
rcu_assign_pointer(per_cpu(g_sf_d, i), sf_d);
}
return 0;
}
static void free_sub_soft_domain(struct soft_domain *sf_d)
{
struct list_head *children = &sf_d->child_domain;
struct soft_subdomain *entry = NULL, *next = NULL;
int i;
list_for_each_entry_safe(entry, next, children, node) {
list_del(&entry->node);
kfree(entry);
}
for_each_cpu(i, to_cpumask(sf_d->span)) {
rcu_assign_pointer(per_cpu(g_sf_d, i), NULL);
}
kfree(sf_d);
}
static void free_soft_domain(void)
{
struct soft_domain *sf_d = NULL;
int i;
for_each_cpu(i, cpu_active_mask) {
sf_d = rcu_dereference(per_cpu(g_sf_d, i));
if (sf_d)
free_sub_soft_domain(sf_d);
}
static_branch_disable(&__soft_domain_switch);
}
void build_soft_domain(void)
{
static struct cpumask cpus;
int i, ret;
if (!soft_domain_enabled())
return;
cpumask_copy(&cpus, cpu_active_mask);
rcu_read_lock();
for (i = 0; i < nr_node_ids; i++) {
ret = build_soft_sub_domain(i, &cpus);
if (ret) {
free_soft_domain();
goto out;
}
}
out:
rcu_read_unlock();
}
static DEFINE_MUTEX(soft_domain_mutex);
#define NR_MAX_CLUSTER 16
struct domain_node {
struct soft_subdomain *sud_d;
unsigned int attached;
int idx;
unsigned long util;
};
static int subdomain_cmp(const void *a, const void *b)
{
struct domain_node *ca = (struct domain_node *)a;
struct domain_node *cb = (struct domain_node *)b;
if (ca->attached == cb->attached && ca->attached == 0) {
if (ca->idx < cb->idx)
return -1;
else
return 1;
}
if (ca->attached < cb->attached ||
(ca->attached == cb->attached && ca->util < cb->util))
return -1;
return 1;
}
struct soft_domain_args {
int policy;
int nr_cpu;
struct cpumask *cpus;
};
static int tg_set_soft_domain(struct task_group *tg, void *data)
{
struct soft_domain_args *args = (struct soft_domain_args *)data;
tg->sf_ctx->policy = args->policy;
if (args->policy) {
cpumask_copy(to_cpumask(tg->sf_ctx->span), args->cpus);
tg->sf_ctx->nr_cpus = args->nr_cpu;
} else
cpumask_clear(to_cpumask(tg->sf_ctx->span));
return 0;
}
static int __calc_cpu(struct task_group *tg)
{
int nr_cpu = 1;
if (tg->sf_ctx->nr_cpus)
nr_cpu = tg->sf_ctx->nr_cpus;
#ifdef CONFIG_CFS_BANDWIDTH
else if (tg->cfs_bandwidth.quota != RUNTIME_INF)
nr_cpu = DIV_ROUND_UP_ULL(tg->cfs_bandwidth.quota, tg->cfs_bandwidth.period);
#endif
return nr_cpu;
}
static unsigned long sum_util(struct cpumask *mask)
{
unsigned long sum = 0;
int cpu;
for_each_cpu(cpu, mask)
sum += cpu_util_cfs(cpu_rq(cpu));
return sum;
}
static int __check_policy(struct task_group *tg, void *data)
{
return !!tg->sf_ctx->policy;
}
static int check_policy(struct task_group *tg, long policy)
{
int ret;
rcu_read_lock();
ret = walk_tg_tree_from(tg, __check_policy, tg_nop, NULL);
rcu_read_unlock();
return ret;
}
static struct soft_domain *find_idlest_llc(long policy,
int nr_cpu, cpumask_var_t cpus)
{
int cpu;
int max_cpu = 0;
struct soft_domain *idlest = NULL;
unsigned long min_util = ULONG_MAX;
if (policy > 0) {
for_each_cpu(cpu, cpumask_of_node(policy-1)) {
idlest = rcu_dereference(per_cpu(g_sf_d, cpu));
if (idlest != NULL)
break;
}
if (idlest && nr_cpu <= cpumask_weight(to_cpumask(idlest->span)))
return idlest;
return NULL;
}
cpumask_copy(cpus, cpu_active_mask);
for_each_cpu(cpu, cpus) {
struct soft_domain *sf_d = NULL;
struct cpumask *mask;
sf_d = rcu_dereference(per_cpu(g_sf_d, cpu));
if (sf_d == NULL)
continue;
mask = to_cpumask(sf_d->span);
cpumask_andnot(cpus, cpus, mask);
if (nr_cpu > cpumask_weight(mask))
continue;
* LLC selection order:
* 1. When the number of idle cpus meet the requirements,
* the one with more idles cpus is better;
* 2. Under the condition of insufficient idle cpus, util
* is lower, the better.
*/
if (sf_d->nr_available_cpus > max_cpu &&
nr_cpu <= sf_d->nr_available_cpus) {
max_cpu = sf_d->nr_available_cpus;
idlest = sf_d;
} else if (max_cpu == 0) {
unsigned long util = sum_util(mask);
if (idlest == NULL || util < min_util) {
idlest = sf_d;
min_util = util;
}
}
}
return idlest;
}
static int __sched_group_set_soft_domain(struct task_group *tg, long policy)
{
int cpu;
int ret = 0;
cpumask_var_t cpus;
struct soft_domain_args args;
struct soft_domain *sf_d = NULL;
struct domain_node nodes[NR_MAX_CLUSTER] = {0};
int nr_cpu = __calc_cpu(tg);
if (check_policy(tg, policy))
return -EINVAL;
if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
return -EINVAL;
rcu_read_lock();
sf_d = find_idlest_llc(policy, nr_cpu, cpus);
if (sf_d != NULL) {
struct list_head *children = &sf_d->child_domain;
struct soft_subdomain *sub_d = NULL;
int nr = 0, i;
struct cpumask *tmpmask = NULL;
int tmp_cpu = nr_cpu;
list_for_each_entry(sub_d, children, node) {
nodes[nr].sud_d = sub_d;
nodes[nr].attached = sub_d->attached;
tmpmask = to_cpumask(sub_d->span);
cpu = cpumask_first(tmpmask);
nodes[nr].util = sum_util(tmpmask);
nodes[nr].idx = nr;
nr++;
}
cpumask_clear(cpus);
sort(nodes, nr, sizeof(struct domain_node), subdomain_cmp, NULL);
sf_d->nr_available_cpus -= tmp_cpu;
for (i = 0; i < nr; i++) {
sub_d = nodes[i].sud_d;
tmpmask = to_cpumask(sub_d->span);
cpumask_or(cpus, cpus, tmpmask);
sub_d->attached++;
nr_cpu -= cpumask_weight(tmpmask);
if (nr_cpu <= 0)
break;
}
args.policy = policy;
args.cpus = cpus;
args.nr_cpu = tmp_cpu;
walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args);
* 4.add tg to llc domain task_groups list for load balance.
*/
tg->sf_ctx->nr_cpus = tmp_cpu;
tg->sf_ctx->sf_d = sf_d;
} else {
ret = -EINVAL;
}
rcu_read_unlock();
free_cpumask_var(cpus);
return ret;
}
static int __sched_group_unset_soft_domain(struct task_group *tg)
{
struct soft_domain_args args = {
.policy = 0,
};
struct soft_domain *sf_d = NULL;
struct soft_subdomain *sub_d = NULL;
struct list_head *children = NULL;
if (tg->parent->sf_ctx != NULL && tg->parent->sf_ctx->policy != 0)
return -EINVAL;
sf_d = tg->sf_ctx->sf_d;
sf_d->nr_available_cpus += __calc_cpu(tg);
children = &sf_d->child_domain;
list_for_each_entry(sub_d, children, node) {
if (cpumask_intersects(to_cpumask(tg->sf_ctx->span), to_cpumask(sub_d->span)))
sub_d->attached--;
}
walk_tg_tree_from(tg, tg_set_soft_domain, tg_nop, &args);
return 0;
}
int sched_group_set_soft_domain(struct task_group *tg, long val)
{
int ret = 0;
if (!soft_domain_enabled())
return -EPERM;
if (val < -1 || val > nr_node_ids)
return -EINVAL;
mutex_lock(&soft_domain_mutex);
if (!!tg->sf_ctx->policy == !!val)
goto out;
if (val == 0)
ret = __sched_group_unset_soft_domain(tg);
else
ret = __sched_group_set_soft_domain(tg, val);
if (!ret)
tg->sf_ctx->policy = val;
out:
mutex_unlock(&soft_domain_mutex);
return ret;
}
int sched_group_set_soft_domain_quota(struct task_group *tg, long val)
{
int ret = 0;
if (!soft_domain_enabled())
return -EPERM;
mutex_lock(&soft_domain_mutex);
if (tg->sf_ctx->policy != 0) {
ret = -EINVAL;
goto out;
} else
tg->sf_ctx->nr_cpus = (int)val;
out:
mutex_unlock(&soft_domain_mutex);
return ret;
}
int init_soft_domain(struct task_group *tg, struct task_group *parent)
{
struct soft_domain_ctx *sf_ctx = NULL;
struct soft_domain_ctx *psf_ctx = NULL;
if (!soft_domain_enabled())
return 0;
sf_ctx = kzalloc(sizeof(*sf_ctx) + cpumask_size(), GFP_KERNEL);
if (!sf_ctx)
return -ENOMEM;
mutex_lock(&soft_domain_mutex);
psf_ctx = parent->sf_ctx;
if (psf_ctx) {
sf_ctx->policy = psf_ctx->policy;
sf_ctx->nr_cpus = psf_ctx->nr_cpus;
cpumask_copy(to_cpumask(sf_ctx->span), to_cpumask(psf_ctx->span));
}
tg->sf_ctx = sf_ctx;
mutex_unlock(&soft_domain_mutex);
return 0;
}
void offline_soft_domain(struct task_group *tg)
{
struct soft_domain_ctx *sf_ctx = NULL;
struct soft_domain_ctx *psf_ctx = NULL;
if (!soft_domain_enabled())
return;
sf_ctx = tg->sf_ctx;
psf_ctx = tg->parent->sf_ctx;
if (!sf_ctx)
return;
mutex_lock(&soft_domain_mutex);
if (sf_ctx->policy != 0) {
* parent group is not set, this group set
* soft domain by user.
*/
if (psf_ctx == NULL || psf_ctx->policy == 0)
__sched_group_unset_soft_domain(tg);
}
mutex_unlock(&soft_domain_mutex);
}
int destroy_soft_domain(struct task_group *tg)
{
if (!soft_domain_enabled())
return 0;
kfree(tg->sf_ctx);
return 0;
}