diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 262a94621446..05f5935eeac8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2506,7 +2506,9 @@ specified in the flag list (default: domain): nohz - Disable the tick when a single task runs. + Disable the tick when a single task runs as well as + disabling other kernel noises like having RCU callbacks + offloaded. This is equivalent to the nohz_full parameter. A residual 1Hz tick is offloaded to workqueues, which you need to affine to housekeeping through the global diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index 7c2b16c4729d..caea83d91c67 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -2,6 +2,12 @@ Scheduler Statistics ==================== +Version 17 of schedstats removed 'lb_imbalance' field as it has no +significance anymore and instead added more relevant fields namely +'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and +'lb_imbalance_misfit'. The domain field prints the name of the +corresponding sched domain from this version onwards. + Version 16 of schedstats changed the order of definitions within 'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] columns in show_schedstat(). In particular the position of CPU_IDLE @@ -9,7 +15,9 @@ and __CPU_NOT_IDLE changed places. The size of the array is unchanged. Version 15 of schedstats dropped counters for some sched_yield: yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is -identical to version 14. +identical to version 14. Details are available at + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-stats.txt?id=1e1dbb259c79b Version 14 of schedstats includes support for sched_domains, which hit the mainline kernel in 2.6.20 although it is identical to the stats from version @@ -26,7 +34,14 @@ cpus on the machine, while domain0 is the most tightly focused domain, sometimes balancing only between pairs of cpus. At this time, there are no architectures which need more than three domain levels. The first field in the domain stats is a bit map indicating which cpus are affected -by that domain. +by that domain. Details are available at + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=b762f3ffb797c + +The schedstat documentation is maintained version 10 onwards and is not +updated for version 11 and 12. The details for version 10 are available at + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=1da177e4c3f4 These fields are counters, and only increment. Programs which make use of these will need to start with a baseline observation and then calculate @@ -71,88 +86,97 @@ Domain statistics ----------------- One of these is produced per domain for each cpu described. (Note that if CONFIG_SMP is not defined, *no* domains are utilized and these lines -will not appear in the output.) +will not appear in the output. is an extension to the domain field +that prints the name of the corresponding sched domain. It can appear in +schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.) -domain 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 +domain 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 The first field is a bit mask indicating what cpus this domain operates over. -The next 24 are a variety of sched_balance_rq() statistics in grouped into types -of idleness (idle, busy, and newly idle): +The next 33 are a variety of sched_balance_rq() statistics in grouped into types +of idleness (busy, idle and newly idle): 1) # of times in this domain sched_balance_rq() was called when the - cpu was idle - 2) # of times in this domain sched_balance_rq() checked but found - the load did not require balancing when the cpu was idle - 3) # of times in this domain sched_balance_rq() tried to move one or - more tasks and failed, when the cpu was idle - 4) sum of imbalances discovered (if any) with each call to - sched_balance_rq() in this domain when the cpu was idle - 5) # of times in this domain pull_task() was called when the cpu - was idle - 6) # of times in this domain pull_task() was called even though - the target task was cache-hot when idle - 7) # of times in this domain sched_balance_rq() was called but did - not find a busier queue while the cpu was idle - 8) # of times in this domain a busier queue was found while the - cpu was idle but no busier group was found - 9) # of times in this domain sched_balance_rq() was called when the cpu was busy - 10) # of times in this domain sched_balance_rq() checked but found the + 2) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when busy - 11) # of times in this domain sched_balance_rq() tried to move one or + 3) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was busy - 12) sum of imbalances discovered (if any) with each call to - sched_balance_rq() in this domain when the cpu was busy - 13) # of times in this domain pull_task() was called when busy - 14) # of times in this domain pull_task() was called even though the + 4) Total imbalance in load when the cpu was busy + 5) Total imbalance in utilization when the cpu was busy + 6) Total imbalance in number of tasks when the cpu was busy + 7) Total imbalance due to misfit tasks when the cpu was busy + 8) # of times in this domain pull_task() was called when busy + 9) # of times in this domain pull_task() was called even though the target task was cache-hot when busy - 15) # of times in this domain sched_balance_rq() was called but did not + 10) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was busy - 16) # of times in this domain a busier queue was found while the cpu + 11) # of times in this domain a busier queue was found while the cpu was busy but no busier group was found - 17) # of times in this domain sched_balance_rq() was called when the - cpu was just becoming idle - 18) # of times in this domain sched_balance_rq() checked but found the + 12) # of times in this domain sched_balance_rq() was called when the + cpu was idle + 13) # of times in this domain sched_balance_rq() checked but found + the load did not require balancing when the cpu was idle + 14) # of times in this domain sched_balance_rq() tried to move one or + more tasks and failed, when the cpu was idle + 15) Total imbalance in load when the cpu was idle + 16) Total imbalance in utilization when the cpu was idle + 17) Total imbalance in number of tasks when the cpu was idle + 18) Total imbalance due to misfit tasks when the cpu was idle + 19) # of times in this domain pull_task() was called when the cpu + was idle + 20) # of times in this domain pull_task() was called even though + the target task was cache-hot when idle + 21) # of times in this domain sched_balance_rq() was called but did + not find a busier queue while the cpu was idle + 22) # of times in this domain a busier queue was found while the + cpu was idle but no busier group was found + + 23) # of times in this domain sched_balance_rq() was called when the + was just becoming idle + 24) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when the cpu was just becoming idle - 19) # of times in this domain sched_balance_rq() tried to move one or more + 25) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was just becoming idle - 20) sum of imbalances discovered (if any) with each call to - sched_balance_rq() in this domain when the cpu was just becoming idle - 21) # of times in this domain pull_task() was called when newly idle - 22) # of times in this domain pull_task() was called even though the + 26) Total imbalance in load when the cpu was just becoming idle + 27) Total imbalance in utilization when the cpu was just becoming idle + 28) Total imbalance in number of tasks when the cpu was just becoming idle + 29) Total imbalance due to misfit tasks when the cpu was just becoming idle + 30) # of times in this domain pull_task() was called when newly idle + 31) # of times in this domain pull_task() was called even though the target task was cache-hot when just becoming idle - 23) # of times in this domain sched_balance_rq() was called but did not + 32) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was just becoming idle - 24) # of times in this domain a busier queue was found while the cpu + 33) # of times in this domain a busier queue was found while the cpu was just becoming idle but no busier group was found Next three are active_load_balance() statistics: - 25) # of times active_load_balance() was called - 26) # of times active_load_balance() tried to move a task and failed - 27) # of times active_load_balance() successfully moved a task + 34) # of times active_load_balance() was called + 35) # of times active_load_balance() tried to move a task and failed + 36) # of times active_load_balance() successfully moved a task Next three are sched_balance_exec() statistics: - 28) sbe_cnt is not used - 29) sbe_balanced is not used - 30) sbe_pushed is not used + 37) sbe_cnt is not used + 38) sbe_balanced is not used + 39) sbe_pushed is not used Next three are sched_balance_fork() statistics: - 31) sbf_cnt is not used - 32) sbf_balanced is not used - 33) sbf_pushed is not used + 40) sbf_cnt is not used + 41) sbf_balanced is not used + 42) sbf_pushed is not used Next three are try_to_wake_up() statistics: - 34) # of times in this domain try_to_wake_up() awoke a task that + 43) # of times in this domain try_to_wake_up() awoke a task that last ran on a different cpu in this domain - 35) # of times in this domain try_to_wake_up() moved a task to the + 44) # of times in this domain try_to_wake_up() moved a task to the waking cpu because it was cache-cold on its own cpu anyway - 36) # of times in this domain try_to_wake_up() started passive balancing + 45) # of times in this domain try_to_wake_up() started passive balancing /proc//schedstat --------------------- diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 3973cb9bb2e6..ec134b719144 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -251,7 +251,7 @@ extern bool x86_topology_update; #include DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); -extern unsigned int __read_mostly sysctl_sched_itmt_enabled; +extern bool __read_mostly sysctl_sched_itmt_enabled; /* Interface to set priority of a cpu */ void sched_set_itmt_core_prio(int prio, int core_cpu); @@ -264,7 +264,7 @@ void sched_clear_itmt_support(void); #else /* CONFIG_SCHED_MC_PRIO */ -#define sysctl_sched_itmt_enabled 0 +#define sysctl_sched_itmt_enabled false static inline void sched_set_itmt_core_prio(int prio, int core_cpu) { } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 51b805c727fc..9cea1fc36c18 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable; * of higher turbo frequency for cpus supporting Intel Turbo Boost Max * Technology 3.0. * - * It can be set via /proc/sys/kernel/sched_itmt_enabled + * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled */ -unsigned int __read_mostly sysctl_sched_itmt_enabled; +bool __read_mostly sysctl_sched_itmt_enabled; -static int sched_itmt_update_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static ssize_t sched_itmt_enabled_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned int old_sysctl; - int ret; + ssize_t result; + bool orig; - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (!sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); - return -EINVAL; - } + orig = sysctl_sched_itmt_enabled; + result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); - old_sysctl = sysctl_sched_itmt_enabled; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + if (sysctl_sched_itmt_enabled != orig) { x86_topology_update = true; rebuild_sched_domains(); } - mutex_unlock(&itmt_update_mutex); - - return ret; + return result; } -static struct ctl_table itmt_kern_table[] = { - { - .procname = "sched_itmt_enabled", - .data = &sysctl_sched_itmt_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_itmt_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, +static const struct file_operations dfs_sched_itmt_fops = { + .read = debugfs_read_file_bool, + .write = sched_itmt_enabled_write, + .open = simple_open, + .llseek = default_llseek, }; -static struct ctl_table_header *itmt_sysctl_header; +static struct dentry *dfs_sched_itmt; /** * sched_set_itmt_support() - Indicate platform supports ITMT @@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header; */ int sched_set_itmt_support(void) { - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); + if (sched_itmt_capable) return 0; - } - itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); - if (!itmt_sysctl_header) { - mutex_unlock(&itmt_update_mutex); + dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled", + 0644, + arch_debugfs_dir, + &sysctl_sched_itmt_enabled, + &dfs_sched_itmt_fops); + if (IS_ERR_OR_NULL(dfs_sched_itmt)) { + dfs_sched_itmt = NULL; return -ENOMEM; } @@ -117,8 +109,6 @@ int sched_set_itmt_support(void) x86_topology_update = true; rebuild_sched_domains(); - mutex_unlock(&itmt_update_mutex); - return 0; } @@ -134,18 +124,15 @@ int sched_set_itmt_support(void) */ void sched_clear_itmt_support(void) { - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (!sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); + if (!sched_itmt_capable) return; - } + sched_itmt_capable = false; - if (itmt_sysctl_header) { - unregister_sysctl_table(itmt_sysctl_header); - itmt_sysctl_header = NULL; - } + debugfs_remove(dfs_sched_itmt); + dfs_sched_itmt = NULL; if (sysctl_sched_itmt_enabled) { /* disable sched_itmt if we are no longer ITMT capable */ @@ -153,8 +140,6 @@ void sched_clear_itmt_support(void) x86_topology_update = true; rebuild_sched_domains(); } - - mutex_unlock(&itmt_update_mutex); } int arch_asym_cpu_priority(int cpu) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0e3f9bad0395..c10850ae6f09 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -483,12 +483,6 @@ static int x86_core_flags(void) return cpu_core_flags() | x86_sched_itmt_flags(); } #endif -#ifdef CONFIG_SCHED_SMT -static int x86_smt_flags(void) -{ - return cpu_smt_flags(); -} -#endif #ifdef CONFIG_SCHED_CLUSTER static int x86_cluster_flags(void) { @@ -496,15 +490,6 @@ static int x86_cluster_flags(void) } #endif -static int x86_die_flags(void) -{ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU) || - cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) - return x86_sched_itmt_flags(); - - return 0; -} - /* * Set if a package/die has multiple NUMA nodes inside. * AMD Magny-Cours, Intel Cluster-on-Die, and Intel @@ -520,7 +505,7 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }; #endif #ifdef CONFIG_SCHED_CLUSTER @@ -540,7 +525,7 @@ static void __init build_sched_topology(void) */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) + cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) }; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c7eb16ab1d5..ac08431e238f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -944,6 +944,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; @@ -1374,6 +1375,15 @@ struct task_struct { * with respect to preemption. */ unsigned long rseq_event_mask; +# ifdef CONFIG_DEBUG_RSEQ + /* + * This is a place holder to save a copy of the rseq fields for + * validation of read-only fields. The struct rseq has a + * variable-length array at the end, so it cannot be used + * directly. Reserve a size large enough for the known fields. + */ + char rseq_fields[sizeof(struct rseq)]; +# endif #endif #ifdef CONFIG_SCHED_MM_CID diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 2b461129d1fa..d8501f4709b5 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,16 +7,21 @@ #include enum hk_type { - HK_TYPE_TIMER, - HK_TYPE_RCU, - HK_TYPE_MISC, - HK_TYPE_SCHED, - HK_TYPE_TICK, HK_TYPE_DOMAIN, - HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, - HK_TYPE_KTHREAD, - HK_TYPE_MAX + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. + */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4237daa5ac7a..7f3dbafe1817 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -114,7 +114,10 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; @@ -140,9 +143,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -198,18 +199,12 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..442aba29bc4c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -25,6 +26,78 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +#ifdef CONFIG_DEBUG_RSEQ +static struct rseq *rseq_kernel_fields(struct task_struct *t) +{ + return (struct rseq *) t->rseq_fields; +} + +static int rseq_validate_ro_fields(struct task_struct *t) +{ + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + u32 cpu_id_start, cpu_id, node_id, mm_cid; + struct rseq __user *rseq = t->rseq; + + /* + * Validate fields which are required to be read-only by + * user-space. + */ + if (!user_read_access_begin(rseq, t->rseq_len)) + goto efault; + unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); + unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_get_user(node_id, &rseq->node_id, efault_end); + unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); + user_read_access_end(); + + if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || + cpu_id != rseq_kernel_fields(t)->cpu_id || + node_id != rseq_kernel_fields(t)->node_id || + mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { + + pr_warn("Detected rseq corruption for pid: %d, name: %s\n" + "\tcpu_id_start: %u ?= %u\n" + "\tcpu_id: %u ?= %u\n" + "\tnode_id: %u ?= %u\n" + "\tmm_cid: %u ?= %u\n", + t->pid, t->comm, + cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, + cpu_id, rseq_kernel_fields(t)->cpu_id, + node_id, rseq_kernel_fields(t)->node_id, + mm_cid, rseq_kernel_fields(t)->mm_cid); + } + + /* For now, only print a console warning on mismatch. */ + return 0; + +efault_end: + user_read_access_end(); +efault: + return -EFAULT; +} + +static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, + u32 node_id, u32 mm_cid) +{ + rseq_kernel_fields(t)->cpu_id_start = cpu_id; + rseq_kernel_fields(t)->cpu_id = cpu_id; + rseq_kernel_fields(t)->node_id = node_id; + rseq_kernel_fields(t)->mm_cid = mm_cid; +} +#else +static int rseq_validate_ro_fields(struct task_struct *t) +{ + return 0; +} + +static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, + u32 node_id, u32 mm_cid) +{ +} +#endif + /* * * Restartable sequences are a lightweight interface that allows @@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t) u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); + /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + goto efault; WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; @@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t) * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); + rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -119,6 +198,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; + /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + return -EFAULT; /* * Reset cpu_id_start to its initial state (0). */ @@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) */ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; + + rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if @@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; +#ifdef CONFIG_DEBUG_RSEQ + /* + * Initialize the in-kernel rseq fields copy for validation of + * read-only fields. + */ + if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || + get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || + get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || + get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) + return -EFAULT; +#endif /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..4365b479e345 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -740,39 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) s64 __maybe_unused steal = 0, irq_delta = 0; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + if (irqtime_enabled()) { + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}IRQ region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}IRQ - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}IRQ region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}IRQ + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; - rq->prev_irq_time += irq_delta; - delta -= irq_delta; - delayacct_irq(rq->curr, irq_delta); + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + delayacct_irq(rq->curr, irq_delta); + } #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); + u64 prev_steal; + + steal = prev_steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - rq->prev_steal_time_rq += steal; + rq->prev_steal_time_rq = prev_steal; delta -= steal; } #endif @@ -1168,13 +1172,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); guard(rcu)(); @@ -1189,7 +1193,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); return default_cpu; } @@ -1341,7 +1345,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.h_nr_running > 1) + if (rq->cfs.h_nr_queued > 1) return false; /* @@ -5632,7 +5636,7 @@ void sched_tick(void) unsigned long hw_pressure; u64 resched_latency; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) arch_scale_freq_tick(); sched_clock_tick(); @@ -5771,7 +5775,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5792,7 +5796,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -6018,7 +6022,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * opportunity to pull in more work from other CPUs. */ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) @@ -6641,7 +6645,6 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; - bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6702,7 +6705,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - block = try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } @@ -6748,7 +6751,8 @@ static void __sched notrace __schedule(int sched_mode) migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, block); + psi_sched_switch(prev, next, !task_on_rq_queued(prev) || + prev->se.sched_delayed); trace_sched_switch(preempt, prev, next, prev_state); @@ -8180,19 +8184,14 @@ static void cpuset_cpu_active(void) cpuset_update_active_cpus(); } -static int cpuset_cpu_inactive(unsigned int cpu) +static void cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_bw_check_overflow(cpu); - - if (ret) - return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); } - return 0; } static inline void sched_smt_present_inc(int cpu) @@ -8254,6 +8253,11 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); int ret; + ret = dl_bw_deactivate(cpu); + + if (ret) + return ret; + /* * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active @@ -8299,15 +8303,7 @@ int sched_cpu_deactivate(unsigned int cpu) return 0; sched_update_numa(cpu, false); - ret = cpuset_cpu_inactive(cpu); - if (ret) { - sched_smt_present_inc(cpu); - sched_set_rq_online(rq, cpu); - balance_push_set(cpu, false); - set_cpu_active(cpu, true); - sched_update_numa(cpu, true); - return ret; - } + cpuset_cpu_inactive(cpu); sched_domains_numa_masks_clear(cpu); return 0; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0bed0fa1acd9..5d9143dd0879 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -9,6 +9,8 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); + /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -22,16 +24,14 @@ */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static int sched_clock_irqtime; - void enable_sched_clock_irqtime(void) { - sched_clock_irqtime = 1; + static_branch_enable(&sched_clock_irqtime); } void disable_sched_clock_irqtime(void) { - sched_clock_irqtime = 0; + static_branch_disable(&sched_clock_irqtime); } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, @@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) s64 delta; int cpu; - if (!sched_clock_irqtime) + if (!irqtime_enabled()) return; cpu = smp_processor_id(); @@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime) #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -#define sched_clock_irqtime (0) - static u64 irqtime_tick_accounted(u64 dummy) { return 0; @@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_process_tick(p, user_tick, 1); return; } @@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks) { u64 cputime, steal; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_idle_ticks(ticks); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d94f2ed6d1f4..62192ac79c30 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + static void dl_change_utilization(struct task_struct *p, u64 new_bw) { WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); @@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { - if (!dl_server(dl_se)) - put_task_struct(dl_task_of(dl_se)); - } + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -2115,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). - * - * If the timer callback was running (hrtimer_try_to_cancel == -1), - * it will eventually call put_task_struct(). */ - if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && - !dl_server(&p->dl)) - put_task_struct(p); + cancel_replenish_timer(&p->dl); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -2289,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); rq_unlock(rq, &rf); @@ -2506,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); - -next_node: - if (next_node) { + while (next_node) { p = __node_2_pdl(next_node); if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -2964,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p) void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_server bandwidth is only restored when CPUs are attached to root + * domains (after domains are created or CPUs moved back to the + * default root doamin). + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + rd->dl_bw.total_bw += dl_se->dl_bw; + } } #endif /* CONFIG_SMP */ @@ -3029,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); /* * In case a task is setscheduled to SCHED_DEADLINE we need to keep @@ -3453,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, } enum dl_bw_request { - dl_bw_req_check_overflow = 0, + dl_bw_req_deactivate = 0, dl_bw_req_alloc, dl_bw_req_free }; static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (req == dl_bw_req_free) { + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); - } else { - unsigned long cap = dl_bw_capacity(cpu); - + break; + case dl_bw_req_alloc: overflow = __dl_overflow(dl_b, cap, 0, dl_bw); - if (req == dl_bw_req_alloc && !overflow) { + if (!overflow) { /* * We reserve space in the destination * root_domain, as we can't fail after this point. @@ -3484,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) */ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3492,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) return overflow ? -EBUSY : 0; } -int dl_bw_check_overflow(int cpu) +int dl_bw_deactivate(int cpu) { - return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); } int dl_bw_alloc(int cpu, u64 dl_bw) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a1be00a988bf..fd7e85220715 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_running) { + if (rq->cfs.h_nr_queued) { update_rq_clock(rq); dl_server_stop(&rq->fair_server); } @@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_running) + if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); } @@ -843,13 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); - SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", - cfs_rq->idle_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", - cfs_rq->idle_h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", @@ -1295,8 +1292,10 @@ void resched_latency_warn(int cpu, u64 latency) { static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); - WARN(__ratelimit(&latency_check_ratelimit), - "sched: CPU %d need_resched set for > %llu ns (%d ticks) " - "without schedule\n", - cpu, latency, cpu_rq(cpu)->ticks_without_resched); + if (likely(!__ratelimit(&latency_check_ratelimit))) + return; + + pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); + dump_stack(); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26958431deb7..1e78caa21436 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,8 @@ #include +#include + #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -523,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -532,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -910,7 +913,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles. */ - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) @@ -1245,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { @@ -2126,7 +2129,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -3677,9 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; - if (se_is_idle(se)) - cfs_rq->idle_nr_running++; + cfs_rq->nr_queued++; } static void @@ -3692,9 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; - if (se_is_idle(se)) - cfs_rq->idle_nr_running--; + cfs_rq->nr_queued--; } /* @@ -5128,7 +5127,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return !cfs_rq->nr_running; + return !cfs_rq->nr_queued; } #define UPDATE_TG 0x0 @@ -5166,6 +5165,22 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ +void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_entity *se = &p->se; + + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + se->custom_slice = 1; + se->slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + se->custom_slice = 0; + se->slice = sysctl_sched_base_slice; + } +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5184,7 +5199,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5277,8 +5292,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -static inline bool cfs_bandwidth_used(void); - static void requeue_delayed_entity(struct sched_entity *se); @@ -5300,7 +5313,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5333,7 +5346,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); @@ -5375,7 +5388,7 @@ static void set_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_delayed++; + cfs_rq->h_nr_runnable--; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5387,7 +5400,7 @@ static void clear_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_delayed--; + cfs_rq->h_nr_runnable++; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5404,6 +5417,7 @@ static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; update_curr(cfs_rq); clear_buddies(cfs_rq, se); @@ -5429,7 +5443,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } - int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH; @@ -5437,7 +5450,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -5475,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); return true; @@ -5537,17 +5550,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); static struct sched_entity * pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { + struct sched_entity *se; + /* - * Enabling NEXT_BUDDY will affect latency but not fairness. + * Picking the ->next buddy will affect latency but not fairness. */ - if (sched_feat(NEXT_BUDDY) && + if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ SCHED_WARN_ON(cfs_rq->next->sched_delayed); return cfs_rq->next; } - struct sched_entity *se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* @@ -5823,7 +5838,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) list_del_leaf_cfs_rq(cfs_rq); SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) + if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -5836,8 +5851,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta, dequeue = 1; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5867,9 +5882,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5889,11 +5904,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5912,18 +5927,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); done: /* @@ -5932,7 +5947,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; SCHED_WARN_ON(cfs_rq->throttled_clock); - if (cfs_rq->nr_running) + if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5942,8 +5957,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5976,9 +5991,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -5992,11 +6007,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6010,11 +6025,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6022,17 +6037,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_running && rq->cfs.h_nr_running) + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) + if (rq->curr == rq->idle && rq->cfs.nr_queued) resched_curr(rq); } @@ -6333,7 +6348,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -6604,6 +6619,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + /* * The rq clock has already been updated in the * set_rq_offline(), so we should skip updating @@ -6618,19 +6637,21 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (!cfs_rq->runtime_enabled) continue; - /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); @@ -6679,11 +6700,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* CONFIG_CFS_BANDWIDTH */ -static inline bool cfs_bandwidth_used(void) -{ - return false; -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -6741,7 +6757,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) SCHED_WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 slice = se->slice; s64 delta = slice - ran; @@ -6829,7 +6845,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { } /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -6856,14 +6872,14 @@ requeue_delayed_entity(struct sched_entity *se) if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); if (se->vlag > 0) { - cfs_rq->nr_running--; + cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; place_entity(cfs_rq, se, 0); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } } @@ -6881,10 +6897,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); - int h_nr_delayed = 0; + int h_nr_idle = task_has_idle_policy(p); + int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; /* @@ -6909,8 +6925,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); - if (task_new) - h_nr_delayed = !!se->sched_delayed; + if (task_new && se->sched_delayed) + h_nr_runnable = 0; for_each_sched_entity(se) { if (se->on_rq) { @@ -6932,12 +6948,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6956,19 +6972,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; } - if (!rq_h_nr_running && rq->cfs.h_nr_running) { + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { /* Account for idle runtime */ if (!rq->nr_running) dl_server_update_idle_time(rq, rq->curr); @@ -7015,22 +7031,22 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; - int idle_h_nr_running = 0; - int h_nr_running = 0; - int h_nr_delayed = 0; + int h_nr_idle = 0; + int h_nr_queued = 0; + int h_nr_runnable = 0; struct cfs_rq *cfs_rq; u64 slice = 0; if (entity_is_task(se)) { p = task_of(se); - h_nr_running = 1; - idle_h_nr_running = task_has_idle_policy(p); - if (!task_sleep && !task_delayed) - h_nr_delayed = !!se->sched_delayed; + h_nr_queued = 1; + h_nr_idle = task_has_idle_policy(p); + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; } else { cfs_rq = group_cfs_rq(se); slice = cfs_rq_min_slice(cfs_rq); @@ -7046,12 +7062,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) break; } - cfs_rq->h_nr_running -= h_nr_running; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7085,21 +7101,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running -= h_nr_running; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) return 0; } - sub_nr_running(rq, h_nr_running); + sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ @@ -8788,7 +8804,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) again: cfs_rq = &rq->cfs; - if (!cfs_rq->nr_running) + if (!cfs_rq->nr_queued) return NULL; do { @@ -8905,7 +8921,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) { - return !!dl_se->rq->cfs.nr_running; + return !!dl_se->rq->cfs.nr_queued; } static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) @@ -9236,43 +9252,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. */ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -9283,37 +9299,77 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif +/* + * Check whether the task is ineligible on the destination cpu + * + * When the PLACE_LAG scheduling feature is enabled and + * dst_cfs_rq->nr_queued is greater than 1, if the task + * is ineligible, it will also be ineligible when + * it is migrated to the destination cpu. + */ +static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu) +{ + struct cfs_rq *dst_cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; +#else + dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; +#endif + if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && + !entity_eligible(task_cfs_rq(p), &p->se)) + return 1; + + return 0; +} + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* + * We want to prioritize the migration of eligible tasks. + * For ineligible tasks we soft-limit them and only allow + * them to migrate when nr_balance_failed is non-zero to + * avoid load-balancing trying very hard to balance the load. + */ + if (!env->sd->nr_balance_failed && + task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) + return 0; + /* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0; @@ -9369,16 +9425,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->flags & LBF_ACTIVE_LB) return 1; - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->stats.nr_forced_migrations); - } + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) + p->sched_task_hot = 1; return 1; } @@ -9393,6 +9448,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9553,6 +9614,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } @@ -9695,7 +9759,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) @@ -10227,7 +10291,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_running != 1) + if (rq->cfs.h_nr_runnable != 1) return false; return check_cpu_capacity(rq, sd); @@ -10249,7 +10313,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *sg_overloaded, bool *sg_overutilized) { - int i, nr_running, local_group; + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; memset(sgs, 0, sizeof(*sgs)); @@ -10262,21 +10327,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; - if (nr_running > 1) - *sg_overloaded = 1; - if (cpu_overutilized(i)) *sg_overutilized = 1; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -10286,10 +10344,21 @@ static inline void update_sg_lb_stats(struct lb_env *env, continue; } + /* Overload indicator is only updated at root domain */ + if (balancing_at_rd && nr_running > 1) + *sg_overloaded = 1; + +#ifdef CONFIG_NUMA_BALANCING + /* Only fbq_classify_group() uses this to classify NUMA groups */ + if (sd_flags & SD_NUMA) { + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; + } +#endif if (local_group) continue; - if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + if (sd_flags & SD_ASYM_CPUCAPACITY) { /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -10577,7 +10646,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -11359,7 +11428,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_running; + nr_running = rq->cfs.h_nr_runnable; if (!nr_running) continue; @@ -11518,7 +11587,7 @@ static int need_active_balance(struct lb_env *env) * available on dst_cpu. */ if (env->idle && - (env->src_rq->cfs.h_nr_running == 1)) { + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -11598,6 +11667,28 @@ static int should_we_balance(struct lb_env *env) return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -11648,7 +11739,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -12146,16 +12237,13 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set - * anywhere yet. */ static inline int find_new_ilb(void) { const struct cpumask *hk_mask; int ilb_cpu; - hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { @@ -12173,7 +12261,8 @@ static inline int find_new_ilb(void) * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * - * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -12261,7 +12350,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12393,10 +12482,6 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -12616,13 +12701,6 @@ static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; - /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping - */ - if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) - return; - /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; @@ -12759,11 +12837,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) pulled_task = -1; out: @@ -12784,9 +12862,9 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) /* * This softirq handler is triggered via SCHED_SOFTIRQ from two places: * - * - directly from the local scheduler_tick() for periodic load balancing + * - directly from the local sched_tick() for periodic load balancing * - * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * - indirectly from a remote sched_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void sched_balance_softirq(void) @@ -12877,7 +12955,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. */ - if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } @@ -13021,7 +13099,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* @@ -13431,7 +13509,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -13442,16 +13520,8 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; - if (se->on_rq) { - parent_cfs_rq = cfs_rq_of(se); - if (cfs_rq_is_idle(grp_cfs_rq)) - parent_cfs_rq->idle_nr_running++; - else - parent_cfs_rq->idle_nr_running--; - } - - idle_task_delta = grp_cfs_rq->h_nr_running - - grp_cfs_rq->idle_h_nr_running; + idle_task_delta = grp_cfs_rq->h_nr_queued - + grp_cfs_rq->h_nr_idle; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; @@ -13461,7 +13531,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (!se->on_rq) break; - cfs_rq->idle_h_nr_running += idle_task_delta; + cfs_rq->h_nr_idle += idle_task_delta; /* Already accounted at parent level and above. */ if (cfs_rq_is_idle(cfs_rq)) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a3d331dd2d8f..3c12d9f93331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -31,6 +31,15 @@ SCHED_FEAT(PREEMPT_SHORT, true) */ SCHED_FEAT(NEXT_BUDDY, false) +/* + * Allow completely ignoring cfs_rq->next; which can be set from various + * places: + * - NEXT_BUDDY (wakeup preemption) + * - yield_to_task() + * - cgroup dequeue / pick + */ +SCHED_FEAT(PICK_BUDDY, true) + /* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5891e715f00d..81bc8b329ef1 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -9,15 +9,9 @@ */ enum hk_flags { - HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), - HK_FLAG_RCU = BIT(HK_TYPE_RCU), - HK_FLAG_MISC = BIT(HK_TYPE_MISC), - HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), - HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), - HK_FLAG_WQ = BIT(HK_TYPE_WQ), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), - HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), + HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); @@ -97,7 +91,7 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overridden); - if (housekeeping.flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { @@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) unsigned int first_cpu; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz unsupported." " Build with CONFIG_NO_HZ_FULL\n"); @@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) tick_nohz_full_setup(non_housekeeping_mask); housekeeping.flags |= flags; @@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | - HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags = HK_FLAG_KERNEL_NOISE; return housekeeping_setup(str, flags); } @@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str) int len; while (isalpha(*str)) { + /* + * isolcpus=nohz is equivalent to nohz_full. + */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK; + flags |= HK_FLAG_KERNEL_NOISE; continue; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fee75cc2c47b..7a8534a2deff 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_runnable * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, + cfs_rq->h_nr_runnable, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 84dad1511d1e..bb56805e3d47 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st s64 delta; u64 irq; - if (static_branch_likely(&psi_disabled)) + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; if (!curr->pid) @@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c5d67a43fe52..c7cf4cc57cdd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); +extern int dl_bw_deactivate(int cpu); extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following @@ -650,11 +650,10 @@ struct balance_callback { /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ - unsigned int h_nr_delayed; + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; @@ -904,11 +903,8 @@ struct dl_rq { static inline void se_update_runnable(struct sched_entity *se) { - if (!entity_is_task(se)) { - struct cfs_rq *cfs_rq = se->my_q; - - se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; - } + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) @@ -2280,7 +2276,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -2574,7 +2570,7 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); @@ -3242,6 +3238,12 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); + +static inline int irqtime_enabled(void) +{ + return static_branch_likely(&sched_clock_irqtime); +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. @@ -3262,6 +3264,13 @@ static inline u64 irq_time_read(int cpu) return total; } +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3509,6 +3518,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index eb0cdcd4d921..4346fd81c31f 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 16 +#define SCHEDSTAT_VERSION 17 static int show_schedstat(struct seq_file *seq, void *v) { @@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - seq_printf(seq, "domain%d %*pb", dcount++, + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, cpumask_pr_args(sched_domain_span(sd))); for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], - sd->lb_imbalance[itype], + sd->lb_imbalance_load[itype], + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8ee0add5a48a..6ade91bce63e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -138,6 +138,10 @@ static inline void psi_enqueue(struct task_struct *p, int flags) if (flags & ENQUEUE_RESTORE) return; + /* psi_sched_switch() will handle the flags */ + if (task_on_cpu(task_rq(p), p)) + return; + if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ff0e5ab4e37c..149e2c8036d3 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -300,20 +300,10 @@ static void __setscheduler_params(struct task_struct *p, p->policy = policy; - if (dl_policy(policy)) { + if (dl_policy(policy)) __setparam_dl(p, attr); - } else if (fair_policy(policy)) { - p->static_prio = NICE_TO_PRIO(attr->sched_nice); - if (attr->sched_runtime) { - p->se.custom_slice = 1; - p->se.slice = clamp_t(u64, attr->sched_runtime, - NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ - NSEC_PER_MSEC*100); /* HZ=100 / 10 */ - } else { - p->se.custom_slice = 0; - p->se.slice = sysctl_sched_base_slice; - } - } + else if (fair_policy(policy)) + __setparam_fair(p, attr); /* rt-policy tasks do not have a timerslack */ if (rt_or_dl_task_policy(p)) { @@ -1433,7 +1423,7 @@ int __sched yield_to(struct task_struct *p, bool preempt) struct rq *rq, *p_rq; int yielded = 0; - scoped_guard (irqsave) { + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); again: diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9748a4c8d668..da33ec9e94ab 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -2721,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], /* * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. It - * will be recomputed in function - * update_tasks_root_domain(). + * its dl_bw->total_bw needs to be cleared. + * Tasks contribution will be then recomputed + * in function dl_update_tasks_root_domain(), + * dl_servers contribution in function + * dl_restore_server_root_domain(). */ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; dl_clear_root_domain(rd);