mirror of
https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-22 00:09:11 +03:00
Scheduler enhancements for v6.14:

 - Fair scheduler (SCHED_FAIR) enhancements:

    - Behavioral improvements:
       - Untangle NEXT_BUDDY and pick_next_task() (Peter Zijlstra)

    - Delayed-dequeue enhancements & fixes: (Vincent Guittot)
       - Rename h_nr_running into h_nr_queued
       - Add new cfs_rq.h_nr_runnable
       - Use the new cfs_rq.h_nr_runnable
       - Remove unused cfs_rq.h_nr_delayed
       - Rename cfs_rq.idle_h_nr_running into h_nr_idle
       - Remove unused cfs_rq.idle_nr_running
       - Rename cfs_rq.nr_running into nr_queued
       - Do not try to migrate delayed dequeue task
       - Fix variable declaration position
       - Encapsulate set custom slice in a __setparam_fair() function

    - Fixes:
       - Fix race between yield_to() and try_to_wake_up() (Tianchen Ding)
       - Fix CPU bandwidth limit bypass during CPU hotplug (Vishal Chourasia)

    - Cleanups:
       - Clean up in migrate_degrades_locality() to improve readability (Peter Zijlstra)
       - Mark m*_vruntime() with __maybe_unused (Andy Shevchenko)
       - Update comments after sched_tick() rename (Sebastian Andrzej Siewior)
       - Remove CONFIG_CFS_BANDWIDTH=n definition of cfs_bandwidth_used() (Valentin Schneider)

 - Deadline scheduler (SCHED_DL) enhancements:
    - Restore dl_server bandwidth on non-destructive root domain changes (Juri Lelli)
    - Correctly account for allocated bandwidth during hotplug (Juri Lelli)
    - Check bandwidth overflow earlier for hotplug (Juri Lelli)
    - Clean up goto label in pick_earliest_pushable_dl_task() (John Stultz)
    - Consolidate timer cancellation (Wander Lairson Costa)

 - Load-balancer enhancements:
    - Improve performance by prioritizing migrating eligible tasks in sched_balance_rq() (Hao Jia)
    - Do not compute NUMA Balancing stats unnecessarily during load-balancing (K Prateek Nayak)
    - Do not compute overloaded status unnecessarily during load-balancing (K Prateek Nayak)

 - Generic scheduling code enhancements:
    - Use READ_ONCE() in task_on_rq_queued(), to consistently use the WRITE_ONCE() updated ->on_rq field (Harshit Agarwal)

 - Isolated CPUs support enhancements: (Waiman Long)
    - Make "isolcpus=nohz" equivalent to "nohz_full"
    - Consolidate housekeeping cpumasks that are always identical
    - Remove HK_TYPE_SCHED
    - Unify HK_TYPE_{TIMER|TICK|MISC} to HK_TYPE_KERNEL_NOISE

 - RSEQ enhancements:
    - Validate read-only fields under DEBUG_RSEQ config (Mathieu Desnoyers)

 - PSI enhancements:
    - Fix race when task wakes up before psi_sched_switch() adjusts flags (Chengming Zhou)

 - IRQ time accounting performance enhancements: (Yafang Shao)
    - Define sched_clock_irqtime as static key
    - Don't account irq time if sched_clock_irqtime is disabled

 - Virtual machine scheduling enhancements:
    - Don't try to catch up excess steal time (Suleiman Souhlal)

 - Heterogeneous x86 CPU scheduling enhancements: (K Prateek Nayak)
    - Convert "sysctl_sched_itmt_enabled" to boolean
    - Use guard() for itmt_update_mutex
    - Move the "sched_itmt_enabled" sysctl to debugfs
    - Remove x86_smt_flags and use cpu_smt_flags directly
    - Use x86_sched_itmt_flags for PKG domain unconditionally

 - Debugging code & instrumentation enhancements:
    - Change need_resched warnings to pr_err() (David Rientjes)
    - Print domain name in /proc/schedstat (K Prateek Nayak)
    - Fix value reported by hot tasks pulled in /proc/schedstat (Peter Zijlstra)
    - Report the different kinds of imbalances in /proc/schedstat (Swapnil Sapkal)
    - Move sched domain name out of CONFIG_SCHED_DEBUG (Swapnil Sapkal)
    - Update Schedstat version to 17 (Swapnil Sapkal)

Signed-off-by: Ingo Molnar <mingo@kernel.org>

-----BEGIN PGP SIGNATURE-----

iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmePSRcRHG1pbmdvQGtl
cm5lbC5vcmcACgkQEnMQ0APhK1hrdBAAjYiLl5Q8SHM0xnl+kbvuUkCTgEB/gSgA
mfrZtHRUgRZuA89NZ9NljlCkQSlsLTOjnpNuaeFzs529GMg9iemc99dbnz3BP5F3
V5qpYvWe7yIkJ3hd0TOGLmYEPMNQaAW57YBOrxcPjWNLJ4cr9iMdccVA1OQtcmqD
ZUh3nibv81QI8HDmT2G+figxEIqH3yBV1+SmEIxbrdkQpIJ5702Ng6+0KQK5TShN
xwjFELWZUl2TfkoCc4nkIpkImV6cI1DvXSw1xK6gbb1xEVOrsmFW3TYFw4trKHBu
2RBG4wtmzNjh+12GmSdIBJHogPNcay+JIJW9EG/unT7jirqzkkeP1X2eJEbh+X1L
CMa7GsD9Vy72jCzeJDMuiy7bKfG/MiKUtDXrAZQDo2atbw7H88QOzMuTE5a5WSV+
tRxXGI/dgFVOk+JQUfctfJbYeXjmG8GAflawvXtGDAfDZsja6M+65fH8p0AOgW1E
HHmXUzAe2E2xQBiSok/DYHPQeCDBAjoJvU93YhGiXv8UScb2UaD4BAfzfmc8P+Zs
Eox6444ah5U0jiXmZ3HU707n1zO+Ql4qKoyyMJzSyP+oYHE/Do7NYTElw2QovVdN
FX/9Uae8T4ttA/5lFe7FNoXgKvSxXDKYyKLZcysjVrWJF866Ui/TWtmxA6w8Osn7
sfucuLawLPM=
=5ZNW
-----END PGP SIGNATURE-----

Merge tag 'sched-core-2025-01-21' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

* tag 'sched-core-2025-01-21' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (48 commits)
  rseq: Fix rseq unregistration regression
  psi: Fix race when task wakes up before psi_sched_switch() adjusts flags
  sched, psi: Don't account irq time if sched_clock_irqtime is disabled
  sched: Don't account irq time if sched_clock_irqtime is disabled
  sched: Define sched_clock_irqtime as static key
  sched/fair: Do not compute overloaded status unnecessarily during lb
  sched/fair: Do not compute NUMA Balancing stats unnecessarily during lb
  x86/topology: Use x86_sched_itmt_flags for PKG domain unconditionally
  x86/topology: Remove x86_smt_flags and use cpu_smt_flags directly
  x86/itmt: Move the "sched_itmt_enabled" sysctl to debugfs
  x86/itmt: Use guard() for itmt_update_mutex
  x86/itmt: Convert "sysctl_sched_itmt_enabled" to boolean
  sched/core: Prioritize migrating eligible tasks in sched_balance_rq()
  sched/debug: Change need_resched warnings to pr_err
  sched/fair: Encapsulate set custom slice in a __setparam_fair() function
  sched: Fix race between yield_to() and try_to_wake_up()
  docs: Update Schedstat version to 17
  sched/stats: Print domain name in /proc/schedstat
  sched: Move sched domain name out of CONFIG_SCHED_DEBUG
  sched: Report the different kinds of imbalances in /proc/schedstat
  ...
commit 62de6e1685
@@ -2506,7 +2506,9 @@

			specified in the flag list (default: domain):

			nohz
			  Disable the tick when a single task runs.
			  Disable the tick when a single task runs as well as
			  disabling other kernel noises like having RCU callbacks
			  offloaded. This is equivalent to the nohz_full parameter.

			  A residual 1Hz tick is offloaded to workqueues, which you
			  need to affine to housekeeping through the global
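For illustration only (the CPU list below is made up, not part of the patch): after this change a boot command line such as

	isolcpus=nohz,domain,2-7

requests the same kernel-noise housekeeping exclusion for CPUs 2-7 that was previously only available through

	nohz_full=2-7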
@@ -2,6 +2,12 @@

Scheduler Statistics
====================

Version 17 of schedstats removed 'lb_imbalance' field as it has no
significance anymore and instead added more relevant fields namely
'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and
'lb_imbalance_misfit'. The domain field prints the name of the
corresponding sched domain from this version onwards.

Version 16 of schedstats changed the order of definitions within
'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES]
columns in show_schedstat(). In particular the position of CPU_IDLE
@@ -9,7 +15,9 @@ and __CPU_NOT_IDLE changed places. The size of the array is unchanged.

Version 15 of schedstats dropped counters for some sched_yield:
yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is
identical to version 14.
identical to version 14. Details are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-stats.txt?id=1e1dbb259c79b

Version 14 of schedstats includes support for sched_domains, which hit the
mainline kernel in 2.6.20 although it is identical to the stats from version
@@ -26,7 +34,14 @@ cpus on the machine, while domain0 is the most tightly focused domain,
sometimes balancing only between pairs of cpus. At this time, there
are no architectures which need more than three domain levels. The first
field in the domain stats is a bit map indicating which cpus are affected
by that domain.
by that domain. Details are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=b762f3ffb797c

The schedstat documentation is maintained version 10 onwards and is not
updated for version 11 and 12. The details for version 10 are available at

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=1da177e4c3f4

These fields are counters, and only increment. Programs which make use
of these will need to start with a baseline observation and then calculate
@@ -71,88 +86,97 @@ Domain statistics
-----------------
One of these is produced per domain for each cpu described. (Note that if
CONFIG_SMP is not defined, *no* domains are utilized and these lines
will not appear in the output.)
will not appear in the output. <name> is an extension to the domain field
that prints the name of the corresponding sched domain. It can appear in
schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.)

domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45

The first field is a bit mask indicating what cpus this domain operates over.

The next 24 are a variety of sched_balance_rq() statistics in grouped into types
of idleness (idle, busy, and newly idle):
The next 33 are a variety of sched_balance_rq() statistics in grouped into types
of idleness (busy, idle and newly idle):

    1)  # of times in this domain sched_balance_rq() was called when the
        cpu was idle
    2)  # of times in this domain sched_balance_rq() checked but found
        the load did not require balancing when the cpu was idle
    3)  # of times in this domain sched_balance_rq() tried to move one or
        more tasks and failed, when the cpu was idle
    4)  sum of imbalances discovered (if any) with each call to
        sched_balance_rq() in this domain when the cpu was idle
    5)  # of times in this domain pull_task() was called when the cpu
        was idle
    6)  # of times in this domain pull_task() was called even though
        the target task was cache-hot when idle
    7)  # of times in this domain sched_balance_rq() was called but did
        not find a busier queue while the cpu was idle
    8)  # of times in this domain a busier queue was found while the
        cpu was idle but no busier group was found
    9)  # of times in this domain sched_balance_rq() was called when the
        cpu was busy
    10) # of times in this domain sched_balance_rq() checked but found the
    2)  # of times in this domain sched_balance_rq() checked but found the
        load did not require balancing when busy
    11) # of times in this domain sched_balance_rq() tried to move one or
    3)  # of times in this domain sched_balance_rq() tried to move one or
        more tasks and failed, when the cpu was busy
    12) sum of imbalances discovered (if any) with each call to
        sched_balance_rq() in this domain when the cpu was busy
    13) # of times in this domain pull_task() was called when busy
    14) # of times in this domain pull_task() was called even though the
    4)  Total imbalance in load when the cpu was busy
    5)  Total imbalance in utilization when the cpu was busy
    6)  Total imbalance in number of tasks when the cpu was busy
    7)  Total imbalance due to misfit tasks when the cpu was busy
    8)  # of times in this domain pull_task() was called when busy
    9)  # of times in this domain pull_task() was called even though the
        target task was cache-hot when busy
    15) # of times in this domain sched_balance_rq() was called but did not
    10) # of times in this domain sched_balance_rq() was called but did not
        find a busier queue while the cpu was busy
    16) # of times in this domain a busier queue was found while the cpu
    11) # of times in this domain a busier queue was found while the cpu
        was busy but no busier group was found

    17) # of times in this domain sched_balance_rq() was called when the
        cpu was just becoming idle
    18) # of times in this domain sched_balance_rq() checked but found the
    12) # of times in this domain sched_balance_rq() was called when the
        cpu was idle
    13) # of times in this domain sched_balance_rq() checked but found
        the load did not require balancing when the cpu was idle
    14) # of times in this domain sched_balance_rq() tried to move one or
        more tasks and failed, when the cpu was idle
    15) Total imbalance in load when the cpu was idle
    16) Total imbalance in utilization when the cpu was idle
    17) Total imbalance in number of tasks when the cpu was idle
    18) Total imbalance due to misfit tasks when the cpu was idle
    19) # of times in this domain pull_task() was called when the cpu
        was idle
    20) # of times in this domain pull_task() was called even though
        the target task was cache-hot when idle
    21) # of times in this domain sched_balance_rq() was called but did
        not find a busier queue while the cpu was idle
    22) # of times in this domain a busier queue was found while the
        cpu was idle but no busier group was found

    23) # of times in this domain sched_balance_rq() was called when the
        was just becoming idle
    24) # of times in this domain sched_balance_rq() checked but found the
        load did not require balancing when the cpu was just becoming idle
    19) # of times in this domain sched_balance_rq() tried to move one or more
    25) # of times in this domain sched_balance_rq() tried to move one or more
        tasks and failed, when the cpu was just becoming idle
    20) sum of imbalances discovered (if any) with each call to
        sched_balance_rq() in this domain when the cpu was just becoming idle
    21) # of times in this domain pull_task() was called when newly idle
    22) # of times in this domain pull_task() was called even though the
    26) Total imbalance in load when the cpu was just becoming idle
    27) Total imbalance in utilization when the cpu was just becoming idle
    28) Total imbalance in number of tasks when the cpu was just becoming idle
    29) Total imbalance due to misfit tasks when the cpu was just becoming idle
    30) # of times in this domain pull_task() was called when newly idle
    31) # of times in this domain pull_task() was called even though the
        target task was cache-hot when just becoming idle
    23) # of times in this domain sched_balance_rq() was called but did not
    32) # of times in this domain sched_balance_rq() was called but did not
        find a busier queue while the cpu was just becoming idle
    24) # of times in this domain a busier queue was found while the cpu
    33) # of times in this domain a busier queue was found while the cpu
        was just becoming idle but no busier group was found

Next three are active_load_balance() statistics:

    25) # of times active_load_balance() was called
    26) # of times active_load_balance() tried to move a task and failed
    27) # of times active_load_balance() successfully moved a task
    34) # of times active_load_balance() was called
    35) # of times active_load_balance() tried to move a task and failed
    36) # of times active_load_balance() successfully moved a task

Next three are sched_balance_exec() statistics:

    28) sbe_cnt is not used
    29) sbe_balanced is not used
    30) sbe_pushed is not used
    37) sbe_cnt is not used
    38) sbe_balanced is not used
    39) sbe_pushed is not used

Next three are sched_balance_fork() statistics:

    31) sbf_cnt is not used
    32) sbf_balanced is not used
    33) sbf_pushed is not used
    40) sbf_cnt is not used
    41) sbf_balanced is not used
    42) sbf_pushed is not used

Next three are try_to_wake_up() statistics:

    34) # of times in this domain try_to_wake_up() awoke a task that
    43) # of times in this domain try_to_wake_up() awoke a task that
        last ran on a different cpu in this domain
    35) # of times in this domain try_to_wake_up() moved a task to the
    44) # of times in this domain try_to_wake_up() moved a task to the
        waking cpu because it was cache-cold on its own cpu anyway
    36) # of times in this domain try_to_wake_up() started passive balancing
    45) # of times in this domain try_to_wake_up() started passive balancing

/proc/<pid>/schedstat
---------------------
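The layout above is easy to consume from user space. Below is a minimal reader sketch (not part of the patch; written against the version 17 layout described in this document) that prints the schedstat version and the new per-domain <name> field for each CPU; the individual counters are left out.

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[4096], cpu[32] = "";
		FILE *f = fopen("/proc/schedstat", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			if (!strncmp(line, "version", 7)) {
				fputs(line, stdout);		/* e.g. "version 17" */
			} else if (!strncmp(line, "cpu", 3)) {
				sscanf(line, "%31s", cpu);	/* remember current cpu<N> */
			} else if (!strncmp(line, "domain", 6)) {
				char dom[32], name[64];
				/* version 17 layout: domain<N> <name> <cpumask> <counters...> */
				if (sscanf(line, "%31s %63s", dom, name) == 2)
					printf("%s %s %s\n", cpu, dom, name);
			}
		}
		fclose(f);
		return 0;
	}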
@@ -251,7 +251,7 @@ extern bool x86_topology_update;

#include <asm/percpu.h>

DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
extern bool __read_mostly sysctl_sched_itmt_enabled;

/* Interface to set priority of a cpu */
void sched_set_itmt_core_prio(int prio, int core_cpu);
@@ -264,7 +264,7 @@ void sched_clear_itmt_support(void);

#else /* CONFIG_SCHED_MC_PRIO */

#define sysctl_sched_itmt_enabled 0
#define sysctl_sched_itmt_enabled false
static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
{
}
@@ -19,6 +19,7 @@

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/debugfs.h>
#include <linux/mutex.h>
#include <linux/sysctl.h>
#include <linux/nodemask.h>
@@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable;
 * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
 * Technology 3.0.
 *
 * It can be set via /proc/sys/kernel/sched_itmt_enabled
 * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled
 */
unsigned int __read_mostly sysctl_sched_itmt_enabled;
bool __read_mostly sysctl_sched_itmt_enabled;

static int sched_itmt_update_handler(const struct ctl_table *table, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
static ssize_t sched_itmt_enabled_write(struct file *filp,
					const char __user *ubuf,
					size_t cnt, loff_t *ppos)
{
	unsigned int old_sysctl;
	int ret;
	ssize_t result;
	bool orig;

	mutex_lock(&itmt_update_mutex);
	guard(mutex)(&itmt_update_mutex);

	if (!sched_itmt_capable) {
		mutex_unlock(&itmt_update_mutex);
		return -EINVAL;
	}
	orig = sysctl_sched_itmt_enabled;
	result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);

	old_sysctl = sysctl_sched_itmt_enabled;
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
	if (sysctl_sched_itmt_enabled != orig) {
		x86_topology_update = true;
		rebuild_sched_domains();
	}

	mutex_unlock(&itmt_update_mutex);

	return ret;
	return result;
}

static struct ctl_table itmt_kern_table[] = {
	{
		.procname	= "sched_itmt_enabled",
		.data		= &sysctl_sched_itmt_enabled,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= sched_itmt_update_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
static const struct file_operations dfs_sched_itmt_fops = {
	.read =		debugfs_read_file_bool,
	.write =	sched_itmt_enabled_write,
	.open =		simple_open,
	.llseek =	default_llseek,
};

static struct ctl_table_header *itmt_sysctl_header;
static struct dentry *dfs_sched_itmt;

/**
 * sched_set_itmt_support() - Indicate platform supports ITMT
@@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header;
 */
int sched_set_itmt_support(void)
{
	mutex_lock(&itmt_update_mutex);
	guard(mutex)(&itmt_update_mutex);

	if (sched_itmt_capable) {
		mutex_unlock(&itmt_update_mutex);
	if (sched_itmt_capable)
		return 0;
	}

	itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table);
	if (!itmt_sysctl_header) {
		mutex_unlock(&itmt_update_mutex);
	dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled",
						    0644,
						    arch_debugfs_dir,
						    &sysctl_sched_itmt_enabled,
						    &dfs_sched_itmt_fops);
	if (IS_ERR_OR_NULL(dfs_sched_itmt)) {
		dfs_sched_itmt = NULL;
		return -ENOMEM;
	}

@@ -117,8 +109,6 @@ int sched_set_itmt_support(void)
	x86_topology_update = true;
	rebuild_sched_domains();

	mutex_unlock(&itmt_update_mutex);

	return 0;
}

@@ -134,18 +124,15 @@ int sched_set_itmt_support(void)
 */
void sched_clear_itmt_support(void)
{
	mutex_lock(&itmt_update_mutex);
	guard(mutex)(&itmt_update_mutex);

	if (!sched_itmt_capable) {
		mutex_unlock(&itmt_update_mutex);
	if (!sched_itmt_capable)
		return;
	}

	sched_itmt_capable = false;

	if (itmt_sysctl_header) {
		unregister_sysctl_table(itmt_sysctl_header);
		itmt_sysctl_header = NULL;
	}
	debugfs_remove(dfs_sched_itmt);
	dfs_sched_itmt = NULL;

	if (sysctl_sched_itmt_enabled) {
		/* disable sched_itmt if we are no longer ITMT capable */
@@ -153,8 +140,6 @@ void sched_clear_itmt_support(void)
		x86_topology_update = true;
		rebuild_sched_domains();
	}

	mutex_unlock(&itmt_update_mutex);
}

int arch_asym_cpu_priority(int cpu)
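The conversion above leans on the scope-based cleanup helpers: guard(mutex)() takes the mutex and releases it automatically when the enclosing scope ends, which is why the early-return paths no longer carry explicit mutex_unlock() calls. A minimal sketch of the pattern, with a hypothetical lock and function purely for illustration:

	#include <linux/cleanup.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_mutex);	/* hypothetical lock */
	static bool example_capable;

	static int example_set_support(void)
	{
		/* Unlocked automatically on every return path. */
		guard(mutex)(&example_mutex);

		if (example_capable)
			return 0;	/* no mutex_unlock() needed */

		example_capable = true;
		return 0;
	}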
@@ -483,12 +483,6 @@ static int x86_core_flags(void)
	return cpu_core_flags() | x86_sched_itmt_flags();
}
#endif
#ifdef CONFIG_SCHED_SMT
static int x86_smt_flags(void)
{
	return cpu_smt_flags();
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
static int x86_cluster_flags(void)
{
@@ -496,15 +490,6 @@ static int x86_cluster_flags(void)
}
#endif

static int x86_die_flags(void)
{
	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU) ||
	    cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES))
		return x86_sched_itmt_flags();

	return 0;
}

/*
 * Set if a package/die has multiple NUMA nodes inside.
 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
@@ -520,7 +505,7 @@ static void __init build_sched_topology(void)

#ifdef CONFIG_SCHED_SMT
	x86_topology[i++] = (struct sched_domain_topology_level){
		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
		cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
	};
#endif
#ifdef CONFIG_SCHED_CLUSTER
@@ -540,7 +525,7 @@ static void __init build_sched_topology(void)
	 */
	if (!x86_has_numa_in_package) {
		x86_topology[i++] = (struct sched_domain_topology_level){
			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
			cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
		};
	}
@@ -944,6 +944,7 @@ struct task_struct {
	unsigned			sched_reset_on_fork:1;
	unsigned			sched_contributes_to_load:1;
	unsigned			sched_migrated:1;
	unsigned			sched_task_hot:1;

	/* Force alignment to the next boundary: */
	unsigned			:0;
@@ -1374,6 +1375,15 @@ struct task_struct {
	 * with respect to preemption.
	 */
	unsigned long rseq_event_mask;
# ifdef CONFIG_DEBUG_RSEQ
	/*
	 * This is a place holder to save a copy of the rseq fields for
	 * validation of read-only fields. The struct rseq has a
	 * variable-length array at the end, so it cannot be used
	 * directly. Reserve a size large enough for the known fields.
	 */
	char				rseq_fields[sizeof(struct rseq)];
# endif
#endif

#ifdef CONFIG_SCHED_MM_CID
@@ -7,16 +7,21 @@
#include <linux/tick.h>

enum hk_type {
	HK_TYPE_TIMER,
	HK_TYPE_RCU,
	HK_TYPE_MISC,
	HK_TYPE_SCHED,
	HK_TYPE_TICK,
	HK_TYPE_DOMAIN,
	HK_TYPE_WQ,
	HK_TYPE_MANAGED_IRQ,
	HK_TYPE_KTHREAD,
	HK_TYPE_MAX
	HK_TYPE_KERNEL_NOISE,
	HK_TYPE_MAX,

	/*
	 * The following housekeeping types are only set by the nohz_full
	 * boot commandline option. So they can share the same value.
	 */
	HK_TYPE_TICK		= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_TIMER		= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_RCU		= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_MISC		= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_WQ		= HK_TYPE_KERNEL_NOISE,
	HK_TYPE_KTHREAD		= HK_TYPE_KERNEL_NOISE
};

#ifdef CONFIG_CPU_ISOLATION
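Because the old type names are kept as aliases of HK_TYPE_KERNEL_NOISE, the formerly separate (but always identical) housekeeping cpumasks collapse into one, and existing call sites keep compiling unchanged. A sketch for illustration only (hypothetical helper name):

	#include <linux/sched/isolation.h>

	/* With the aliases above, both lookups resolve to the same
	 * HK_TYPE_KERNEL_NOISE cpumask, so this always returns true. */
	static bool example_same_mask(void)
	{
		return housekeeping_cpumask(HK_TYPE_TIMER) ==
		       housekeeping_cpumask(HK_TYPE_TICK);
	}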
@@ -114,7 +114,10 @@ struct sched_domain {
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES];
	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
@@ -140,9 +143,7 @@ struct sched_domain {
	unsigned int ttwu_move_affine;
	unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
	char *name;
#endif
	union {
		void *private;		/* used during construction */
		struct rcu_head rcu;	/* used during destruction */
@@ -198,18 +199,12 @@ struct sched_domain_topology_level {
	int		    flags;
	int		    numa_level;
	struct sd_data      data;
#ifdef CONFIG_SCHED_DEBUG
	char                *name;
#endif
};

extern void __init set_sched_topology(struct sched_domain_topology_level *tl);

#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(type)		.name = #type
#else
# define SD_INIT_NAME(type)
#endif

#else /* CONFIG_SMP */
@@ -13,6 +13,7 @@
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
@@ -25,6 +26,78 @@
				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)

#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
	return (struct rseq *) t->rseq_fields;
}

static int rseq_validate_ro_fields(struct task_struct *t)
{
	static DEFINE_RATELIMIT_STATE(_rs,
				      DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	u32 cpu_id_start, cpu_id, node_id, mm_cid;
	struct rseq __user *rseq = t->rseq;

	/*
	 * Validate fields which are required to be read-only by
	 * user-space.
	 */
	if (!user_read_access_begin(rseq, t->rseq_len))
		goto efault;
	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
	unsafe_get_user(node_id, &rseq->node_id, efault_end);
	unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
	user_read_access_end();

	if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
	    cpu_id != rseq_kernel_fields(t)->cpu_id ||
	    node_id != rseq_kernel_fields(t)->node_id ||
	    mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {

		pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
			"\tcpu_id_start: %u ?= %u\n"
			"\tcpu_id: %u ?= %u\n"
			"\tnode_id: %u ?= %u\n"
			"\tmm_cid: %u ?= %u\n",
			t->pid, t->comm,
			cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
			cpu_id, rseq_kernel_fields(t)->cpu_id,
			node_id, rseq_kernel_fields(t)->node_id,
			mm_cid, rseq_kernel_fields(t)->mm_cid);
	}

	/* For now, only print a console warning on mismatch. */
	return 0;

efault_end:
	user_read_access_end();
efault:
	return -EFAULT;
}

static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
			       u32 node_id, u32 mm_cid)
{
	rseq_kernel_fields(t)->cpu_id_start = cpu_id;
	rseq_kernel_fields(t)->cpu_id = cpu_id;
	rseq_kernel_fields(t)->node_id = node_id;
	rseq_kernel_fields(t)->mm_cid = mm_cid;
}
#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
	return 0;
}

static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
			       u32 node_id, u32 mm_cid)
{
}
#endif

/*
 *
 * Restartable sequences are a lightweight interface that allows
@@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
	u32 node_id = cpu_to_node(cpu_id);
	u32 mm_cid = task_mm_cid(t);

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		goto efault;
	WARN_ON_ONCE((int) mm_cid < 0);
	if (!user_write_access_begin(rseq, t->rseq_len))
		goto efault;
@@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	user_write_access_end();
	rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
	trace_rseq_update(t);
	return 0;

@@ -119,6 +198,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
	    mm_cid = 0;

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		return -EFAULT;
	/*
	 * Reset cpu_id_start to its initial state (0).
	 */
@@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
	 */
	if (put_user(mm_cid, &t->rseq->mm_cid))
		return -EFAULT;

	rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);

	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally reset only if
@@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
	current->rseq = rseq;
	current->rseq_len = rseq_len;
	current->rseq_sig = sig;
#ifdef CONFIG_DEBUG_RSEQ
	/*
	 * Initialize the in-kernel rseq fields copy for validation of
	 * read-only fields.
	 */
	if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
	    get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
	    get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
	    get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
		return -EFAULT;
#endif
	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
@@ -740,39 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
	if (irqtime_enabled()) {
		irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}IRQ region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}IRQ
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;
		/*
		 * Since irq_time is only updated on {soft,}irq_exit, we might run into
		 * this case when a previous update_rq_clock() happened inside a
		 * {soft,}IRQ region.
		 *
		 * When this happens, we stop ->clock_task and only update the
		 * prev_irq_time stamp to account for the part that fit, so that a next
		 * update will consume the rest. This ensures ->clock_task is
		 * monotonic.
		 *
		 * It does however cause some slight miss-attribution of {soft,}IRQ
		 * time, a more accurate solution would be to update the irq_time using
		 * the current rq->clock timestamp, except that would require using
		 * atomic ops.
		 */
		if (irq_delta > delta)
			irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
	delayacct_irq(rq->curr, irq_delta);
		rq->prev_irq_time += irq_delta;
		delta -= irq_delta;
		delayacct_irq(rq->curr, irq_delta);
	}
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		u64 prev_steal;

		steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		rq->prev_steal_time_rq = prev_steal;
		delta -= steal;
	}
#endif
@@ -1168,13 +1172,13 @@ int get_nohz_timer_target(void)
	struct sched_domain *sd;
	const struct cpumask *hk_mask;

	if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
		if (!idle_cpu(cpu))
			return cpu;
		default_cpu = cpu;
	}

	hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);

	guard(rcu)();

@@ -1189,7 +1193,7 @@ int get_nohz_timer_target(void)
	}

	if (default_cpu == -1)
		default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
		default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);

	return default_cpu;
}
@@ -1341,7 +1345,7 @@ bool sched_can_stop_tick(struct rq *rq)
	if (scx_enabled() && !scx_can_stop_tick(rq))
		return false;

	if (rq->cfs.h_nr_running > 1)
	if (rq->cfs.h_nr_queued > 1)
		return false;

	/*
@@ -5632,7 +5636,7 @@ void sched_tick(void)
	unsigned long hw_pressure;
	u64 resched_latency;

	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
		arch_scale_freq_tick();

	sched_clock_tick();
@@ -5771,7 +5775,7 @@ static void sched_tick_start(int cpu)
	int os;
	struct tick_work *twork;

	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
		return;

	WARN_ON_ONCE(!tick_work_cpu);
@@ -5792,7 +5796,7 @@ static void sched_tick_stop(int cpu)
	struct tick_work *twork;
	int os;

	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
		return;

	WARN_ON_ONCE(!tick_work_cpu);
@@ -6018,7 +6022,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	 * opportunity to pull in more work from other CPUs.
	 */
	if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
		   rq->nr_running == rq->cfs.h_nr_running)) {
		   rq->nr_running == rq->cfs.h_nr_queued)) {

		p = pick_next_task_fair(rq, prev, rf);
		if (unlikely(p == RETRY_TASK))
@@ -6641,7 +6645,6 @@ static void __sched notrace __schedule(int sched_mode)
	 * as a preemption by schedule_debug() and RCU.
	 */
	bool preempt = sched_mode > SM_NONE;
	bool block = false;
	unsigned long *switch_count;
	unsigned long prev_state;
	struct rq_flags rf;
@@ -6702,7 +6705,7 @@ static void __sched notrace __schedule(int sched_mode)
			goto picked;
		}
	} else if (!preempt && prev_state) {
		block = try_to_block_task(rq, prev, prev_state);
		try_to_block_task(rq, prev, prev_state);
		switch_count = &prev->nvcsw;
	}

@@ -6748,7 +6751,8 @@ static void __sched notrace __schedule(int sched_mode)

		migrate_disable_switch(rq, prev);
		psi_account_irqtime(rq, prev, next);
		psi_sched_switch(prev, next, block);
		psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
					     prev->se.sched_delayed);

		trace_sched_switch(preempt, prev, next, prev_state);

@@ -8180,19 +8184,14 @@ static void cpuset_cpu_active(void)
	cpuset_update_active_cpus();
}

static int cpuset_cpu_inactive(unsigned int cpu)
static void cpuset_cpu_inactive(unsigned int cpu)
{
	if (!cpuhp_tasks_frozen) {
		int ret = dl_bw_check_overflow(cpu);

		if (ret)
			return ret;
		cpuset_update_active_cpus();
	} else {
		num_cpus_frozen++;
		partition_sched_domains(1, NULL, NULL);
	}
	return 0;
}

static inline void sched_smt_present_inc(int cpu)
@@ -8254,6 +8253,11 @@ int sched_cpu_deactivate(unsigned int cpu)
	struct rq *rq = cpu_rq(cpu);
	int ret;

	ret = dl_bw_deactivate(cpu);

	if (ret)
		return ret;

	/*
	 * Remove CPU from nohz.idle_cpus_mask to prevent participating in
	 * load balancing when not active
@@ -8299,15 +8303,7 @@ int sched_cpu_deactivate(unsigned int cpu)
		return 0;

	sched_update_numa(cpu, false);
	ret = cpuset_cpu_inactive(cpu);
	if (ret) {
		sched_smt_present_inc(cpu);
		sched_set_rq_online(rq, cpu);
		balance_push_set(cpu, false);
		set_cpu_active(cpu, true);
		sched_update_numa(cpu, true);
		return ret;
	}
	cpuset_cpu_inactive(cpu);
	sched_domains_numa_masks_clear(cpu);
	return 0;
}
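A worked illustration of the steal-time change above (the numbers are made up): suppose delta is 1 ms but paravirt_steal_clock() has advanced by 5 ms since the last update. Both the old and new code clamp the steal charged to this update to 1 ms. Previously prev_steal_time_rq advanced only by the clamped 1 ms, so the remaining 4 ms of excess steal would be charged against later deltas ("caught up"); with prev_steal_time_rq now set to the raw counter reading, that excess is simply dropped and does not distort subsequent clock_task updates.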
@@ -9,6 +9,8 @@

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
@@ -22,16 +24,14 @@
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
	static_branch_enable(&sched_clock_irqtime);
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
	static_branch_disable(&sched_clock_irqtime);
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
@@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
	if (!irqtime_enabled())
		return;

	cpu = smp_processor_id();
@@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
@@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
	if (vtime_accounting_enabled_this_cpu())
		return;

	if (sched_clock_irqtime) {
	if (irqtime_enabled()) {
		irqtime_account_process_tick(p, user_tick, 1);
		return;
	}
@@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (sched_clock_irqtime) {
	if (irqtime_enabled()) {
		irqtime_account_idle_ticks(ticks);
		return;
	}
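The sched_clock_irqtime integer becomes a jump-label static key, so the common "IRQ time accounting disabled" case costs a patched-out branch rather than a memory load on every check. The irqtime_enabled() helper used in the hunks above is not shown in this diff; the sketch below gives its presumed shape (a thin wrapper over static_branch_likely(), defined in a scheduler header) purely for illustration:

	#include <linux/jump_label.h>

	DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);

	/* Presumed helper shape (assumption; lives in a header outside this hunk):
	 * compiles to a no-op branch while the key is disabled. */
	static inline bool irqtime_enabled(void)
	{
		return static_branch_likely(&sched_clock_irqtime);
	}

	void enable_sched_clock_irqtime(void)
	{
		static_branch_enable(&sched_clock_irqtime);
	}

	void disable_sched_clock_irqtime(void)
	{
		static_branch_disable(&sched_clock_irqtime);
	}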
@@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s
	__add_rq_bw(new_bw, &rq->dl);
}

static __always_inline
void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
{
	/*
	 * If the timer callback was running (hrtimer_try_to_cancel == -1),
	 * it will eventually call put_task_struct().
	 */
	if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
		put_task_struct(dl_task_of(dl_se));
}

static __always_inline
void cancel_replenish_timer(struct sched_dl_entity *dl_se)
{
	cancel_dl_timer(dl_se, &dl_se->dl_timer);
}

static __always_inline
void cancel_inactive_timer(struct sched_dl_entity *dl_se)
{
	cancel_dl_timer(dl_se, &dl_se->inactive_timer);
}

static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
	WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
@@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
		 * will not touch the rq's active utilization,
		 * so we are still safe.
		 */
		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
			if (!dl_server(dl_se))
				put_task_struct(dl_task_of(dl_se));
		}
		cancel_inactive_timer(dl_se);
	} else {
		/*
		 * Since "dl_non_contending" is not set, the
@@ -2115,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
			 * The replenish timer needs to be canceled. No
			 * problem if it fires concurrently: boosted threads
			 * are ignored in dl_task_timer().
			 *
			 * If the timer callback was running (hrtimer_try_to_cancel == -1),
			 * it will eventually call put_task_struct().
			 */
			if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
			    !dl_server(&p->dl))
				put_task_struct(p);
			cancel_replenish_timer(&p->dl);
			p->dl.dl_throttled = 0;
		}
	} else if (!dl_prio(p->normal_prio)) {
@@ -2289,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
		 * will not touch the rq's active utilization,
		 * so we are still safe.
		 */
		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
			put_task_struct(p);
		cancel_inactive_timer(&p->dl);
	}
	sub_rq_bw(&p->dl, &rq->dl);
	rq_unlock(rq, &rf);
@@ -2506,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
		return NULL;

	next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);

next_node:
	if (next_node) {
	while (next_node) {
		p = __node_2_pdl(next_node);

		if (task_is_pushable(rq, p, cpu))
			return p;

		next_node = rb_next(next_node);
		goto next_node;
	}

	return NULL;
@@ -2964,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p)

void dl_clear_root_domain(struct root_domain *rd)
{
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
	guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
	rd->dl_bw.total_bw = 0;
	raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);

	/*
	 * dl_server bandwidth is only restored when CPUs are attached to root
	 * domains (after domains are created or CPUs moved back to the
	 * default root doamin).
	 */
	for_each_cpu(i, rd->span) {
		struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;

		if (dl_server(dl_se) && cpu_active(i))
			rd->dl_bw.total_bw += dl_se->dl_bw;
	}
}

#endif /* CONFIG_SMP */
@@ -3029,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 */
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
		put_task_struct(p);
	cancel_inactive_timer(&p->dl);

	/*
	 * In case a task is setscheduled to SCHED_DEADLINE we need to keep
@@ -3453,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
}

enum dl_bw_request {
	dl_bw_req_check_overflow = 0,
	dl_bw_req_deactivate = 0,
	dl_bw_req_alloc,
	dl_bw_req_free
};

static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
	unsigned long flags;
	unsigned long flags, cap;
	struct dl_bw *dl_b;
	bool overflow = 0;
	u64 fair_server_bw = 0;

	rcu_read_lock_sched();
	dl_b = dl_bw_of(cpu);
	raw_spin_lock_irqsave(&dl_b->lock, flags);

	if (req == dl_bw_req_free) {
	cap = dl_bw_capacity(cpu);
	switch (req) {
	case dl_bw_req_free:
		__dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
	} else {
		unsigned long cap = dl_bw_capacity(cpu);

		break;
	case dl_bw_req_alloc:
		overflow = __dl_overflow(dl_b, cap, 0, dl_bw);

		if (req == dl_bw_req_alloc && !overflow) {
		if (!overflow) {
			/*
			 * We reserve space in the destination
			 * root_domain, as we can't fail after this point.
@@ -3484,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
			 */
			__dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
		}
		break;
	case dl_bw_req_deactivate:
		/*
		 * cpu is not off yet, but we need to do the math by
		 * considering it off already (i.e., what would happen if we
		 * turn cpu off?).
		 */
		cap -= arch_scale_cpu_capacity(cpu);

		/*
		 * cpu is going offline and NORMAL tasks will be moved away
		 * from it. We can thus discount dl_server bandwidth
		 * contribution as it won't need to be servicing tasks after
		 * the cpu is off.
		 */
		if (cpu_rq(cpu)->fair_server.dl_server)
			fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;

		/*
		 * Not much to check if no DEADLINE bandwidth is present.
		 * dl_servers we can discount, as tasks will be moved out the
		 * offlined CPUs anyway.
		 */
		if (dl_b->total_bw - fair_server_bw > 0) {
			/*
			 * Leaving at least one CPU for DEADLINE tasks seems a
			 * wise thing to do. As said above, cpu is not offline
			 * yet, so account for that.
			 */
			if (dl_bw_cpus(cpu) - 1)
				overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
			else
				overflow = 1;
		}

		break;
	}

	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3492,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
	return overflow ? -EBUSY : 0;
}

int dl_bw_check_overflow(int cpu)
int dl_bw_deactivate(int cpu)
{
	return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
	return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
}

int dl_bw_alloc(int cpu, u64 dl_bw)
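The consolidated cancel_dl_timer() helper above hinges on the hrtimer_try_to_cancel() return-value contract. For reference, an annotated restatement of the helper body (comments are mine, describing that contract as I understand it):

	/* hrtimer_try_to_cancel() returns:
	 *   1  the timer was queued and has been cancelled here, so the task
	 *      reference taken when arming it must be dropped by the caller;
	 *   0  the timer was not queued, nothing to release;
	 *  -1  the callback is currently running and cannot be cancelled;
	 *      the callback itself will drop the reference via put_task_struct().
	 */
	if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
		put_task_struct(dl_task_of(dl_se));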
@@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
			return  -EINVAL;
		}

		if (rq->cfs.h_nr_running) {
		if (rq->cfs.h_nr_queued) {
			update_rq_clock(rq);
			dl_server_stop(&rq->fair_server);
		}
@@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
			printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
					cpu_of(rq));

		if (rq->cfs.h_nr_running)
		if (rq->cfs.h_nr_queued)
			dl_server_start(&rq->fair_server);
	}

@@ -843,13 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
			SPLIT_NS(right_vruntime));
	spread = right_vruntime - left_vruntime;
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
			cfs_rq->idle_nr_running);
	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
			cfs_rq->idle_h_nr_running);
	SEQ_printf(m, "  .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
@@ -1295,8 +1292,10 @@ void resched_latency_warn(int cpu, u64 latency)
{
	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);

	WARN(__ratelimit(&latency_check_ratelimit),
	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
	     "without schedule\n",
	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
	if (likely(!__ratelimit(&latency_check_ratelimit)))
		return;

	pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n",
	       cpu, latency, cpu_rq(cpu)->ticks_without_resched);
	dump_stack();
}
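The rewritten warning path above checks the ratelimit first and bails out early, then reports with pr_err() plus an explicit dump_stack() instead of a WARN(). A minimal sketch of the same pattern, with hypothetical names purely for illustration:

	#include <linux/ratelimit.h>
	#include <linux/printk.h>

	/* Report at most once per interval; print at KERN_ERR and attach a
	 * backtrace explicitly rather than going through WARN(). */
	static void example_report(int cpu, u64 latency)
	{
		static DEFINE_RATELIMIT_STATE(rs, 60 * 60 * HZ, 1);

		if (!__ratelimit(&rs))
			return;

		pr_err("example: CPU %d stalled for %llu ns\n", cpu, latency);
		dump_stack();
	}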
File diff suppressed because it is too large
@@ -31,6 +31,15 @@ SCHED_FEAT(PREEMPT_SHORT, true)
 */
SCHED_FEAT(NEXT_BUDDY, false)

/*
 * Allow completely ignoring cfs_rq->next; which can be set from various
 * places:
 * - NEXT_BUDDY (wakeup preemption)
 * - yield_to_task()
 * - cgroup dequeue / pick
 */
SCHED_FEAT(PICK_BUDDY, true)

/*
 * Consider buddies to be cache hot, decreases the likeliness of a
 * cache buddy being migrated away, increases cache locality.
@@ -9,15 +9,9 @@
  */
 
 enum hk_flags {
-	HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
-	HK_FLAG_RCU = BIT(HK_TYPE_RCU),
-	HK_FLAG_MISC = BIT(HK_TYPE_MISC),
-	HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
-	HK_FLAG_TICK = BIT(HK_TYPE_TICK),
 	HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
-	HK_FLAG_WQ = BIT(HK_TYPE_WQ),
 	HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
-	HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+	HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
 };
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -97,7 +91,7 @@ void __init housekeeping_init(void)
 
 	static_branch_enable(&housekeeping_overridden);
 
-	if (housekeeping.flags & HK_FLAG_TICK)
+	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
 		sched_tick_offload_init();
 
 	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
 	unsigned int first_cpu;
 	int err = 0;
 
-	if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
 		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
 			pr_warn("Housekeeping: nohz unsupported."
 				" Build with CONFIG_NO_HZ_FULL\n");
@@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
 		housekeeping_setup_type(type, housekeeping_staging);
 	}
 
-	if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+	if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
 		tick_nohz_full_setup(non_housekeeping_mask);
 
 	housekeeping.flags |= flags;
@@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
 {
 	unsigned long flags;
 
-	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
-		HK_FLAG_MISC | HK_FLAG_KTHREAD;
+	flags = HK_FLAG_KERNEL_NOISE;
 
 	return housekeeping_setup(str, flags);
 }
@@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
 	int len;
 
 	while (isalpha(*str)) {
+		/*
+		 * isolcpus=nohz is equivalent to nohz_full.
+		 */
 		if (!strncmp(str, "nohz,", 5)) {
 			str += 5;
-			flags |= HK_FLAG_TICK;
+			flags |= HK_FLAG_KERNEL_NOISE;
 			continue;
 		}
 
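With the separate HK_TYPE_{TIMER,TICK,MISC,WQ,KTHREAD,...} variants folded into HK_TYPE_KERNEL_NOISE, code that wants to keep disturbances off isolated CPUs only needs to consult one housekeeping mask. A minimal kernel-style sketch, assuming the existing housekeeping_cpumask()/queue_work_on() interfaces and the HK_TYPE_KERNEL_NOISE type introduced here; the helper itself is hypothetical:

#include <linux/cpumask.h>
#include <linux/sched/isolation.h>
#include <linux/workqueue.h>

/* Illustrative helper: run deferred "noisy" work on a housekeeping CPU. */
static void example_queue_noise(struct work_struct *work)
{
	/* CPUs carved out via isolcpus=/nohz_full= are not in this mask. */
	const struct cpumask *hk = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
	int cpu = cpumask_any(hk);

	/* Pin the work item so the isolated CPUs stay undisturbed. */
	queue_work_on(cpu, system_wq, work);
}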
@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
 *
 *   group: [ see update_cfs_group() ]
 *   se_weight()   = tg->weight * grq->load_avg / tg->load_avg
- *   se_runnable() = grq->h_nr_running
+ *   se_runnable() = grq->h_nr_runnable
 *
 *   runnable_sum = se_runnable() * runnable = grq->runnable_sum
 *   runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 {
 	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
+				cfs_rq->h_nr_runnable,
 				cfs_rq->curr != NULL)) {
 
 		___update_load_avg(&cfs_rq->avg, 1);
@@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 	s64 delta;
 	u64 irq;
 
-	if (static_branch_likely(&psi_disabled))
+	if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
 		return;
 
 	if (!curr->pid)
@@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	if (static_branch_likely(&psi_disabled))
 		return -EOPNOTSUPP;
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	if (!irqtime_enabled() && res == PSI_IRQ)
+		return -EOPNOTSUPP;
+#endif
+
 	/* Update averages before reporting them */
 	mutex_lock(&group->avgs_lock);
 	now = sched_clock();
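After this change, reading IRQ pressure can fail with EOPNOTSUPP when CONFIG_IRQ_TIME_ACCOUNTING is built in but IRQ time accounting is not active, so consumers should treat that errno as "not available" rather than as a hard error. A hedged userspace sketch, assuming the usual /proc/pressure/irq file and that the error surfaces on read():

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[512];
	ssize_t n;
	int fd = open("/proc/pressure/irq", O_RDONLY);

	if (fd < 0) {
		perror("open");	/* file absent if PSI or IRQ accounting is compiled out */
		return 0;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n < 0 && errno == EOPNOTSUPP) {
		/* IRQ time accounting disabled: report "not available", don't fail hard. */
		puts("irq pressure not available on this system");
	} else if (n >= 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	} else {
		perror("read");
	}
	close(fd);
	return 0;
}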
@@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_bw_check_overflow(int cpu);
+extern int dl_bw_deactivate(int cpu);
 extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
 /*
  * SCHED_DEADLINE supports servers (nested scheduling) with the following
@@ -650,11 +650,10 @@ struct balance_callback {
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
-	unsigned int nr_running;
-	unsigned int h_nr_running;       /* SCHED_{NORMAL,BATCH,IDLE} */
-	unsigned int idle_nr_running;    /* SCHED_IDLE */
-	unsigned int idle_h_nr_running;  /* SCHED_IDLE */
-	unsigned int h_nr_delayed;
+	unsigned int nr_queued;
+	unsigned int h_nr_queued;        /* SCHED_{NORMAL,BATCH,IDLE} */
+	unsigned int h_nr_runnable;      /* SCHED_{NORMAL,BATCH,IDLE} */
+	unsigned int h_nr_idle;          /* SCHED_IDLE */
 
 	s64 avg_vruntime;
 	u64 avg_load;
@@ -904,11 +903,8 @@ struct dl_rq {
 
 static inline void se_update_runnable(struct sched_entity *se)
 {
-	if (!entity_is_task(se)) {
-		struct cfs_rq *cfs_rq = se->my_q;
-
-		se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
-	}
+	if (!entity_is_task(se))
+		se->runnable_weight = se->my_q->h_nr_runnable;
 }
 
 static inline long se_runnable(struct sched_entity *se)
@@ -2280,7 +2276,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
 
 static inline int task_on_rq_queued(struct task_struct *p)
 {
-	return p->on_rq == TASK_ON_RQ_QUEUED;
+	return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
 }
 
 static inline int task_on_rq_migrating(struct task_struct *p)
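The ->on_rq field is updated with WRITE_ONCE() and observed locklessly from other CPUs, so the reader now uses READ_ONCE() to rule out torn or repeated loads. A minimal sketch of the pairing with hypothetical names; READ_ONCE()/WRITE_ONCE() are the real primitives:

#include <linux/compiler.h>
#include <linux/types.h>

struct example {
	int state;	/* written by one CPU, read locklessly by others */
};

/* Writer side: publish the new value as a single, untorn store. */
static inline void example_set_state(struct example *e, int state)
{
	WRITE_ONCE(e->state, state);
}

/* Reader side: the READ_ONCE() matches the WRITE_ONCE() above, so the
 * compiler cannot tear, fuse or refetch the load across the comparison. */
static inline bool example_is_set(const struct example *e)
{
	return READ_ONCE(e->state) == 1;
}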
@@ -2574,7 +2570,7 @@ static inline bool sched_rt_runnable(struct rq *rq)
 
 static inline bool sched_fair_runnable(struct rq *rq)
 {
-	return rq->cfs.nr_running > 0;
+	return rq->cfs.nr_queued > 0;
 }
 
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -3242,6 +3238,12 @@ struct irqtime {
 };
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
+static inline int irqtime_enabled(void)
+{
+	return static_branch_likely(&sched_clock_irqtime);
+}
 
 /*
  * Returns the irqtime minus the softirq time computed by ksoftirqd.
@@ -3262,6 +3264,13 @@ static inline u64 irq_time_read(int cpu)
 	return total;
 }
 
+#else
+
+static inline int irqtime_enabled(void)
+{
+	return 0;
+}
+
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #ifdef CONFIG_CPU_FREQ
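irqtime_enabled() is built on the jump-label API: while the key is disabled the guarded branch is patched to a no-op, so the hot path pays nothing for the check. A minimal sketch of the same pattern with hypothetical names; DEFINE_STATIC_KEY_FALSE(), static_branch_likely() and static_branch_enable() are the real primitives:

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>

/* The key starts out false; the guarded path stays out of the hot path
 * until somebody flips it. */
DEFINE_STATIC_KEY_FALSE(example_accounting_key);

static inline bool example_accounting_enabled(void)
{
	/* Compiles to a patched jump instead of a load-and-test. */
	return static_branch_likely(&example_accounting_key);
}

/* Called once, e.g. at boot, when the feature turns out to be usable. */
static void __init example_accounting_enable(void)
{
	static_branch_enable(&example_accounting_key);
}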
@@ -3509,6 +3518,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 
 #endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
 
+extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr);
+
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 
 #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
@@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
 * Bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
-#define SCHEDSTAT_VERSION 16
+#define SCHEDSTAT_VERSION 17
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
-			seq_printf(seq, "domain%d %*pb", dcount++,
+			seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
 				   cpumask_pr_args(sched_domain_span(sd)));
 			for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
-				seq_printf(seq, " %u %u %u %u %u %u %u %u",
+				seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
 					   sd->lb_count[itype],
 					   sd->lb_balanced[itype],
 					   sd->lb_failed[itype],
-					   sd->lb_imbalance[itype],
+					   sd->lb_imbalance_load[itype],
+					   sd->lb_imbalance_util[itype],
+					   sd->lb_imbalance_task[itype],
+					   sd->lb_imbalance_misfit[itype],
 					   sd->lb_gained[itype],
 					   sd->lb_hot_gained[itype],
 					   sd->lb_nobusyq[itype],
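Version 17 therefore changes every domain line of /proc/schedstat: the domain name follows "domain<N>", and the single lb_imbalance counter per idle type is split into load, util, task and misfit imbalance counters. A hedged sketch of how a consumer could read the leading counters of one idle-type block; the struct and names are illustrative, and only the field order visible in the hunk above is assumed:

#include <stdio.h>

/* Illustrative view of the leading per-idle-type counters in version 17. */
struct lb_block {
	unsigned int count, balanced, failed;
	unsigned int imb_load, imb_util, imb_task, imb_misfit;
	unsigned int gained, hot_gained, nobusyq;
};

/* Parse ten counters out of one whitespace-separated block of a domain line. */
static int parse_lb_block(const char *s, struct lb_block *b)
{
	return sscanf(s, "%u %u %u %u %u %u %u %u %u %u",
		      &b->count, &b->balanced, &b->failed,
		      &b->imb_load, &b->imb_util, &b->imb_task, &b->imb_misfit,
		      &b->gained, &b->hot_gained, &b->nobusyq) == 10;
}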
@@ -138,6 +138,10 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
 	if (flags & ENQUEUE_RESTORE)
 		return;
 
+	/* psi_sched_switch() will handle the flags */
+	if (task_on_cpu(task_rq(p), p))
+		return;
+
 	if (p->se.sched_delayed) {
 		/* CPU migration of "sleeping" task */
 		SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
@@ -300,20 +300,10 @@ static void __setscheduler_params(struct task_struct *p,
 
 	p->policy = policy;
 
-	if (dl_policy(policy)) {
+	if (dl_policy(policy))
 		__setparam_dl(p, attr);
-	} else if (fair_policy(policy)) {
-		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
-		if (attr->sched_runtime) {
-			p->se.custom_slice = 1;
-			p->se.slice = clamp_t(u64, attr->sched_runtime,
-					      NSEC_PER_MSEC/10,   /* HZ=1000 * 10 */
-					      NSEC_PER_MSEC*100); /* HZ=100  / 10 */
-		} else {
-			p->se.custom_slice = 0;
-			p->se.slice = sysctl_sched_base_slice;
-		}
-	}
+	else if (fair_policy(policy))
+		__setparam_fair(p, attr);
 
 	/* rt-policy tasks do not have a timerslack */
 	if (rt_or_dl_task_policy(p)) {
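The slice handling that moved into __setparam_fair() is still driven by sched_setattr(): for a fair policy, a non-zero sched_runtime requests a custom slice, clamped to the 0.1 ms to 100 ms range shown in the removed lines. A hedged, self-contained userspace sketch; the struct mirrors the uapi sched_attr layout and the 3 ms value is arbitrary:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copy of the uapi layout, for libcs without a sched_setattr wrapper. */
struct example_sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

int main(void)
{
	struct example_sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;		/* SCHED_NORMAL */
	attr.sched_runtime = 3000000;	/* request a 3 ms slice, in nanoseconds */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}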
@@ -1433,7 +1423,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
 	struct rq *rq, *p_rq;
 	int yielded = 0;
 
-	scoped_guard (irqsave) {
+	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
 		rq = this_rq();
 
 again:
@@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
-#ifdef CONFIG_SCHED_DEBUG
 		.name			= tl->name,
-#endif
 	};
 
 	sd_span = sched_domain_span(sd);
@@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
 		if (!cpumask_subset(sched_domain_span(child),
 				    sched_domain_span(sd))) {
 			pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
 			pr_err(" the %s domain not a subset of the %s domain\n",
 			       child->name, sd->name);
-#endif
 			/* Fixup, ensure @sd has at least @child CPUs. */
 			cpumask_or(sched_domain_span(sd),
 				   sched_domain_span(sd),
@@ -2721,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
 
 			/*
 			 * This domain won't be destroyed and as such
-			 * its dl_bw->total_bw needs to be cleared. It
-			 * will be recomputed in function
-			 * update_tasks_root_domain().
+			 * its dl_bw->total_bw needs to be cleared.
+			 * Tasks contribution will be then recomputed
+			 * in function dl_update_tasks_root_domain(),
+			 * dl_servers contribution in function
+			 * dl_restore_server_root_domain().
 			 */
 			rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
 			dl_clear_root_domain(rd);