In order of deadline scheduling to be effective and useful, it is important that some method of having the allocation of the available CPU bandwidth to tasks and task groups under control. This is usually called "admission control" and if it is not performed at all, no guarantee can be given on the actual scheduling of the -deadline tasks. Since when RT-throttling has been introduced each task group have a bandwidth associated to itself, calculated as a certain amount of runtime over a period. Moreover, to make it possible to manipulate such bandwidth, readable/writable controls have been added to both procfs (for system wide settings) and cgroupfs (for per-group settings). Therefore, the same interface is being used for controlling the bandwidth distrubution to -deadline tasks and task groups, i.e., new controls but with similar names, equivalent meaning and with the same usage paradigm are added. However, more discussion is needed in order to figure out how we want to manage SCHED_DEADLINE bandwidth at the task group level. Therefore, this patch adds a less sophisticated, but actually very sensible, mechanism to ensure that a certain utilization cap is not overcome per each root_domain (the single rq for !SMP configurations). Another main difference between deadline bandwidth management and RT-throttling is that -deadline tasks have bandwidth on their own (while -rt ones doesn't!), and thus we don't need an higher level throttling mechanism to enforce the desired bandwidth. This patch, therefore: - adds system wide deadline bandwidth management by means of: * /proc/sys/kernel/sched_dl_runtime_us, * /proc/sys/kernel/sched_dl_period_us, that determine (i.e., runtime / period) the total bandwidth available on each CPU of each root_domain for -deadline tasks; - couples the RT and deadline bandwidth management, i.e., enforces that the sum of how much bandwidth is being devoted to -rt -deadline tasks to stay below 100%. This means that, for a root_domain comprising M CPUs, -deadline tasks can be created until the sum of their bandwidths stay below: M * (sched_dl_runtime_us / sched_dl_period_us) It is also possible to disable this bandwidth management logic, and be thus free of oversubscribing the system up to any arbitrary level. Signed-off-by: Dario Faggioli <raistlin@linux.it> Signed-off-by: Juri Lelli <juri.lelli@gmail.com> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1383831828-15501-12-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
116 lines
3.4 KiB
C
116 lines
3.4 KiB
C
#ifndef _SCHED_SYSCTL_H
|
|
#define _SCHED_SYSCTL_H
|
|
|
|
#ifdef CONFIG_DETECT_HUNG_TASK
|
|
extern int sysctl_hung_task_check_count;
|
|
extern unsigned int sysctl_hung_task_panic;
|
|
extern unsigned long sysctl_hung_task_timeout_secs;
|
|
extern unsigned long sysctl_hung_task_warnings;
|
|
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
|
|
void __user *buffer,
|
|
size_t *lenp, loff_t *ppos);
|
|
#else
|
|
/* Avoid need for ifdefs elsewhere in the code */
|
|
enum { sysctl_hung_task_timeout_secs = 0 };
|
|
#endif
|
|
|
|
/*
|
|
* Default maximum number of active map areas, this limits the number of vmas
|
|
* per mm struct. Users can overwrite this number by sysctl but there is a
|
|
* problem.
|
|
*
|
|
* When a program's coredump is generated as ELF format, a section is created
|
|
* per a vma. In ELF, the number of sections is represented in unsigned short.
|
|
* This means the number of sections should be smaller than 65535 at coredump.
|
|
* Because the kernel adds some informative sections to a image of program at
|
|
* generating coredump, we need some margin. The number of extra sections is
|
|
* 1-3 now and depends on arch. We use "5" as safe margin, here.
|
|
*/
|
|
#define MAPCOUNT_ELF_CORE_MARGIN (5)
|
|
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
|
|
|
|
extern int sysctl_max_map_count;
|
|
|
|
extern unsigned int sysctl_sched_latency;
|
|
extern unsigned int sysctl_sched_min_granularity;
|
|
extern unsigned int sysctl_sched_wakeup_granularity;
|
|
extern unsigned int sysctl_sched_child_runs_first;
|
|
|
|
enum sched_tunable_scaling {
|
|
SCHED_TUNABLESCALING_NONE,
|
|
SCHED_TUNABLESCALING_LOG,
|
|
SCHED_TUNABLESCALING_LINEAR,
|
|
SCHED_TUNABLESCALING_END,
|
|
};
|
|
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
|
|
|
|
extern unsigned int sysctl_numa_balancing_scan_delay;
|
|
extern unsigned int sysctl_numa_balancing_scan_period_min;
|
|
extern unsigned int sysctl_numa_balancing_scan_period_max;
|
|
extern unsigned int sysctl_numa_balancing_scan_size;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
extern unsigned int sysctl_sched_migration_cost;
|
|
extern unsigned int sysctl_sched_nr_migrate;
|
|
extern unsigned int sysctl_sched_time_avg;
|
|
extern unsigned int sysctl_timer_migration;
|
|
extern unsigned int sysctl_sched_shares_window;
|
|
|
|
int sched_proc_update_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *length,
|
|
loff_t *ppos);
|
|
#endif
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
static inline unsigned int get_sysctl_timer_migration(void)
|
|
{
|
|
return sysctl_timer_migration;
|
|
}
|
|
#else
|
|
static inline unsigned int get_sysctl_timer_migration(void)
|
|
{
|
|
return 1;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* control realtime throttling:
|
|
*
|
|
* /proc/sys/kernel/sched_rt_period_us
|
|
* /proc/sys/kernel/sched_rt_runtime_us
|
|
*/
|
|
extern unsigned int sysctl_sched_rt_period;
|
|
extern int sysctl_sched_rt_runtime;
|
|
|
|
/*
|
|
* control SCHED_DEADLINE reservations:
|
|
*
|
|
* /proc/sys/kernel/sched_dl_period_us
|
|
* /proc/sys/kernel/sched_dl_runtime_us
|
|
*/
|
|
extern unsigned int sysctl_sched_dl_period;
|
|
extern int sysctl_sched_dl_runtime;
|
|
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
|
|
#endif
|
|
|
|
#ifdef CONFIG_SCHED_AUTOGROUP
|
|
extern unsigned int sysctl_sched_autogroup_enabled;
|
|
#endif
|
|
|
|
extern int sched_rr_timeslice;
|
|
|
|
extern int sched_rr_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos);
|
|
|
|
extern int sched_rt_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos);
|
|
|
|
int sched_dl_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos);
|
|
|
|
#endif /* _SCHED_SYSCTL_H */
|