From ff49781ee15dbb7a5d422ea591890eea12f8e7d6 Mon Sep 17 00:00:00 2001 From: FAROVITUS Date: Fri, 17 Aug 2018 17:12:53 +0200 Subject: [PATCH] sched: fix upstream conflicts/issues --- kernel/sched/core.c | 63 +++------- kernel/sched/deadline.c | 98 ++------------- kernel/sched/fair.c | 95 ++++----------- kernel/sched/rt.c | 258 ++++++++++++++++++++-------------------- kernel/sched/sched.h | 37 +++--- 5 files changed, 192 insertions(+), 359 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d6ba88186e76..7b563e77c1e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -649,10 +649,7 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (cpu == i) - continue; - - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } @@ -2767,7 +2764,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm_irqs_off(oldmm, mm, next); + switch_mm(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -5131,16 +5128,14 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: - * Also, reset softlockup watchdogs on all CPUs, because - * another CPU might be blocked waiting for us to process - * an IPI. */ touch_nmi_watchdog(); - touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } + touch_all_softlockup_watchdogs(); + #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif @@ -5708,6 +5703,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: @@ -6076,12 +6072,6 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; -#ifdef HAVE_RT_PUSH_IPI - rd->rto_cpu = -1; - raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); -#endif - init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_dlo_mask; @@ -6296,9 +6286,6 @@ enum s_alloc { * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. * - * Only CPUs that can arrive at this group should be considered to continue - * balancing. - * * Asymmetric node setups can result in situations where the domain tree is of * unequal depth, make sure to skip domains that already cover the entire * range. @@ -6310,31 +6297,18 @@ enum s_alloc { */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { - const struct cpumask *sg_span = sched_group_cpus(sg); + const struct cpumask *span = sched_domain_span(sd); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, sg_span) { + for_each_cpu(i, span) { sibling = *per_cpu_ptr(sdd->sd, i); - - /* - * Can happen in the asymmetric case, where these siblings are - * unused. The mask will not be empty because those CPUs that - * do have the top domain _should_ span the domain. 
- */ - if (!sibling->child) - continue; - - /* If we would not end up here, we can't continue from here */ - if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; cpumask_set_cpu(i, sched_group_mask(sg)); } - - /* We must not have empty masks here */ - WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); } /* @@ -7471,16 +7445,17 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - partition_sched_domains(1, NULL, NULL); - if (--num_cpus_frozen) + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); break; + } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); case CPU_ONLINE: cpuset_update_active_cpus(true); @@ -8448,20 +8423,11 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -/* Expose task group only after completing cgroup initialization */ -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - - if (parent) - sched_online_group(tg, parent); - return 0; -} - static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -8836,7 +8802,6 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, - .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e12b0a4df891..6be2afd9bfd6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -480,84 +480,13 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * Revised wakeup rule [1]: For self-suspending tasks, rather then - * re-initializing task's runtime and deadline, the revised wakeup - * rule adjusts the task's runtime to avoid the task to overrun its - * density. + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. * - * Reasoning: a task may overrun the density if: - * runtime / (deadline - t) > dl_runtime / dl_deadline - * - * Therefore, runtime can be adjusted to: - * runtime = (dl_runtime / dl_deadline) * (deadline - t) - * - * In such way that runtime will be equal to the maximum density - * the task can use without breaking any rule. - * - * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant - * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. - */ -static void -update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) -{ - u64 laxity = dl_se->deadline - rq_clock(rq); - - /* - * If the task has deadline < period, and the deadline is in the past, - * it should already be throttled before this check. - * - * See update_dl_entity() comments for further details. - */ - WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); - - dl_se->runtime = (dl_se->dl_density * laxity) >> 20; -} - -/* - * Regarding the deadline, a task with implicit deadline has a relative - * deadline == relative period. A task with constrained deadline has a - * relative deadline <= relative period. 
- * - * We support constrained deadline tasks. However, there are some restrictions - * applied only for tasks which do not have an implicit deadline. See - * update_dl_entity() to know more about such restrictions. - * - * The dl_is_implicit() returns true if the task has an implicit deadline. - */ -static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline == dl_se->dl_period; -} - -/* - * When a deadline entity is placed in the runqueue, its runtime and deadline - * might need to be updated. This is done by a CBS wake up rule. There are two - * different rules: 1) the original CBS; and 2) the Revisited CBS. - * - * When the task is starting a new period, the Original CBS is used. In this - * case, the runtime is replenished and a new absolute deadline is set. - * - * When a task is queued before the begin of the next period, using the - * remaining runtime and deadline could make the entity to overflow, see - * dl_entity_overflow() to find more about runtime overflow. When such case - * is detected, the runtime and deadline need to be updated. - * - * If the task has an implicit deadline, i.e., deadline == period, the Original - * CBS is applied. the runtime is replenished and a new absolute deadline is - * set, as in the previous cases. - * - * However, the Original CBS does not work properly for tasks with - * deadline < period, which are said to have a constrained deadline. By - * applying the Original CBS, a constrained deadline task would be able to run - * runtime/deadline in a period. With deadline < period, the task would - * overrun the runtime/period allowed bandwidth, breaking the admission test. - * - * In order to prevent this misbehave, the Revisited CBS is used for - * constrained deadline tasks when a runtime overflow is detected. In the - * Revisited CBS, rather than replenishing & setting a new absolute deadline, - * the remaining runtime of the task is reduced to avoid runtime overflow. - * Please refer to the comments update_dl_revised_wakeup() function to find - * more about the Revised CBS rule. + * The policy here is that we update the deadline of the entity only if: + * - the current deadline is in the past, + * - using the remaining runtime with the current deadline would make + * the entity exceed its bandwidth. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -576,14 +505,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { - - if (unlikely(!dl_is_implicit(dl_se) && - !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ - update_dl_revised_wakeup(dl_se, rq); - return; - } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -1070,6 +991,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } +static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline < dl_se->dl_period; +} + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -1101,7 +1027,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. 
*/ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) + if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) dl_check_constrained_dl(&p->dl); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad7f6e8ddf00..ae9bde73421a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -754,6 +754,8 @@ void post_init_entity_util_avg(struct sched_entity *se) } } +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { @@ -1261,6 +1263,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1328,30 +1332,20 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task or idle task. + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1435,7 +1429,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1461,16 +1454,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -2997,23 +2983,6 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. 
- */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { @@ -3033,8 +3002,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -4283,26 +4252,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4618,14 +4567,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. 
*/ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) - set_next_buddy(se); + if (task_sleep && parent_entity(se)) + set_next_buddy(parent_entity(se)); + + /* avoid re-evaluating load for this entity */ + se = parent_entity(se); break; } flags |= DEQUEUE_SLEEP; @@ -4992,24 +4942,19 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = se->my_q; - long W, w = cfs_rq_load_avg(cfs_rq); + long w, W; - tg = cfs_rq->tg; + tg = se->my_q->tg; /* * W = @wg + \Sum rw_j */ - W = wg + atomic_long_read(&tg->load_avg); - - /* Ensure \Sum rw_j >= rw_i */ - W -= cfs_rq->tg_load_avg_contrib; - W += w; + W = wg + calc_tg_weight(tg, se->my_q); /* * w = rw_i + @wl */ - w += wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 801b4ec40702..d7aa990ae6a1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -64,6 +64,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) +static void push_irq_work_func(struct irq_work *work); +#endif + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -83,6 +87,13 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); +#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1793,172 +1804,160 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI - /* - * When a high priority task schedules out from a CPU and a lower priority - * task is scheduled in, a check is made to see if there's any RT tasks - * on other CPUs that are waiting to run because a higher priority RT task - * is currently running on its CPU. In this case, the CPU with multiple RT - * tasks queued on it (overloaded) needs to be notified that a CPU has opened - * up that may be able to run one of its non-running queued RT tasks. - * - * All CPUs with overloaded RT tasks need to be notified as there is currently - * no way to know which of these CPUs have the highest priority task waiting - * to run. Instead of trying to take a spinlock on each of these CPUs, - * which has shown to cause large latency when done on machines with many - * CPUs, sending an IPI to the CPUs to have them push off the overloaded - * RT tasks waiting to run. - * - * Just sending an IPI to each of the CPUs is also an issue, as on large - * count CPU machines, this can cause an IPI storm on a CPU, especially - * if its the only CPU with multiple RT tasks queued, and a large number - * of CPUs scheduling a lower priority task at the same time. - * - * Each root domain has its own irq work function that can iterate over - * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT - * tassk must be checked if there's one or many CPUs that are lowering - * their priority, there's a single irq work iterator that will try to - * push off RT tasks that are waiting to run. - * - * When a CPU schedules a lower priority task, it will kick off the - * irq work iterator that will jump to each CPU with overloaded RT tasks. 
- * As it only takes the first CPU that schedules a lower priority task - * to start the process, the rto_start variable is incremented and if - * the atomic result is one, then that CPU will try to take the rto_lock. - * This prevents high contention on the lock as the process handles all - * CPUs scheduling lower priority tasks. - * - * All CPUs that are scheduling a lower priority task will increment the - * rt_loop_next variable. This will make sure that the irq work iterator - * checks all RT overloaded CPUs whenever a CPU schedules a new lower - * priority task, even if the iterator is in the middle of a scan. Incrementing - * the rt_loop_next will cause the iterator to perform another scan. + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. */ -static int rto_next_cpu(struct root_domain *rd) +static int rto_next_cpu(struct rq *rq) { - int next; + int prev_cpu = rq->rt.push_cpu; int cpu; + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + /* - * When starting the IPI RT pushing, the rto_cpu is set to -1, - * rt_next_cpu() will simply return the first CPU found in - * the rto_mask. - * - * If rto_next_cpu() is called with rto_cpu is a valid cpu, it - * will return the next CPU found in the rto_mask. - * - * If there are no more CPUs left in the rto_mask, then a check is made - * against rto_loop and rto_loop_next. rto_loop is only updated with - * the rto_lock held, but any CPU may increment the rto_loop_next - * without any locking. + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. */ - for (;;) { - - /* When rto_cpu is -1 this acts like cpumask_first() */ - cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - - rd->rto_cpu = cpu; - - if (cpu < nr_cpu_ids) - return cpu; - - rd->rto_cpu = -1; + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + } else if (cpu >= nr_cpu_ids) { /* - * ACQUIRE ensures we see the @rto_mask changes - * made prior to the @next value observed. - * - * Matches WMB in rt_set_overload(). + * We passed the end of the mask, start at the beginning. + * If the result is greater or equal to the rq's CPU, then + * the loop is finished. 
*/ - next = atomic_read_acquire(&rd->rto_loop_next); + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; + } + rq->rt.push_cpu = cpu; - if (rd->rto_loop == next) + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; +} + +static int find_next_push_cpu(struct rq *rq) +{ + struct rq *next_rq; + int cpu; + + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) break; + next_rq = cpu_rq(cpu); - rd->rto_loop = next; + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; } - return -1; + return cpu; } -static inline bool rto_start_trylock(atomic_t *v) -{ - return !atomic_cmpxchg_acquire(v, 0, 1); -} - -static inline void rto_start_unlock(atomic_t *v) -{ - atomic_set_release(v, 0); -} +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 static void tell_cpu_to_push(struct rq *rq) { - int cpu = -1; + int cpu; - /* Keep the loop going if the IPI is currently active */ - atomic_inc(&rq->rd->rto_loop_next); + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. + */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } - /* Only one CPU can initiate a loop at a time */ - if (!rto_start_trylock(&rq->rd->rto_loop_start)) + /* When here, there's no IPI going around */ + + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) return; - raw_spin_lock(&rq->rd->rto_lock); + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; - /* - * The rto_cpu is updated under the lock, if it has a valid cpu - * then the IPI is still running and will continue due to the - * update to loop_next, and nothing needs to be done here. - * Otherwise it is finishing up and an ipi needs to be sent. - */ - if (rq->rd->rto_cpu < 0) - cpu = rto_next_cpu(rq->rd); - - raw_spin_unlock(&rq->rd->rto_lock); - - rto_start_unlock(&rq->rd->rto_loop_start); - - if (cpu >= 0) { - /* Make sure the rd does not get freed while pushing */ - sched_get_rd(rq->rd); - irq_work_queue_on(&rq->rd->rto_push_work, cpu); - } + irq_work_queue_on(&rq->rt.push_work, cpu); } /* Called from hardirq context */ -void rto_push_irq_work_func(struct irq_work *work) +static void try_to_push_tasks(void *arg) { - struct root_domain *rd = - container_of(work, struct root_domain, rto_push_work); - struct rq *rq; + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; int cpu; - rq = this_rq(); + this_cpu = rt_rq->push_cpu; - /* - * We do not need to grab the lock to check for has_pushable_tasks. - * When it gets updated, a check is made if a push is possible. - */ + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_tasks(rq); + push_rt_task(rq); raw_spin_unlock(&rq->lock); } - raw_spin_lock(&rd->rto_lock); - /* Pass the IPI to the next rt overloaded queue */ - cpu = rto_next_cpu(rd); - - raw_spin_unlock(&rd->rto_lock); - - if (cpu < 0) { - sched_put_rd(rd); - return; + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. 
+ */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; } + cpu = find_next_push_cpu(src_rq); + + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) + return; + + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. + */ + if (unlikely(cpu == rq->cpu)) + goto again; + /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rd->rto_push_work, cpu); + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); } #endif /* HAVE_RT_PUSH_IPI */ @@ -1968,9 +1967,8 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; - int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overload_count)) + if (likely(!rt_overloaded(this_rq))) return; /* @@ -1979,11 +1977,6 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); - /* If we are the only overloaded CPU do nothing */ - if (rt_overload_count == 1 && - cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) - return; - #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); @@ -2145,9 +2138,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#endif /* CONFIG_SMP */ +#else if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); +#endif /* CONFIG_SMP */ } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 609a25779ba1..a46e0ed39adb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -424,7 +424,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -457,6 +457,12 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; +#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -538,19 +544,6 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; -#ifdef HAVE_RT_PUSH_IPI - /* - * For IPI pull requests, loop across the rto_mask. - */ - struct irq_work rto_push_work; - raw_spinlock_t rto_lock; - /* These are only updated and read within rto_lock */ - int rto_loop; - int rto_cpu; - /* These atomics are updated outside of a lock */ - atomic_t rto_loop_next; - atomic_t rto_loop_start; -#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. 
@@ -563,9 +556,6 @@ extern struct root_domain def_root_domain; extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); -#ifdef HAVE_RT_PUSH_IPI -extern void rto_push_irq_work_func(struct irq_work *work); -#endif #endif /* CONFIG_SMP */ /* @@ -1808,3 +1798,16 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +}