From 5eea299803384dbd2bcdf5c0b6a5f261ad24eecc Mon Sep 17 00:00:00 2001
From: Mekala Natarajan
Date: Fri, 17 May 2013 00:55:23 -0700
Subject: [PATCH] sched: provide per cpu-cgroup option to notify on migrations

On systems where CPUs may run asynchronously, task migrations between
CPUs running at grossly different speeds can cause problems (for
example, a busy task may land on a CPU that is still running at a low
frequency).

This change provides a mechanism to notify a subsystem in the kernel
when a task in a particular cgroup migrates to a different CPU. Other
subsystems (such as cpufreq) may then register for this notifier and
take appropriate action when such a task is migrated.

The cgroup attribute that enables this behavior is "notify_on_migrate".

Change-Id: Ie1868249e53ef901b89c837fdc33b0ad0c0a4590
Signed-off-by: Steve Muckle
Signed-off-by: Mekala Natarajan
---
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   | 41 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/fair.c   | 20 ++++++++++++++++++--
 kernel/sched/rt.c     | 20 +++++++++++++++++++-
 kernel/sched/sched.h  | 12 +++++++++++-
 5 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68d71c6d7f3..ff6bb0ff8c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2725,6 +2725,8 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+extern struct atomic_notifier_head migration_notifier_head;
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 535cc77210e..1cee48f07d9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -87,6 +87,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -1589,15 +1591,17 @@ static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
 	unsigned long flags;
-	int cpu, success = 0;
+	int cpu, src_cpu, success = 0;
 
 	smp_wmb();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	src_cpu = task_cpu(p);
+	cpu = src_cpu;
+
 	if (!(p->state & state))
 		goto out;
 
 	success = 1; /* we're going to change ->state */
-	cpu = task_cpu(p);
 
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
@@ -1634,7 +1638,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		p->sched_class->task_waking(p);
 
 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (task_cpu(p) != cpu) {
+	if (src_cpu != cpu) {
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
 	}
@@ -1646,6 +1650,9 @@ stat:
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
+	if (src_cpu != cpu && task_notify_on_migrate(p))
+		atomic_notifier_call_chain(&migration_notifier_head,
+					   cpu, (void *)src_cpu);
 	return success;
 }
 
@@ -5068,6 +5075,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	struct rq *rq_dest, *rq_src;
+	bool moved = false;
 	int ret = 0;
 
 	if (unlikely(!cpu_active(dest_cpu)))
@@ -5094,12 +5102,16 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		set_task_cpu(p, dest_cpu);
 		enqueue_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p, 0);
+		moved = true;
 	}
 done:
 	ret = 1;
 fail:
 	double_rq_unlock(rq_src, rq_dest);
 	raw_spin_unlock(&p->pi_lock);
+	if (moved && task_notify_on_migrate(p))
+		atomic_notifier_call_chain(&migration_notifier_head,
+					   dest_cpu, (void *)src_cpu);
 	return ret;
 }
 
@@ -7731,6 +7743,24 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
 	sched_move_task(task);
 }
 
+static u64 cpu_notify_on_migrate_read_u64(struct cgroup *cgrp,
+					  struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return tg->notify_on_migrate;
+}
+
+static int cpu_notify_on_migrate_write_u64(struct cgroup *cgrp,
+					   struct cftype *cft, u64 notify)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	tg->notify_on_migrate = (notify > 0);
+
+	return 0;
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
@@ -8002,6 +8032,11 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 static struct cftype cpu_files[] = {
+	{
+		.name = "notify_on_migrate",
+		.read_u64 = cpu_notify_on_migrate_read_u64,
+		.write_u64 = cpu_notify_on_migrate_write_u64,
+	},
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c..fc60d5b3a25 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3090,6 +3090,8 @@ struct lb_env {
 	unsigned int		loop_max;
 };
 
+static DEFINE_PER_CPU(bool, dbs_boost_needed);
+
 /*
  * move_task - move a task from one runqueue to another runqueue.
  * Both runqueues must be locked.
@@ -3100,6 +3102,8 @@ static void move_task(struct task_struct *p, struct lb_env *env)
 	set_task_cpu(p, env->dst_cpu);
 	activate_task(env->dst_rq, p, 0);
 	check_preempt_curr(env->dst_rq, p, 0);
+	if (task_notify_on_migrate(p))
+		per_cpu(dbs_boost_needed, env->dst_cpu) = true;
 }
 
 /*
@@ -4530,9 +4534,15 @@ more_balance:
 			 */
 			sd->nr_balance_failed = sd->cache_nice_tries+1;
 		}
-	} else
+	} else {
 		sd->nr_balance_failed = 0;
-
+		if (per_cpu(dbs_boost_needed, this_cpu)) {
+			per_cpu(dbs_boost_needed, this_cpu) = false;
+			atomic_notifier_call_chain(&migration_notifier_head,
+						   this_cpu,
+						   (void *)cpu_of(busiest));
+		}
+	}
 	if (likely(!active_balance)) {
 		/* We were unbalanced, so reset the balancing interval */
 		sd->balance_interval = sd->min_interval;
@@ -4687,6 +4697,12 @@ static int active_load_balance_cpu_stop(void *data)
 out_unlock:
 	busiest_rq->active_balance = 0;
 	raw_spin_unlock_irq(&busiest_rq->lock);
+	if (per_cpu(dbs_boost_needed, target_cpu)) {
+		per_cpu(dbs_boost_needed, target_cpu) = false;
+		atomic_notifier_call_chain(&migration_notifier_head,
+					   target_cpu,
+					   (void *)cpu_of(busiest_rq));
+	}
 	return 0;
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8f32475d0ee..f8317dfed60 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1604,6 +1604,7 @@ static int push_rt_task(struct rq *rq)
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
 	int ret = 0;
+	bool moved = false;
 
 	if (!rq->rt.overloaded)
 		return 0;
@@ -1673,6 +1674,7 @@ retry:
 
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
+	moved = true;
 	activate_task(lowest_rq, next_task, 0);
 	ret = 1;
 
@@ -1683,6 +1685,11 @@ retry:
 out:
 	put_task_struct(next_task);
 
+	if (moved && task_notify_on_migrate(next_task))
+		atomic_notifier_call_chain(&migration_notifier_head,
+					   cpu_of(lowest_rq),
+					   (void *)cpu_of(rq));
+
 	return ret;
 }
 
@@ -1696,8 +1703,10 @@ static void push_rt_tasks(struct rq *rq)
 static int pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
-	struct task_struct *p;
+	struct task_struct *p = NULL;
 	struct rq *src_rq;
+	bool moved = false;
+	int src_cpu = 0;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
@@ -1758,6 +1767,10 @@ static int pull_rt_task(struct rq *this_rq)
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
 			activate_task(this_rq, p, 0);
+
+			moved = true;
+			src_cpu = cpu_of(src_rq);
+
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -1769,6 +1782,11 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
+	if (moved && task_notify_on_migrate(p))
+		atomic_notifier_call_chain(&migration_notifier_head,
+					   this_cpu,
+					   (void *)src_cpu);
+
 	return ret;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 451bd4f5166..5370bcb8e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -104,6 +104,8 @@ struct cfs_bandwidth {
 struct task_group {
 	struct cgroup_subsys_state css;
 
+	bool notify_on_migrate;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -554,6 +556,11 @@ static inline struct task_group *task_group(struct task_struct *p)
 	return autogroup_task_group(p, tg);
 }
 
+static inline bool task_notify_on_migrate(struct task_struct *p)
+{
+	return task_group(p)->notify_on_migrate;
+}
+
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
@@ -579,7 +586,10 @@ static inline struct task_group *task_group(struct task_struct *p)
 {
 	return NULL;
 }
-
+static inline bool task_notify_on_migrate(struct task_struct *p)
+{
+	return false;
+}
 #endif /* CONFIG_CGROUP_SCHED */
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
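
A note for reviewers (not part of the patch): a consumer of this notifier might look roughly like the sketch below. It is illustrative only; the my_boost_* names are made up, while migration_notifier_head, the (dest_cpu, src_cpu) argument convention, and the atomic_notifier_chain_register()/NOTIFY_OK API come from the patch and from <linux/notifier.h>. The chain is invoked from scheduler context, so handlers must be atomic; a real user such as a cpufreq governor would only set a flag or queue deferred work here. Since the patch does not EXPORT_SYMBOL the notifier head, the sketch assumes built-in code rather than a loadable module.

/*
 * Hypothetical built-in consumer of migration_notifier_head.
 * dest_cpu arrives as the notifier "action"; src_cpu is packed
 * into the data pointer by the callers added in this patch.
 */
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/sched.h>

static int my_boost_migration_notify(struct notifier_block *nb,
				     unsigned long dest_cpu, void *data)
{
	int src_cpu = (int)(long)data;

	/* Atomic context: keep this cheap, defer any heavy work. */
	pr_debug("notify_on_migrate task moved: cpu%d -> cpu%lu\n",
		 src_cpu, dest_cpu);

	return NOTIFY_OK;
}

static struct notifier_block my_boost_migration_nb = {
	.notifier_call = my_boost_migration_notify,
};

static int __init my_boost_init(void)
{
	return atomic_notifier_chain_register(&migration_notifier_head,
					      &my_boost_migration_nb);
}
late_initcall(my_boost_init);

With the cpu cgroup controller mounted, the per-group flag is enabled by writing 1 to the group's cpu.notify_on_migrate file (just notify_on_migrate on hierarchies mounted with the noprefix option); only tasks in groups with the flag set generate notifications.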