From a953e4597abd51b74c99e0e3b7074532a60fd031 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: sched: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c, where appropriate. This saves some allocated space as well as many wasted cycles going through node entries that are non-existent. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a..1ed8011 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6879,9 +6879,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; @@ -7075,7 +7075,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; *nodemask = node_to_cpumask(i); @@ -7263,7 +7263,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -7407,7 +7407,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { SCHED_CPUMASK_VAR(nodemask, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7431,7 +7431,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, send_covered, tmpmask); } - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7470,9 +7470,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { + for (j = 0; j < nr_node_ids; j++) { SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % MAX_NUMNODES; + int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); cpus_complement(*notcovered, *covered); @@ -7525,7 +7525,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { -- cgit v1.1 From 363ab6f1424cdea63e5d182312d60e19077b892a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: core: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 +- kernel/rcuclassic.c | 2 +- kernel/rcupreempt.c | 10 +++++----- kernel/sched.c | 36 ++++++++++++++++++------------------ kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 6 +++--- kernel/taskstats.c | 4 ++-- kernel/workqueue.c | 6 +++--- 8 files changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a..50ae922 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -390,7 +390,7 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask(cpu, frozen_cpus) { + for_each_cpu_mask_nr(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0..251358d 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -92,7 +92,7 @@ static void force_quiescent_state(struct rcu_data *rdp, */ cpumask = rcp->cpumask; cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask(cpu, cpumask) + for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } } diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf19..18af270 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -657,7 +657,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) { + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; dyntick_save_progress_counter(cpu); } @@ -675,7 +675,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) if (rcu_try_flip_waitack_needed(cpu) && per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); @@ -707,7 +707,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -722,7 +722,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) { + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; dyntick_save_progress_counter(cpu); } @@ -742,7 +742,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) if (rcu_try_flip_waitmb_needed(cpu) && per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); diff --git a/kernel/sched.c b/kernel/sched.c index 1ed8011..814d6e1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2271,7 +2271,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2313,7 +2313,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, /* Traverse only the allowed CPUs */ cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, *tmp) { + for_each_cpu_mask_nr(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -3296,7 +3296,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -3560,7 +3560,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -4100,7 +4100,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask(balance_cpu, cpus) { + for_each_cpu_mask_nr(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -6832,7 +6832,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(*covered); - for_each_cpu_mask(i, *span) { + for_each_cpu_mask_nr(i, *span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; @@ -6843,7 +6843,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, *span) { + for_each_cpu_mask_nr(j, *span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; @@ -7043,7 +7043,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask(j, sg->cpumask) { + for_each_cpu_mask_nr(j, sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -7068,7 +7068,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask(cpu, *cpu_map) { + for_each_cpu_mask_nr(cpu, *cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7302,7 +7302,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = NULL, *p; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7374,7 +7374,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { SCHED_CPUMASK_VAR(this_sibling_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7391,7 +7391,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7458,7 +7458,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, *nodemask) { + for_each_cpu_mask_nr(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); @@ -7504,21 +7504,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); @@ -7538,7 +7538,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Attach the domains */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -7621,7 +7621,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) unregister_sched_domain_sysctl(); - for_each_cpu_mask(i, *cpu_map) + for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map, &tmpmask); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd3..e398318 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1022,7 +1022,7 @@ static int wake_idle(int cpu, struct task_struct *p) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask_nr(i, tmp) { if (idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 060e87b..d73386c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -231,7 +231,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu_mask_nr(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -272,7 +272,7 @@ static int balance_runtime(struct rt_rq *rt_rq) spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask(i, rd->span) { + for_each_cpu_mask_nr(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -1000,7 +1000,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { + for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4a23517..06b1754 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) return -EINVAL; if (isadd == REGISTER) { - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) @@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) /* Deregister or cleanup */ cleanup: - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29fc39f..28c2b2c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -397,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq) might_sleep(); lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); lock_release(&wq->lockdep_map, 1, _THIS_IP_); - for_each_cpu_mask(cpu, *cpu_map) + for_each_cpu_mask_nr(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -477,7 +477,7 @@ static void wait_on_work(struct work_struct *work) wq = cwq->wq; cpu_map = wq_cpu_map(wq); - for_each_cpu_mask(cpu, *cpu_map) + for_each_cpu_mask_nr(cpu, *cpu_map) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -813,7 +813,7 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu_mask(cpu, *cpu_map) + for_each_cpu_mask_nr(cpu, *cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); put_online_cpus(); -- cgit v1.1 From cad0e458d17c643c20c1d38f45a1d26125e6a622 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: clocksource/events: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 4 ++-- kernel/time/tick-broadcast.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dadde53..60ceabd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay * synchronized to each other. */ - int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); + int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); - if (next_cpu >= NR_CPUS) + if (next_cpu >= nr_cpu_ids) next_cpu = first_cpu(cpu_online_map); watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02..2d0a963 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -397,8 +397,7 @@ again: mask = CPU_MASK_NONE; now = ktime_get(); /* Find all expired events */ - for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; - cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) cpu_set(cpu, mask); -- cgit v1.1 From 9c44bc03fff44ff04237a7d92e35304a0e50c331 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:04 +0200 Subject: softlockup: allow panic on lockup allow users to configure the softlockup detector to generate a panic instead of a warning message. high-availability systems might opt for this strict method (combined with panic_timeout= boot option/sysctl), instead of generating softlockup warnings ad infinitum. also, automated tests work better if the system reboots reliably (into a safe kernel) in case of a lockup. The full spectrum of configurability is supported: boot option, sysctl option and Kconfig option. it's default-disabled. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/softlockup.c | 21 +++++++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 2 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 01b6522..78e0ad2 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -27,6 +27,21 @@ static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int __read_mostly did_panic; unsigned long __read_mostly softlockup_thresh = 60; +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * soft-lockup occurs: + */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + static int softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) { @@ -120,6 +135,9 @@ void softlockup_tick(void) else dump_stack(); spin_unlock(&print_lock); + + if (softlockup_panic) + panic("softlockup: hung tasks"); } /* @@ -172,6 +190,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now) t->last_switch_timestamp = now; touch_nmi_watchdog(); + + if (softlockup_panic) + panic("softlockup: blocked tasks"); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2911665..2d3b388 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -729,6 +729,17 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_DETECT_SOFTLOCKUP { .ctl_name = CTL_UNNUMBERED, + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, .maxlen = sizeof(unsigned long), -- cgit v1.1 From 9383d9679056e6cc4e7ff70f31da945a268238f4 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 12 May 2008 21:21:14 +0200 Subject: softlockup: fix softlockup_thresh unaligned access and disable detection at runtime Fix unaligned access errors when setting softlockup_thresh on 64 bit platforms. Allow softlockup detection to be disabled by setting softlockup_thresh <= 0. Detect that boot time softlockup detection has been disabled earlier in softlockup_tick. Signed-off-by: Dimitri Sivanich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/softlockup.c | 12 ++++++++++-- kernel/sysctl.c | 9 +++++---- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 78e0ad2..a3a0b23 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp); static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int __read_mostly did_panic; -unsigned long __read_mostly softlockup_thresh = 60; +int __read_mostly softlockup_thresh = 60; /* * Should we panic (and reboot, if panic_timeout= is set) when a @@ -94,6 +94,14 @@ void softlockup_tick(void) struct pt_regs *regs = get_irq_regs(); unsigned long now; + /* Is detection switched off? */ + if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { + /* Be sure we don't false trigger if switched back on */ + if (touch_timestamp) + per_cpu(touch_timestamp, this_cpu) = 0; + return; + } + if (touch_timestamp == 0) { touch_softlockup_watchdog(); return; @@ -104,7 +112,7 @@ void softlockup_tick(void) /* report at most once a second */ if ((print_timestamp >= touch_timestamp && print_timestamp < (touch_timestamp + 1)) || - did_panic || !per_cpu(watchdog_task, this_cpu)) { + did_panic) { return; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d3b388..31c19a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -84,12 +84,13 @@ extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ -#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) +#ifdef CONFIG_HIGHMEM static int one = 1; #endif #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; +static int neg_one = -1; #endif #ifdef CONFIG_MMU @@ -742,11 +743,11 @@ static struct ctl_table kern_table[] = { .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &neg_one, .extra2 = &sixty, }, { -- cgit v1.1 From 1c4cd6dd1d0fd3057bb6b8c87460049497889d1b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:14 +0200 Subject: softlockup: fix softlockup_thresh fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 31c19a7..a829dc8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -84,7 +84,7 @@ extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ -#ifdef CONFIG_HIGHMEM +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) static int one = 1; #endif -- cgit v1.1 From 02ff375590ac4140d88afc76505df1ad45c6af59 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 15:43:53 +0200 Subject: softlockup: fix false positives on nohz if CPU is 100% idle for more than 60 seconds Fix (probably theoretical only) rq->clock update bug: in tick_nohz_update_jiffies() [which is called on all irq entry on all cpus where the irq entry hits an idle cpu] we call touch_softlockup_watchdog() before we update jiffies. That works fine most of the time when idle timeouts are within 60 seconds. But when an idle timeout is beyond 60 seconds, jiffies is updated with a jump of more than 60 seconds, which causes a jump in cpu-clock of more than 60 seconds, triggering a false positive. Reported-by: David Miller Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a89..28abad6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -133,8 +133,6 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - touch_softlockup_watchdog(); - cpu_clear(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -142,6 +140,8 @@ void tick_nohz_update_jiffies(void) local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); + + touch_softlockup_watchdog(); } void tick_nohz_stop_idle(int cpu) -- cgit v1.1 From 8c2238eaaf0f774ca0f8d9daad7a616429bbb7f1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 27 May 2008 12:23:29 -0500 Subject: softlockup: fix NMI hangs due to lock race - 2.6.26-rc regression The touch_nmi_watchdog() routine on x86 ultimately calls touch_softlockup_watchdog(). The problem is that to touch the softlockup watchdog, the cpu_clock code has to be called which could involve multiple cpu locks and can lead to a hard hang if one of the locks is held by a processor that is not going to return anytime soon (such as could be the case with kgdb or perhaps even with some other kind of exception). This patch causes the public version of the touch_softlockup_watchdog() to defer the cpu clock access to a later point. The test case for this problem is to use the following kernel config options: CONFIG_KGDB_TESTS=y CONFIG_KGDB_TESTS_ON_BOOT=y CONFIG_KGDB_TESTS_BOOT_STRING="V1F100I100000" It should be noted that kgdb test suite and these options were not available until 2.6.26-rc2, so it was necessary to patch the kgdb test suite during the bisection. I would consider this patch a regression fix because the problem first appeared in commit 27ec4407790d075c325e1f4da0a19c56953cce23 when some logic was added to try to periodically sync the clocks. It was possible to work around this particular problem by simply not performing the sync anytime the system was in a critical context. This was ok until commit 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd, which added config option CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and some multi-cpu locks to sync the clocks. It became clear that accessing this code from an nmi was the source of the lockups. Avoiding the access to the low level clock code from an code inside the NMI processing also fixed the problem with the 27ec44... commit. Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index a3a0b23..6b682d8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -64,12 +64,17 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -void touch_softlockup_watchdog(void) +static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); } + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(touch_timestamp) = 0; +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -103,7 +108,7 @@ void softlockup_tick(void) } if (touch_timestamp == 0) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -118,7 +123,7 @@ void softlockup_tick(void) /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -243,7 +248,7 @@ static int watchdog(void *__bind_cpu) sched_setscheduler(current, SCHED_FIFO, ¶m); /* initialize timestamp */ - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); set_current_state(TASK_INTERRUPTIBLE); /* @@ -252,7 +257,7 @@ static int watchdog(void *__bind_cpu) * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) -- cgit v1.1 From 688c91755dc3d3c03d8c67c1df13c02be258768e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 16 Jun 2008 15:51:08 -0700 Subject: softlockup: print a module list on being stuck Most places in the kernel that go BUG: print a module list (which is very useful for doing statistics and finding patterns), however the softlockup detector does not do this yet. This patch adds the one line change to fix this gap. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b682d8..f2bf5de 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -143,6 +143,7 @@ void softlockup_tick(void) printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", this_cpu, now - touch_timestamp, current->comm, task_pid_nr(current)); + print_modules(); if (regs) show_regs(regs); else -- cgit v1.1 From 8d5be7f4e8515af461cbc8f07687ccc81507d508 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 25 Jun 2008 08:50:10 +0200 Subject: softlockup: show irqtrace This patch adds some information about when interrupts were last enabled and disabled to the output of the softlockup detector. Signed-off-by: Vegard Nossum Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index f2bf5de..97977ec 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,7 @@ void softlockup_tick(void) this_cpu, now - touch_timestamp, current->comm, task_pid_nr(current)); print_modules(); + print_irqtrace_events(current); if (regs) show_regs(regs); else -- cgit v1.1 From dd7a1e5615b1719c0fdffee1ea5a7820ac8141a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 27 Jun 2008 15:07:21 +0200 Subject: softlockup: fix watchdog task wakeup frequency Updating the timestamp more often is pointless as we print the warnings only if we exceed the threshold. And the check for hung tasks relies on the last timestamp, so it will keep working correctly, too. Signed-off-by: Johannes Weiner Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 97977ec..d53ab70 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -130,8 +130,11 @@ void softlockup_tick(void) now = get_timestamp(this_cpu); - /* Wake up the high-prio watchdog task every second: */ - if (now > (touch_timestamp + 1)) + /* + * Wake up the high-prio watchdog task twice per + * threshold timespan. + */ + if (now > touch_timestamp + softlockup_thresh/2) wake_up_process(per_cpu(watchdog_task, this_cpu)); /* Warn about unreasonable delays: */ -- cgit v1.1 From 3e2f69fdd1b00166e7d589bce56b2d36a9e74374 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Jul 2008 09:12:04 +0200 Subject: softlockup: fix watchdog task wakeup frequency The print_timestamp can never be bigger than the touch_timestamp, at maximum it can be equal. And if it is, the second check for touch_timestamp + 1 bigger print_timestamp is always true, too. The check for equality is sufficient as we proceed in one-second-steps and are at least one second away from the last print-out if we have another timestamp. Signed-off-by: Johannes Weiner Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d53ab70..7bd8d1a 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -116,11 +116,8 @@ void softlockup_tick(void) print_timestamp = per_cpu(print_timestamp, this_cpu); /* report at most once a second */ - if ((print_timestamp >= touch_timestamp && - print_timestamp < (touch_timestamp + 1)) || - did_panic) { + if (print_timestamp == touch_timestamp || did_panic) return; - } /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { -- cgit v1.1 From d88c16919793a9f5dc93e7956da5bb089c7600b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 08:59:24 +0200 Subject: Revert parts of "ftrace: do not trace scheduler functions" the removal of -mno-spe in the !ftrace case was not intended. Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 985ddb7..15ab63f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +CFLAGS_REMOVE_sched.o = -mno-spe + ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg -- cgit v1.1 From 13b40c1e40f3261e83ee514a08b77dbecb93021b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 1 Jul 2008 10:32:50 -0700 Subject: sched: reduce stack size in isolated_cpu_setup() * Remove 16k stack requirements in isolated_cpu_setup when NR_CPUS=4096. Signed-off-by: Mike Travis Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 99e6d85..1ee18db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6768,7 +6768,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - int ints[NR_CPUS], i; + static int __initdata ints[NR_CPUS]; + int i; str = get_options(str, ARRAY_SIZE(ints), ints); cpus_clear(cpu_isolated_map); -- cgit v1.1 From 7ebefa8ceefed44cc321be70afc54a585a68ac0b Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 1 Jul 2008 23:32:15 +0200 Subject: sched: rework of "prioritize non-migratable tasks over migratable ones" (1) handle in a generic way all cases when a newly woken-up task is not migratable (not just a corner case when "rt_se->nr_cpus_allowed == 1") (2) if current is to be preempted, then make sure "p" will be picked up by pick_next_task_rt(). i.e. move task's group at the head of its list as well. currently, it's not a case for the group-scheduling case as described here: http://www.ussg.iu.edu/hypermail/linux/kernel/0807.0/0134.html Signed-off-by: Dmitry Adamushko Cc: Steven Rostedt Cc: Gregory Haskins Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 68 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 47ceac9..d3d1ccc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -599,11 +599,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - if (rt_se->nr_cpus_allowed == 1) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -688,32 +684,34 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. */ -static -void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +static void +requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) { - struct rt_prio_array *array = &rt_rq->active; - if (on_rt_rq(rt_se)) { - list_del_init(&rt_se->run_list); - list_add_tail(&rt_se->run_list, - array->queue + rt_se_prio(rt_se)); + struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); + + if (head) + list_move(&rt_se->run_list, queue); + else + list_move_tail(&rt_se->run_list, queue); } } -static void requeue_task_rt(struct rq *rq, struct task_struct *p) +static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) { struct sched_rt_entity *rt_se = &p->rt; struct rt_rq *rt_rq; for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - requeue_rt_entity(rt_rq, rt_se); + requeue_rt_entity(rt_rq, rt_se, head); } } static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr); + requeue_task_rt(rq, rq->curr, 0); } #ifdef CONFIG_SMP @@ -753,6 +751,30 @@ static int select_task_rq_rt(struct task_struct *p, int sync) */ return task_cpu(p); } + +static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) +{ + cpumask_t mask; + + if (rq->curr->rt.nr_cpus_allowed == 1) + return; + + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, &mask)) + return; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) + return; + + /* + * There appears to be other cpus that can accept + * current and none to run 'p', so lets reschedule + * to try and push current away: + */ + requeue_task_rt(rq, p, 1); + resched_task(rq->curr); +} + #endif /* CONFIG_SMP */ /* @@ -778,18 +800,8 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) * to move current somewhere else, making room for our non-migratable * task. */ - if((p->prio == rq->curr->prio) - && p->rt.nr_cpus_allowed == 1 - && rq->curr->rt.nr_cpus_allowed != 1) { - cpumask_t mask; - - if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - /* - * There appears to be other cpus that can accept - * current, so lets reschedule to try and push it away - */ - resched_task(rq->curr); - } + if (p->prio == rq->curr->prio && !need_resched()) + check_preempt_equal_prio(rq, p); #endif } @@ -1415,7 +1427,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) * on the queue: */ if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, p, 0); set_tsk_need_resched(p); } } -- cgit v1.1 From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 04:43:49 -0700 Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2) This is based on Linus' idea of creating cpu_active_map that prevents scheduler load balancer from migrating tasks to the cpu that is going down. It allows us to simplify domain management code and avoid unecessary domain rebuilds during cpu hotplug event handling. Please ignore the cpusets part for now. It needs some more work in order to avoid crazy lock nesting. Although I did simplfy and unify domain reinitialization logic. We now simply call partition_sched_domains() in all the cases. This means that we're using exact same code paths as in cpusets case and hence the test below cover cpusets too. Cpuset changes to make rebuild_sched_domains() callable from various contexts are in the separate patch (right next after this one). This not only boots but also easily handles while true; do make clean; make -j 8; done and while true; do on-off-cpu 1; done at the same time. (on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing). Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing this on right now in gnome-terminal and things are moving just fine. Also this is running with most of the debug features enabled (lockdep, mutex, etc) no BUG_ONs or lockdep complaints so far. I believe I addressed all of the Dmitry's comments for original Linus' version. I changed both fair and rt balancer to mask out non-active cpus. And replaced cpu_is_offline() with !cpu_active() in the main scheduler code where it made sense (to me). Signed-off-by: Max Krasnyanskiy Acked-by: Linus Torvalds Acked-by: Peter Zijlstra Acked-by: Gregory Haskins Cc: dmitry.adamushko@gmail.com Cc: pj@sgi.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 30 ++++++++++++--- kernel/cpuset.c | 2 +- kernel/sched.c | 108 ++++++++++++++++++++++------------------------------ kernel/sched_fair.c | 3 ++ kernel/sched_rt.c | 7 ++++ 5 files changed, 80 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cfb1d43..a1ac7ea 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } +cpumask_t cpu_active_map; + #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu) int err = 0; cpu_maps_update_begin(); - if (cpu_hotplug_disabled) + + if (cpu_hotplug_disabled) { err = -EBUSY; - else - err = _cpu_down(cpu, 0); + goto out; + } + + cpu_clear(cpu, cpu_active_map); + + err = _cpu_down(cpu, 0); + + if (cpu_online(cpu)) + cpu_set(cpu, cpu_active_map); +out: cpu_maps_update_done(); return err; } @@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu) } cpu_maps_update_begin(); - if (cpu_hotplug_disabled) + + if (cpu_hotplug_disabled) { err = -EBUSY; - else - err = _cpu_up(cpu, 0); + goto out; + } + err = _cpu_up(cpu, 0); + + if (cpu_online(cpu)) + cpu_set(cpu, cpu_active_map); + +out: cpu_maps_update_done(); return err; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601..3c3ef02 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * partition_sched_domains(). */ -static void rebuild_sched_domains(void) +void rebuild_sched_domains(void) { struct kfifo *q; /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ee18db..c237624 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(cpu_is_offline(dest_cpu))) + || unlikely(!cpu_active(dest_cpu))) goto out; /* force the process onto the specified CPU */ @@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick) /* * If we are going offline and still the leader, give up! */ - if (cpu_is_offline(cpu) && + if (!cpu_active(cpu) && atomic_read(&nohz.load_balancer) == cpu) { if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); @@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) struct rq *rq_dest, *rq_src; int ret = 0, on_rq; - if (unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(!cpu_active(dest_cpu))) return ret; rq_src = cpu_rq(src_cpu); @@ -7554,18 +7554,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void) } /* - * Free current domain masks. - * Called after all cpus are attached to NULL domain. - */ -static void free_sched_domains(void) -{ - ndoms_cur = 0; - if (doms_cur != &fallback_doms) - kfree(doms_cur); - doms_cur = &fallback_doms; -} - -/* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. @@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ownership of it and will kfree it when done with it. If the caller * failed the kmalloc call, then it can pass in doms_new == NULL, * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms'. + * 'fallback_doms', it also forces the domains to be rebuilt. * * Call with hotplug lock held */ @@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) { - ndoms_new = 1; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; - } + if (doms_new == NULL) + ndoms_new = 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { @@ -7677,6 +7661,14 @@ match1: ; } + if (doms_new == NULL) { + ndoms_cur = 0; + ndoms_new = 1; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; + } + /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { @@ -7707,17 +7699,10 @@ match2: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int arch_reinit_sched_domains(void) { - int err; - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); - err = arch_init_sched_domains(&cpu_online_map); - mutex_unlock(&sched_domains_mutex); + rebuild_sched_domains(); put_online_cpus(); - - return err; + return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -7783,59 +7768,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +#ifndef CONFIG_CPUSETS /* - * Force a reinitialization of the sched domains hierarchy. The domains - * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to the NULL domain - * which will prevent rebalancing while the sched domains are recalculated. + * Add online and remove offline CPUs from the scheduler domains. + * When cpusets are enabled they take over this function. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(0, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ int cpu = (int)(long)hcpu; switch (action) { case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: disable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); return NOTIFY_OK; - case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: enable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Fall through and re-initialise the domains. - */ - break; + return NOTIFY_OK; + default: return NOTIFY_DONE; } - -#ifndef CONFIG_CPUSETS - /* - * Create default domain partitioning if cpusets are disabled. - * Otherwise we let cpusets rebuild the domains based on the - * current setup. - */ - - /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(&cpu_online_map); -#endif - - return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7855,8 +7830,15 @@ void __init sched_init_smp(void) cpu_set(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); + +#ifndef CONFIG_CPUSETS /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2aa987..d924c67 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq) * not idle and an idle cpu is available. The span of cpus to * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. + * Domains may include CPUs that are not usable for migration, + * hence we need to mask them out (cpu_active_map) * * Returns the CPU we should wake onto. */ @@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); + cpus_and(tmp, tmp, cpu_active_map); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { if (i != task_cpu(p)) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d3d1ccc..50735bb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -934,6 +934,13 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No targets found */ /* + * Only consider CPUs that are usable for migration. + * I guess we might want to change cpupri_find() to ignore those + * in the first place. + */ + cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + + /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. -- cgit v1.1 From 39b0fad7121eace85770e7a4c6dc35dfd2879768 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 20:56:26 -0700 Subject: cpu hotplug: Make cpu_active_map synchronization dependency clear This goes on top of the cpu_active_map (take 2) patch. Currently we depend on the stop_machine to provide nescessesary synchronization for the cpu_active_map updates. As Dmitry Adamushko pointed this is fragile and is not much clearer than the previous scheme. In other words we do not want to depend on the internal stop machine operation here. So make the synchronization rules clear by doing synchronize_sched() after clearing out cpu active bit. Tested on quad-Core2 with: while true; do for i in 1 2 3; do echo 0 > /sys/devices/system/cpu/cpu$i/online done for i in 1 2 3; do echo 1 > /sys/devices/system/cpu/cpu$i/online done done and stress -c 200 No lockdep, preempt or other complaints. Signed-off-by: Max Krasnyansky Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/cpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a1ac7ea..033603c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -301,6 +301,16 @@ int __ref cpu_down(unsigned int cpu) cpu_clear(cpu, cpu_active_map); + /* + * Make sure the all cpus did the reschedule and are not + * using stale version of the cpu_active_map. + * This is not strictly necessary becuase stop_machine() + * that we run down the line already provides the required + * synchronization. But it's really a side effect and we do not + * want to depend on the innards of the stop_machine here. + */ + synchronize_sched(); + err = _cpu_down(cpu, 0); if (cpu_online(cpu)) -- cgit v1.1 From 577b4a58d2e74a4d48050eeea3e3f952ce04eb86 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Jul 2008 13:34:54 +0100 Subject: sched: fix warning in inc_rt_tasks() to not declare variable 'rq' if it's not needed Fix inc_rt_tasks() to not declare variable 'rq' if it's not needed. It is declared if CONFIG_SMP or CONFIG_RT_GROUP_SCHED, but only used if CONFIG_SMP. This is a consequence of patch 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 plus patch 1100ac91b6af02d8639d518fad5b434b1bf44ed6. Signed-off-by: David Howells Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 47ceac9..147004c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -505,7 +505,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) { +#ifdef CONFIG_SMP struct rq *rq = rq_of_rt_rq(rt_rq); +#endif rt_rq->highest_prio = rt_se_prio(rt_se); #ifdef CONFIG_SMP -- cgit v1.1 From b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2008 17:27:28 +0200 Subject: nohz: prevent tick stop outside of the idle loop Jack Ren and Eric Miao tracked down the following long standing problem in the NOHZ code: scheduler switch to idle task enable interrupts Window starts here ----> interrupt happens (does not set NEED_RESCHED) irq_exit() stops the tick ----> interrupt happens (does set NEED_RESCHED) return from schedule() cpu_idle(): preempt_disable(); Window ends here The interrupts can happen at any point inside the race window. The first interrupt stops the tick, the second one causes the scheduler to rerun and switch away from idle again and we end up with the tick disabled. The fact that it needs two interrupts where the first one does not set NEED_RESCHED and the second one does made the bug obscure and extremly hard to reproduce and analyse. Kudos to Jack and Eric. Solution: Limit the NOHZ functionality to the idle loop to make sure that we can not run into such a situation ever again. cpu_idle() { preempt_disable(); while(1) { tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we are in the idle loop while (!need_resched()) halt(); tick_nohz_restart_sched_tick(); <- disables NOHZ mode preempt_enable_no_resched(); schedule(); preempt_disable(); } } In hindsight we should have done this forever, but ... /me grabs a large brown paperbag. Debugged-by: Jack Ren , Debugged-by: eric miao Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 2 +- kernel/time/tick-sched.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e0617..05f2480 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -312,7 +312,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(0); rcu_irq_exit(); #endif preempt_enable_no_resched(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 86baa4f..ee962d1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -195,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) * Called either from the idle loop or from irq_exit() when an idle period was * just interrupted by an interrupt which did not cause a reschedule. */ -void tick_nohz_stop_sched_tick(void) +void tick_nohz_stop_sched_tick(int inidle) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; struct tick_sched *ts; @@ -224,6 +224,11 @@ void tick_nohz_stop_sched_tick(void) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; + if (!inidle && !ts->inidle) + goto end; + + ts->inidle = 1; + if (need_resched()) goto end; @@ -372,11 +377,14 @@ void tick_nohz_restart_sched_tick(void) local_irq_disable(); tick_nohz_stop_idle(cpu); - if (!ts->tick_stopped) { + if (!ts->inidle || !ts->tick_stopped) { + ts->inidle = 0; local_irq_enable(); return; } + ts->inidle = 0; + rcu_exit_nohz(); /* Update jiffies first */ -- cgit v1.1 From 4dca10a96041f78bed11ce9e4a5cfde813ec4ccb Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 7 Jul 2008 18:37:04 -0700 Subject: softlockup: fix invalid proc_handler for softlockup_panic The type of softlockup_panic is int, but the proc_handler is proc_doulongvec_minmax(). This handler is for unsigned long. This handler should be proc_dointvec_minmax(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e56d2f..ab59ac0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -741,7 +741,7 @@ static struct ctl_table kern_table[] = { .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one, -- cgit v1.1 From 8df185a95c9b84fc0c3c02224e64fdc5b83bae34 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 8 Jul 2008 15:55:48 -0700 Subject: kthread: reduce stack pressure in create_kthread and kthreadd * Replace: set_cpus_allowed(..., CPU_MASK_ALL) with: set_cpus_allowed_ptr(..., CPU_MASK_ALL_PTR) to remove excessive stack requirements when NR_CPUS=4096. Signed-off-by: Mike Travis Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ac3fb73..6111c27 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create) */ sched_setscheduler(create->result, SCHED_NORMAL, ¶m); set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed(create->result, CPU_MASK_ALL); + set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); } complete(&create->done); } @@ -233,7 +233,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); - set_cpus_allowed(tsk, CPU_MASK_ALL); + set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.1 From 65c011845316d3c1381f478ca0d8265c43b3b039 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:30 -0700 Subject: cpumask: Replace cpumask_of_cpu with cpumask_of_cpu_ptr * This patch replaces the dangerous lvalue version of cpumask_of_cpu with new cpumask_of_cpu_ptr macros. These are patterned after the node_to_cpumask_ptr macros. In general terms, if there is a cpumask_of_cpu_map[] then a pointer to the cpumask_of_cpu_map[cpu] entry is used. The cpumask_of_cpu_map is provided when there is a large NR_CPUS count, reducing greatly the amount of code generated and stack space used for cpumask_of_cpu(). The pointer to the cpumask_t value is needed for calling set_cpus_allowed_ptr() to reduce the amount of stack space needed to pass the cpumask_t value. If there isn't a cpumask_of_cpu_map[], then a temporary variable is declared and filled in with value from cpumask_of_cpu(cpu) as well as a pointer variable pointing to this temporary variable. Afterwards, the pointer is used to reference the cpumask value. The compiler will optimize out the extra dereference through the pointer as well as the stack space used for the pointer, resulting in identical code. A good example of the orthogonal usages is in net/sunrpc/svc.c: case SVC_POOL_PERCPU: { unsigned int cpu = m->pool_to[pidx]; cpumask_of_cpu_ptr(cpumask, cpu); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask); return 1; } case SVC_POOL_PERNODE: { unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, nodecpumask); return 1; } Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 3 ++- kernel/trace/trace_sysprof.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba9b205..738b411 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -33,8 +33,9 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; + cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); + set_cpus_allowed_ptr(current, cpumask); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 2301e1e..6352808 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -213,7 +213,9 @@ static void start_stack_timers(void) int cpu; for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + cpumask_of_cpu_ptr(new_mask, cpu); + + set_cpus_allowed_ptr(current, new_mask); start_stack_timer(cpu); } set_cpus_allowed_ptr(current, &saved_mask); -- cgit v1.1 From c18a41fbbc500ac0307ffd2b0ae73c2af9d0b0ab Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:34 -0700 Subject: cpumask: Optimize cpumask_of_cpu in kernel/time/tick-common.c * Optimize various places where a pointer to the cpumask_of_cpu value will result in reducing stack pressure. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 4f38865..bf43284 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -135,7 +135,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - cpumask_t cpumask) + const cpumask_t *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -169,8 +169,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, cpumask)) - irq_set_affinity(newdev->irq, cpumask); + if (!cpus_equal(newdev->cpumask, *cpumask)) + irq_set_affinity(newdev->irq, *cpumask); /* * When global broadcasting is active, check if the current @@ -196,20 +196,20 @@ static int tick_check_new_device(struct clock_event_device *newdev) struct tick_device *td; int cpu, ret = NOTIFY_OK; unsigned long flags; - cpumask_t cpumask; + cpumask_of_cpu_ptr_declare(cpumask); spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); + cpumask_of_cpu_ptr_next(cpumask, cpu); if (!cpu_isset(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; - cpumask = cpumask_of_cpu(cpu); /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask)) { + if (!cpus_equal(newdev->cpumask, *cpumask)) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask)) + if (curdev && cpus_equal(curdev->cpumask, *cpumask)) goto out_bc; } -- cgit v1.1 From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Jul 2008 18:01:23 +0200 Subject: sched, x86: clean up hrtick implementation random uvesafb failures were reported against Gentoo: http://bugs.gentoo.org/show_bug.cgi?id=222799 and Mihai Moldovan bisected it back to: > 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit > commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f > Author: Peter Zijlstra > Date: Fri Jan 25 21:08:29 2008 +0100 > > sched: high-res preemption tick Linus suspected it to be hrtick + vm86 interaction and observed: > Btw, Peter, Ingo: I think that commit is doing bad things. They aren't > _incorrect_ per se, but they are definitely bad. > > Why? > > Using random _TIF_WORK_MASK flags is really impolite for doing > "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S > special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of > vm86 mode unnecessarily. > > See the "work_notifysig_v86" label, and how it does that > "save_v86_state()" thing etc etc. Right, I never liked having to fiddle with those TIF flags. Initially I needed it because the hrtimer base lock could not nest in the rq lock. That however is fixed these days. Currently the only reason left to fiddle with the TIF flags is remote wakeups. We cannot program a remote cpu's hrtimer. I've been thinking about using the new and improved IPI function call stuff to implement hrtimer_start_on(). However that does require that smp_call_function_single(.wait=0) works from interrupt context - /me looks at the latest series from Jens - Yes that does seem to be supported, good. Here's a stab at cleaning this stuff up ... Mihai reported test success as well. Signed-off-by: Peter Zijlstra Tested-by: Mihai Moldovan Cc: Michal Januszewski Cc: Antonino Daplas Signed-off-by: Ingo Molnar --- kernel/Kconfig.hz | 2 +- kernel/sched.c | 202 ++++++++++++++++------------------------------------ kernel/sched_fair.c | 5 +- 3 files changed, 63 insertions(+), 146 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 526128a..2a202a8 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && X86 + def_bool HIGH_RES_TIMERS diff --git a/kernel/sched.c b/kernel/sched.c index 1ee18db..c13c75e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -571,8 +571,10 @@ struct rq { #endif #ifdef CONFIG_SCHED_HRTICK - unsigned long hrtick_flags; - ktime_t hrtick_expire; +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif struct hrtimer hrtick_timer; #endif @@ -983,13 +985,6 @@ static struct rq *this_rq_lock(void) return rq; } -static void __resched_task(struct task_struct *p, int tif_bit); - -static inline void resched_task(struct task_struct *p) -{ - __resched_task(p, TIF_NEED_RESCHED); -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -1001,25 +996,6 @@ static inline void resched_task(struct task_struct *p) * When we get rescheduled we reprogram the hrtick_timer outside of the * rq->lock. */ -static inline void resched_hrt(struct task_struct *p) -{ - __resched_task(p, TIF_HRTICK_RESCHED); -} - -static inline void resched_rq(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - resched_task(rq->curr); - spin_unlock_irqrestore(&rq->lock, flags); -} - -enum { - HRTICK_SET, /* re-programm hrtick_timer */ - HRTICK_RESET, /* not a new slice */ - HRTICK_BLOCK, /* stop hrtick operations */ -}; /* * Use hrtick when: @@ -1030,40 +1006,11 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; - if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + if (!cpu_online(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay, int reset) -{ - assert_spin_locked(&rq->lock); - - /* - * preempt at: now + delay - */ - rq->hrtick_expire = - ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); - /* - * indicate we need to program the timer - */ - __set_bit(HRTICK_SET, &rq->hrtick_flags); - if (reset) - __set_bit(HRTICK_RESET, &rq->hrtick_flags); - - /* - * New slices are called from the schedule path and don't need a - * forced reschedule. - */ - if (reset) - resched_hrt(rq->curr); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -1071,32 +1018,6 @@ static void hrtick_clear(struct rq *rq) } /* - * Update the timer from the possible pending state. - */ -static void hrtick_set(struct rq *rq) -{ - ktime_t time; - int set, reset; - unsigned long flags; - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - spin_lock_irqsave(&rq->lock, flags); - set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); - reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); - time = rq->hrtick_expire; - clear_thread_flag(TIF_HRTICK_RESCHED); - spin_unlock_irqrestore(&rq->lock, flags); - - if (set) { - hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); - if (reset && !hrtimer_active(&rq->hrtick_timer)) - resched_rq(rq); - } else - hrtick_clear(rq); -} - -/* * High-resolution timer tick. * Runs from hardirq context with interrupts disabled. */ @@ -1115,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP -static void hotplug_hrtick_disable(int cpu) +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - rq->hrtick_flags = 0; - __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); - spin_unlock_irqrestore(&rq->lock, flags); + struct rq *rq = arg; - hrtick_clear(rq); + spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + spin_unlock(&rq->lock); } -static void hotplug_hrtick_enable(int cpu) +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - spin_lock_irqsave(&rq->lock, flags); - __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); - spin_unlock_irqrestore(&rq->lock, flags); + timer->expires = time; + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + rq->hrtick_csd_pending = 1; + } } static int @@ -1150,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - hotplug_hrtick_disable(cpu); - return NOTIFY_OK; - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hotplug_hrtick_enable(cpu); + hrtick_clear(cpu_rq(cpu)); return NOTIFY_OK; } @@ -1170,46 +1092,45 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -#endif /* CONFIG_SMP */ +#else +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); +} -static void init_rq_hrtick(struct rq *rq) +static void init_hrtick(void) { - rq->hrtick_flags = 0; - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +#endif /* CONFIG_SMP */ -void hrtick_resched(void) +static void init_rq_hrtick(struct rq *rq) { - struct rq *rq; - unsigned long flags; +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; - if (!test_thread_flag(TIF_HRTICK_RESCHED)) - return; + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif - local_irq_save(flags); - rq = cpu_rq(smp_processor_id()); - hrtick_set(rq); - local_irq_restore(flags); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } #else static inline void hrtick_clear(struct rq *rq) { } -static inline void hrtick_set(struct rq *rq) -{ -} - static inline void init_rq_hrtick(struct rq *rq) { } -void hrtick_resched(void) -{ -} - static inline void init_hrtick(void) { } @@ -1228,16 +1149,16 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, tif_bit))) + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; - set_tsk_thread_flag(p, tif_bit); + set_tsk_thread_flag(p, TIF_NEED_RESCHED); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1303,10 +1224,10 @@ void wake_up_idle_cpu(int cpu) #endif /* CONFIG_NO_HZ */ #else /* !CONFIG_SMP */ -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_thread_flag(p, tif_bit); + set_tsk_need_resched(p); } #endif /* CONFIG_SMP */ @@ -4395,7 +4316,7 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; - int cpu, hrtick = sched_feat(HRTICK); + int cpu; need_resched: preempt_disable(); @@ -4410,7 +4331,7 @@ need_resched_nonpreemptible: schedule_debug(prev); - if (hrtick) + if (sched_feat(HRTICK)) hrtick_clear(rq); /* @@ -4457,9 +4378,6 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - if (hrtick) - hrtick_set(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2aa987..6893b3e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -878,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { - int requeue = rq->curr == p; struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -899,10 +898,10 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) * Don't schedule slices shorter than 10000ns, that just * doesn't make sense. Rely on vruntime for fairness. */ - if (!requeue) + if (rq->curr != p) delta = max(10000LL, delta); - hrtick_start(rq, delta, requeue); + hrtick_start(rq, delta); } } #else /* !CONFIG_SCHED_HRTICK */ -- cgit v1.1 From ba42059fbd0aa1ac91b582412b5fedb1258f241f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 11:02:06 +0200 Subject: sched: hrtick_enabled() should use cpu_active() Peter pointed out that hrtick_enabled() should use cpu_active(). Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 85cf246..62b1b8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1006,7 +1006,7 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; - if (!cpu_online(cpu_of(rq))) + if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -- cgit v1.1 From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Jul 2008 18:48:41 +0200 Subject: sysdev: Pass the attribute to the low level sysdev show/store function This allow to dynamically generate attributes and share show/store functions between attributes. Right now most attributes are generated by special macros and lots of duplicated code. With the attribute passed it's instead possible to attach some data to the attribute and then use that in shared low level functions to do different things. I need this for the dynamically generated bank attributes in the x86 machine check code, but it'll allow some further cleanups. I converted all users in tree to the new show/store prototype. It's a single huge patch to avoid unbisectable sections. Runtime tested: x86-32, x86-64 Compiled only: ia64, powerpc Not compile tested/only grep converted: sh, arm, avr32 Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- kernel/rtmutex-tester.c | 7 ++++--- kernel/sched.c | 8 ++++++-- kernel/time/clocksource.c | 8 ++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 092e4c6..a56f629 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -297,8 +297,8 @@ static int test_func(void *data) * * opcode:data */ -static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, - size_t count) +static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, + const char *buf, size_t count) { struct sched_param schedpar; struct test_thread_data *td; @@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, * @dev: thread to query * @buf: char buffer to be filled with thread status info */ -static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) +static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) { struct test_thread_data *td; struct task_struct *tsk; diff --git a/kernel/sched.c b/kernel/sched.c index 99e6d85..b1104ea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7737,11 +7737,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -7751,11 +7753,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dadde53..b1c2da8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs) * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +sysfs_show_current_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { ssize_t count = 0; @@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) * clocksource selction. */ static ssize_t sysfs_override_clocksource(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { struct clocksource *ovr = NULL; @@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +sysfs_show_available_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; -- cgit v1.1 From da39ba5e1d65e997a98f6eb93ba6e6eb505f6e3c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 19:24:25 -0500 Subject: module: don't use stop_machine for waiting rmmod rmmod has a little-used "-w" option, meaning that instead of failing if the module is in use, it should block until the module becomes unused. In this case, we don't need to use stop_machine: Max Krasnyansky indicated that would be useful for SystemTap which loads/unloads new modules frequently. Cc: Max Krasnyansky Signed-off-by: Rusty Russell --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5f80478..705e1d5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -639,8 +639,8 @@ static int __try_stop_module(void *_sref) { struct stopref *sref = _sref; - /* If it's not unused, quit unless we are told to block. */ - if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { + /* If it's not unused, quit unless we're forcing. */ + if (module_refcount(sref->mod) != 0) { if (!(*sref->forced = try_force_unload(sref->flags))) return -EWOULDBLOCK; } @@ -652,9 +652,16 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - struct stopref sref = { mod, flags, forced }; + if (flags & O_NONBLOCK) { + struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + } else { + /* We don't need to stop the machine for this. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + return 0; + } } unsigned int module_refcount(struct module *mod) -- cgit v1.1 From dafd0940c96fec67974a88ed8e6b8ba3160394cd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 19:24:25 -0500 Subject: module: generic each_symbol iterator function Introduce an each_symbol() iterator to avoid duplicating the knowledge about the 5 different sections containing symbols. Currently only used by find_symbol(), but will be used by symbol_put_addr() too. (Includes NULL ptr deref fix by Jiri Kosina ) Signed-off-by: Rusty Russell Cc: Jiri Kosina --- kernel/module.c | 244 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 134 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 705e1d5..c51c089 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -152,156 +152,180 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - -static bool always_ok(bool gplok, bool warn, const char *name) -{ - return true; -} - -static bool printk_unused_warning(bool gplok, bool warn, const char *name) -{ - if (warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); - } - return true; -} - -static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name) -{ - if (!gplok) - return false; - return printk_unused_warning(gplok, warn, name); -} - -static bool gpl_only(bool gplok, bool warn, const char *name) -{ - return gplok; -} - -static bool warn_if_not_gpl(bool gplok, bool warn, const char *name) -{ - if (!gplok && warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); - } - return true; -} - struct symsearch { const struct kernel_symbol *start, *stop; const unsigned long *crcs; - bool (*check)(bool gplok, bool warn, const char *name); + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; }; -/* Look through this array of symbol tables for a symbol match which - * passes the check function. */ -static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, - unsigned int num, - const char *name, - bool gplok, - bool warn, - const unsigned long **crc) +static bool each_symbol_in_section(const struct symsearch *arr, + unsigned int arrsize, + struct module *owner, + bool (*fn)(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data), + void *data) { - unsigned int i; - const struct kernel_symbol *ks; - - for (i = 0; i < num; i++) { - ks = lookup_symbol(name, arr[i].start, arr[i].stop); - if (!ks || !arr[i].check(gplok, warn, name)) - continue; + unsigned int i, j; - if (crc) - *crc = symversion(arr[i].crcs, ks - arr[i].start); - return ks; + for (j = 0; j < arrsize; j++) { + for (i = 0; i < arr[j].stop - arr[j].start; i++) + if (fn(&arr[j], owner, i, data)) + return true; } - return NULL; + + return false; } -/* Find a symbol, return value, (optional) crc and (optional) module - * which owns it */ -static unsigned long find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +/* Returns true as soon as fn returns true, otherwise false. */ +static bool each_symbol(bool (*fn)(const struct symsearch *arr, + struct module *owner, + unsigned int symnum, void *data), + void *data) { struct module *mod; - const struct kernel_symbol *ks; const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - always_ok }, + NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, gpl_only }, + __start___kcrctab_gpl, + GPL_ONLY, false }, { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, warn_if_not_gpl }, + __start___kcrctab_gpl_future, + WILL_BE_GPL_ONLY, false }, { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, printk_unused_warning }, + __start___kcrctab_unused, + NOT_GPL_ONLY, true }, { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, gpl_only_unused_warning }, + __start___kcrctab_unused_gpl, + GPL_ONLY, true }, }; - /* Core kernel first. */ - ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = NULL; - return ks->value; - } + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) + return true; - /* Now try modules. */ list_for_each_entry(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, - always_ok }, + NOT_GPL_ONLY, false }, { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, gpl_only }, + mod->gpl_crcs, + GPL_ONLY, false }, { mod->gpl_future_syms, mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, warn_if_not_gpl }, + mod->gpl_future_crcs, + WILL_BE_GPL_ONLY, false }, { mod->unused_syms, mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, printk_unused_warning }, + mod->unused_crcs, + NOT_GPL_ONLY, true }, { mod->unused_gpl_syms, mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, gpl_only_unused_warning }, + mod->unused_gpl_crcs, + GPL_ONLY, true }, }; - ks = search_symarrays(arr, ARRAY_SIZE(arr), - name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = mod; - return ks->value; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) + return true; + } + return false; +} + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const unsigned long *crc; + unsigned long value; +}; + +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) +{ + struct find_symbol_arg *fsa = data; + + if (strcmp(syms->start[symnum].name, fsa->name) != 0) + return false; + + if (!fsa->gplok) { + if (syms->licence == GPL_ONLY) + return false; + if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", fsa->name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more details.\n"); } } + if (syms->unused && fsa->warn) { + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", fsa->name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); + } + + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, symnum); + fsa->value = syms->start[symnum].value; + return true; +} + +/* Find a symbol, return value, (optional) crc and (optional) module + * which owns it */ +static unsigned long find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) +{ + struct find_symbol_arg fsa; + + fsa.name = name; + fsa.gplok = gplok; + fsa.warn = warn; + + if (each_symbol(find_symbol_in_section, &fsa)) { + if (owner) + *owner = fsa.owner; + if (crc) + *crc = fsa.crc; + return fsa.value; + } + DEBUGP("Failed to find symbol %s\n", name); return -ENOENT; } +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { -- cgit v1.1 From f7f5b67557eac1131ba6532522e3c50eced34238 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 22 Jul 2008 19:24:26 -0500 Subject: Shrink struct module: CONFIG_UNUSED_SYMBOLS ifdefs module.c and module.h conatains code for finding exported symbols which are declared with EXPORT_UNUSED_SYMBOL, and this code is compiled in even if CONFIG_UNUSED_SYMBOLS is not set and thus there can be no EXPORT_UNUSED_SYMBOLs in modules anyway (because EXPORT_UNUSED_SYMBOL(x) are compiled out to nothing then). This patch adds required #ifdefs. Signed-off-by: Denys Vlasenko Signed-off-by: Rusty Russell --- kernel/module.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c51c089..ea95805 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -134,17 +134,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; +#ifdef CONFIG_UNUSED_SYMBOLS +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const unsigned long __start___kcrctab_unused[]; extern const unsigned long __start___kcrctab_unused_gpl[]; +#endif #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -198,12 +200,14 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, __start___kcrctab_gpl_future, WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { __start___ksymtab_unused, __stop___ksymtab_unused, __start___kcrctab_unused, NOT_GPL_ONLY, true }, { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, __start___kcrctab_unused_gpl, GPL_ONLY, true }, +#endif }; if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) @@ -220,6 +224,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, mod->gpl_future_syms + mod->num_gpl_future_syms, mod->gpl_future_crcs, WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->unused_syms + mod->num_unused_syms, mod->unused_crcs, @@ -228,6 +233,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, mod->unused_gpl_syms + mod->num_unused_gpl_syms, mod->unused_gpl_crcs, GPL_ONLY, true }, +#endif }; if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) @@ -270,6 +276,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } } +#ifdef CONFIG_UNUSED_SYMBOLS if (syms->unused && fsa->warn) { printk(KERN_WARNING "Symbol %s is marked as UNUSED, " "however this module is using it.\n", fsa->name); @@ -281,6 +288,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, "mailinglist together with submitting your code for " "inclusion.\n"); } +#endif fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); @@ -1476,8 +1484,10 @@ static int verify_export_symbols(struct module *mod) { mod->syms, mod->num_syms }, { mod->gpl_syms, mod->num_gpl_syms }, { mod->gpl_future_syms, mod->num_gpl_future_syms }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->num_unused_syms }, { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, +#endif }; for (i = 0; i < ARRAY_SIZE(arr); i++) { @@ -1795,10 +1805,12 @@ static struct module *load_module(void __user *umod, unsigned int gplfutureindex; unsigned int gplfuturecrcindex; unsigned int unwindex = 0; +#ifdef CONFIG_UNUSED_SYMBOLS unsigned int unusedindex; unsigned int unusedcrcindex; unsigned int unusedgplindex; unsigned int unusedgplcrcindex; +#endif unsigned int markersindex; unsigned int markersstringsindex; struct module *mod; @@ -1881,13 +1893,15 @@ static struct module *load_module(void __user *umod, exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); +#ifdef CONFIG_UNUSED_SYMBOLS + unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); + unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); +#endif setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); @@ -2049,14 +2063,15 @@ static struct module *load_module(void __user *umod, mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / sizeof(*mod->gpl_future_syms); - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->num_unused_syms = sechdrs[unusedindex].sh_size / + sizeof(*mod->unused_syms); + mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / + sizeof(*mod->unused_gpl_syms); mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; if (unusedcrcindex) mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; @@ -2064,13 +2079,17 @@ static struct module *load_module(void __user *umod, if (unusedgplcrcindex) mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; +#endif #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) || - (mod->num_gpl_syms && !gplcrcindex) || - (mod->num_gpl_future_syms && !gplfuturecrcindex) || - (mod->num_unused_syms && !unusedcrcindex) || - (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { + if ((mod->num_syms && !crcindex) + || (mod->num_gpl_syms && !gplcrcindex) + || (mod->num_gpl_future_syms && !gplfuturecrcindex) +#ifdef CONFIG_UNUSED_SYMBOLS + || (mod->num_unused_syms && !unusedcrcindex) + || (mod->num_unused_gpl_syms && !unusedgplcrcindex) +#endif + ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); err = try_to_force_load(mod, "nocrc"); if (err) -- cgit v1.1 From 2f0f2a334bc38b61a9afca951185cd3844ee709d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 22 Jul 2008 19:24:27 -0500 Subject: module: turn longs into ints for module sizes This shrinks module.o and each *.ko file. And finally, structure members which hold length of module code (four such members there) and count of symbols are converted from longs to ints. We cannot possibly have a module where 32 bits won't be enough to hold such counts. For one, module loading checks module size for sanity before loading, so such insanely big module will fail that test first. Signed-off-by: Denys Vlasenko Signed-off-by: Rusty Russell --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ea95805..5c7eb069 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1567,7 +1567,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, } /* Update size with this section: return offset. */ -static long get_offset(unsigned long *size, Elf_Shdr *sechdr) +static long get_offset(unsigned int *size, Elf_Shdr *sechdr) { long ret; @@ -2562,7 +2562,7 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; - seq_printf(m, "%s %lu", + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); -- cgit v1.1 From 3a642e99babe0617febb6f402e1e063479f489db Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 19:24:28 -0500 Subject: modules: Take a shortcut for checking if an address is in a module This patch keeps track of the boundaries of module allocation, in order to speed up module_text_address(). Inspired by Arjan's version, which required arch-specific defines: Various pieces of the kernel (lockdep, latencytop, etc) tend to store backtraces, sometimes at a relatively high frequency. In itself this isn't a big performance deal (after all you're using diagnostics features), but there have been some complaints from people who have over 100 modules loaded that this is a tad too slow. This is due to the new backtracer code which looks at every slot on the stack to see if it's a kernel/module text address, so that's 1024 slots. 1024 times 100 modules... that's a lot of list walking. Signed-off-by: Rusty Russell --- kernel/module.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5c7eb069..d8b5605 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); +/* Bounds of module allocation, for speeding __module_text_address */ +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + int register_module_notifier(struct notifier_block * nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -1779,6 +1782,20 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +static void *module_alloc_update_bounds(unsigned long size) +{ + void *ret = module_alloc(size); + + if (ret) { + /* Update module bounds. */ + if ((unsigned long)ret < module_addr_min) + module_addr_min = (unsigned long)ret; + if ((unsigned long)ret + size > module_addr_max) + module_addr_max = (unsigned long)ret + size; + } + return ret; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static struct module *load_module(void __user *umod, @@ -1980,7 +1997,7 @@ static struct module *load_module(void __user *umod, layout_sections(mod, hdr, sechdrs, secstrings); /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc_update_bounds(mod->core_size); if (!ptr) { err = -ENOMEM; goto free_percpu; @@ -1988,7 +2005,7 @@ static struct module *load_module(void __user *umod, memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc(mod->init_size); + ptr = module_alloc_update_bounds(mod->init_size); if (!ptr && mod->init_size) { err = -ENOMEM; goto free_core; @@ -2645,6 +2662,9 @@ struct module *__module_text_address(unsigned long addr) { struct module *mod; + if (addr < module_addr_min || addr > module_addr_max) + return NULL; + list_for_each_entry(mod, &modules, list) if (within(addr, mod->module_init, mod->init_text_size) || within(addr, mod->module_core, mod->core_text_size)) -- cgit v1.1 From a1ef5adb4cad43460ebba23c5a78cf4a55bb6a5b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Jul 2008 19:00:17 +0200 Subject: remove CONFIG_KMOD from core kernel code Always compile request_module when the kernel allows modules. Signed-off-by: Johannes Berg Signed-off-by: Rusty Russell --- kernel/exec_domain.c | 2 +- kernel/kmod.c | 2 +- kernel/sysctl.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index a9e6bad..c1ef192 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality) goto out; } -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); request_module("personality-%ld", pers); read_lock(&exec_domains_lock); diff --git a/kernel/kmod.c b/kernel/kmod.c index 8df97d3..90d7af1 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -42,7 +42,7 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES /* modprobe_path is set via /proc/sys. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6b16e16..b859e6b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -110,7 +110,7 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES extern char modprobe_path[]; #endif #ifdef CONFIG_CHR_DEV_SG @@ -475,7 +475,7 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, .procname = "modprobe", -- cgit v1.1 From 91cd4d6ef0abb1f65e81f8fe37e7d3c10344e38c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 21 Jul 2008 14:21:35 -0700 Subject: cpusets: fix wrong domain attr updates Fix wrong domain attr updates, or we will always update the first sched domain attr. Signed-off-by: Miao Xie Cc: Hidetoshi Seto Cc: Paul Jackson Cc: Nick Piggin Cc: Ingo Molnar Cc: [2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601..d2cc67d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -679,7 +679,9 @@ restart: if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; - update_domain_attr(dattr, b); + if (dattr) + update_domain_attr(dattr + + nslot, b); } } nslot++; -- cgit v1.1 From 5f17156fc55abac476d180e480bedb0f07f01b14 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Mon, 21 Jul 2008 14:21:37 -0700 Subject: Fix build on COMPAT platforms when CONFIG_EPOLL is disabled Add missing cond_syscall() entry for compat_sys_epoll_pwait. Signed-off-by: Atsushi Nemoto Cc: Davide Libenzi Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5b9b467..0fea0ee 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -59,6 +59,7 @@ cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); cond_syscall(sys_epoll_pwait); +cond_syscall(compat_sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); -- cgit v1.1 From 422037bafde8083acc3c539ceba3dfc60a04110c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Jul 2008 11:16:38 +0200 Subject: sched: fix hrtick & generic-ipi dependency Andrew Morton reported this s390 allmodconfig build failure: kernel/built-in.o: In function `hrtick_start_fair': sched.c:(.text+0x69c6): undefined reference to `__smp_call_function_single' the reason is that s390 is not a generic-ipi SMP platform yet, while the hrtick code relies on it. Fix the dependency. Signed-off-by: Ingo Molnar --- kernel/Kconfig.hz | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a8..382dd5a 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS + def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS -- cgit v1.1 From 2db873211ba47ef704c301f9ecf4a33413a0b649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 23 Jul 2008 14:42:25 +0200 Subject: set_irq_wake: fix return code and wake status tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 15a647eba94c3da27ccc666bea72e7cca06b2d19 set_irq_wake returned -ENXIO if another device had it already enabled. Zero is the right value to return in this case. Moreover the change to desc->status was not reverted if desc->chip->set_wake returned an error. Signed-off-by: Uwe Kleine-König Acked-by: David Brownell Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77a51be..3cfc0fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -217,6 +217,17 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); +int set_irq_wake_real(unsigned int irq, unsigned int on) +{ + struct irq_desc *desc = irq_desc + irq; + int ret = -ENXIO; + + if (desc->chip->set_wake) + ret = desc->chip->set_wake(irq, on); + + return ret; +} + /** * set_irq_wake - control irq power management wakeup * @irq: interrupt to control @@ -233,30 +244,34 @@ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; - int ret = -ENXIO; - int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ spin_lock_irqsave(&desc->lock, flags); if (on) { - if (desc->wake_depth++ == 0) - desc->status |= IRQ_WAKEUP; - else - set_wake = NULL; + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + desc->status |= IRQ_WAKEUP; + } } else { if (desc->wake_depth == 0) { printk(KERN_WARNING "Unbalanced IRQ %d " "wake disable\n", irq); WARN_ON(1); - } else if (--desc->wake_depth == 0) - desc->status &= ~IRQ_WAKEUP; - else - set_wake = NULL; + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + desc->status &= ~IRQ_WAKEUP; + } } - if (set_wake) - ret = desc->chip->set_wake(irq, on); + spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.1 From 86a1c34a929f30fde8ad01ea8245df61ddcf58b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 23 Jun 2008 15:37:04 -0700 Subject: x86_64 syscall audit fast-path This adds a fast path for 64-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. This path does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- kernel/auditsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c10e7aa..4699950 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1476,7 +1476,8 @@ void audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - BUG_ON(!context); + if (unlikely(!context)) + return; /* * This happens only on certain architectures that make system -- cgit v1.1 From 58838cf3ca3337d76141c33d6c68376490263468 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jul 2008 12:43:13 +0200 Subject: sched: clean up compiler warning Reported-by: Daniel Walker Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 147004c..93ac8ee 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -253,7 +253,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { - do_div(diff, weight); + diff = div_u64((u64)diff, weight); if (rt_rq->rt_runtime + diff > rt_period) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; -- cgit v1.1 From c748e1340e0de3fa7fed86f8bdf499be9242afff Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:03 -0700 Subject: mm/vmstat.c: proper externs This patch adds proper extern declarations for five variables in include/linux/vmstat.h Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2a7b9d8..1f7b3b7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -80,7 +81,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; -extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST -- cgit v1.1 From a1e78772d72b2616ed20e54896e68e0e7044854e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:23 -0700 Subject: hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork() This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in a similar manner to the reservations taken for MAP_SHARED mappings. The reserve count is accounted both globally and on a per-VMA basis for private mappings. This guarantees that a process that successfully calls mmap() will successfully fault all pages in the future unless fork() is called. The characteristics of private mappings of hugetlbfs files behaviour after this patch are; 1. The process calling mmap() is guaranteed to succeed all future faults until it forks(). 2. On fork(), the parent may die due to SIGKILL on writes to the private mapping if enough pages are not available for the COW. For reasonably reliable behaviour in the face of a small huge page pool, children of hugepage-aware processes should not reference the mappings; such as might occur when fork()ing to exec(). 3. On fork(), the child VMAs inherit no reserves. Reads on pages already faulted by the parent will succeed. Successful writes will depend on enough huge pages being free in the pool. 4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper and at fault time otherwise. Before this patch, all reads or writes in the child potentially needs page allocations that can later lead to the death of the parent. This applies to reads and writes of uninstantiated pages as well as COW. After the patch it is only a write to an instantiated page that causes problems. Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index adefc11..552c8d8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -307,6 +308,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; -- cgit v1.1 From e5ff215941d59f8ae6bf58f6428dc5c26745a612 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:42 -0700 Subject: hugetlb: multiple hstates for multiple page sizes Add basic support for more than one hstate in hugetlbfs. This is the key to supporting multiple hugetlbfs page sizes at once. - Rather than a single hstate, we now have an array, with an iterator - default_hstate continues to be the struct hstate which we use by default - Add functions for architectures to register new hstates [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1f7b3b7..1a8299d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -959,7 +959,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", - .data = &max_huge_pages, + .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_sysctl_handler, @@ -985,10 +985,12 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &sysctl_overcommit_huge_pages, - .maxlen = sizeof(sysctl_overcommit_huge_pages), + .data = NULL, + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_overcommit_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, }, #endif { -- cgit v1.1 From ab763c7112ce0e2559c73f921617c81dc7287ca6 Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Wed, 23 Jul 2008 21:28:25 -0700 Subject: security: filesystem capabilities refactor kernel code To date, we've tried hard to confine filesystem support for capabilities to the security modules. This has left a lot of the code in kernel/capability.c in a state where it looks like it supports something that filesystem support for capabilities actually suppresses when the LSM security/commmoncap.c code runs. What is left is a lot of code that uses sub-optimal locking in the main kernel With this change we refactor the main kernel code and make it explicit which locks are needed and that the only remaining kernel races in this area are associated with non-filesystem capability code. Signed-off-by: Andrew G. Morgan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 338 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 221 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 901e0fd..0101e84 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) return 0; } +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + +/* + * Without filesystem capability support, we nominally support one process + * setting the capabilities of another + */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + struct task_struct *target; + int ret; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + if (pid && pid != task_pid_vnr(current)) { + target = find_task_by_vpid(pid); + if (!target) { + ret = -ESRCH; + goto out; + } + } else + target = current; + + ret = security_capget(target, pEp, pIp, pPp); + +out: + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} + +/* + * cap_set_pg - set capabilities for all processes in a given process + * group. We call this holding task_capability_lock and tasklist_lock. + */ +static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *g, *target; + int ret = -EPERM; + int found = 0; + struct pid *pgrp; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + pgrp = find_vpid(pgrp_nr); + do_each_pid_task(pgrp, PIDTYPE_PGID, g) { + target = g; + while_each_thread(g, target) { + if (!security_capset_check(target, effective, + inheritable, permitted)) { + security_capset_set(target, effective, + inheritable, permitted); + ret = 0; + } + found = 1; + } + } while_each_pid_task(pgrp, PIDTYPE_PGID, g); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!found) + ret = 0; + return ret; +} + /* - * For sys_getproccap() and sys_setproccap(), any of the three - * capability set pointers may be NULL -- indicating that that set is - * uninteresting and/or not to be changed. + * cap_set_all - set capabilities for all processes other than init + * and self. We call this holding task_capability_lock and tasklist_lock. */ +static inline int cap_set_all(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *g, *target; + int ret = -EPERM; + int found = 0; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + do_each_thread(g, target) { + if (target == current + || is_container_init(target->group_leader)) + continue; + found = 1; + if (security_capset_check(target, effective, inheritable, + permitted)) + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); + } while_each_thread(g, target); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!found) + ret = 0; + + return ret; +} + +/* + * Given the target pid does not refer to the current process we + * need more elaborate support... (This support is not present when + * filesystem capabilities are configured.) + */ +static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *target; + int ret; + + if (!capable(CAP_SETPCAP)) + return -EPERM; + + if (pid == -1) /* all procs other than current and init */ + return cap_set_all(effective, inheritable, permitted); + + else if (pid < 0) /* all procs in process group */ + return cap_set_pg(-pid, effective, inheritable, permitted); + + /* target != current */ + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else { + ret = security_capset_check(target, effective, inheritable, + permitted); + + /* having verified that the proposed changes are legal, + we now put them into effect. */ + if (!ret) + security_capset_set(target, effective, inheritable, + permitted); + } + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} + +#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ + +/* + * If we have configured with filesystem capability support, then the + * only thing that can change the capabilities of the current process + * is the current process. As such, we can't be in this code at the + * same time as we are in the process of setting capabilities in this + * process. The net result is that we can limit our use of locks to + * when we are reading the caps of another process. + */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + int ret; + + if (pid && (pid != task_pid_vnr(current))) { + struct task_struct *target; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else + ret = security_capget(target, pEp, pIp, pPp); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + } else + ret = security_capget(current, pEp, pIp, pPp); + + return ret; +} + +/* + * With filesystem capability support configured, the kernel does not + * permit the changing of capabilities in one process by another + * process. (CAP_SETPCAP has much less broad semantics when configured + * this way.) + */ +static inline int do_sys_capset_other_tasks(pid_t pid, + kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + return -EPERM; +} + +#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ /* * Atomically modify the effective capabilities returning the original @@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; @@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) if (pid < 0) return -EINVAL; - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, &pE, &pI, &pP); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); + ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; @@ -216,7 +396,6 @@ out: * before modification is attempted and the application * fails. */ - if (copy_to_user(dataptr, kdata, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; @@ -226,70 +405,8 @@ out: return ret; } -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, - permitted)) { - security_capset_set(target, effective, - inheritable, - permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - do_each_thread(g, target) { - if (target == current || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - if (!found) - ret = 0; - return ret; -} - /** - * sys_capset - set capabilities for a process or a group of processes + * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, @@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - struct task_struct *target; int ret; pid_t pid; @@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; - if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP)) - return -EPERM; - if (copy_from_user(&kdata, data, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; @@ -344,40 +457,31 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid > 0 && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = 0; - - /* having verified that the proposed changes are legal, - we now put them into effect. */ - if (pid < 0) { - if (pid == -1) /* all procs other than current and init */ - ret = cap_set_all(&effective, &inheritable, &permitted); + if (pid && (pid != task_pid_vnr(current))) + ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, + &permitted); + else { + /* + * This lock is required even when filesystem + * capability support is configured - it protects the + * sys_capget() call from returning incorrect data in + * the case that the targeted process is not the + * current one. + */ + spin_lock(&task_capability_lock); - else /* all procs in process group */ - ret = cap_set_pg(-pid, &effective, &inheritable, - &permitted); - } else { - ret = security_capset_check(target, &effective, &inheritable, + ret = security_capset_check(current, &effective, &inheritable, &permitted); + /* + * Having verified that the proposed changes are + * legal, we now put them into effect. + */ if (!ret) - security_capset_set(target, &effective, &inheritable, + security_capset_set(current, &effective, &inheritable, &permitted); + spin_unlock(&task_capability_lock); } -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); return ret; } -- cgit v1.1 From 0d63081d418c73cc187c893069e0f24c4c6eecd3 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 23 Jul 2008 21:28:32 -0700 Subject: swsusp: provide users with a hint about the no_console_suspend option Tell the user about the no_console_suspend option, so that we don't have to tell each bug reporter personally. [akpm@linux-foundation.org: clarify the text a little] Signed-off-by: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 07ad9e7..3f7a2a9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -933,7 +933,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s)\n"); + printk("Suspending console(s) (use no_console_suspend to debug)\n"); acquire_console_sem(); console_suspended = 1; } -- cgit v1.1 From 77437fd4e61f87cc94d9314baa5cbf50e3ccdf54 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 23 Jul 2008 21:28:33 -0700 Subject: pm: boot time suspend selftest Boot-time test for system suspend states (STR or standby). The generic RTC framework triggers wakeup alarms, which are used to exit those states. - Measures some aspects of suspend time ... this uses "jiffies" until someone converts it to use a timebase that works properly even while timer IRQs are disabled. - Triggered by a command line parameter. By default nothing even vaguely troublesome will happen, but "test_suspend=mem" will give you a brief STR test during system boot. (Or you may need to use "test_suspend=standby" instead, if your hardware needs that.) This isn't without problems. It fires early enough during boot that for example both PCMCIA and MMC stacks have misbehaved. The workaround in those cases was to boot without such media cards inserted. [matthltc@us.ibm.com: fix compile failure in boot time suspend selftest] Signed-off-by: David Brownell Cc: Ingo Molnar Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 11 +++ kernel/power/main.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 204 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 59dfdf1..dcd165f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,17 @@ config SUSPEND powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_LIB=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN diff --git a/kernel/power/main.c b/kernel/power/main.c index 3398f46..95bff23 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; } #ifdef CONFIG_SUSPEND +#ifdef CONFIG_PM_TEST_SUSPEND + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 5 + +static unsigned long suspend_test_start_time; + +static void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +static void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); +} + +#else + +static void suspend_test_start(void) +{ +} + +static void suspend_test_finish(const char *label) +{ +} + +#endif + /* This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) @@ -266,12 +321,13 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; } - + suspend_test_finish("suspend devices"); if (suspend_test(TEST_DEVICES)) goto Recover_platform; @@ -293,7 +349,9 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: + suspend_test_start(); device_resume(PMSG_RESUME); + suspend_test_finish("resume devices"); resume_console(); Close: if (suspend_ops->end) @@ -521,3 +579,137 @@ static int __init pm_init(void) } core_initcall(pm_init); + + +#ifdef CONFIG_PM_TEST_SUSPEND + +#include + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, rtc->dev.bus_id, status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, rtc->dev.bus_id, status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(char **)name_ptr = dev->bus_id; + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + unsigned i; + + /* "=mem" ==> "mem" */ + value++; + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (!pm_states[i]) + continue; + if (strcmp(pm_states[i], value) != 0) + continue; + test_state = (__force suspend_state_t) i; + return 0; + } + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + char *pony = NULL; + struct rtc_device *rtc = NULL; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!valid_state(test_state)) { + printk(warn_bad_state, pm_states[test_state]); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + class_find_device(rtc_class, NULL, &pony, has_wakealarm); + if (pony) + rtc = rtc_class_open(pony); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); + +#endif /* CONFIG_PM_TEST_SUSPEND */ -- cgit v1.1 From 0d83304c7e7bd3b05be90281b3a47841bc8f057a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Jul 2008 21:28:38 -0700 Subject: pm: hibernation: simplify memory bitmap This patch simplifies the memory bitmap manipulations. - remove the member size in struct bm_block It is not necessary for struct bm_block to have the number of bit chunks that can be calculated by using end_pfn and start_pfn. - use find_next_bit() for memory_bm_next_pfn No need to invent the bitmap library only for the memory bitmap. Signed-off-by: Akinobu Mita Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 88 ++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5f91a07..5d2ab83 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that - * represent each blocks of bit chunks in which information is - * stored. + * represent each blocks of bitmap in which information is stored. * * struct memory_bitmap contains a pointer to the main list of zone * bitmap objects, a struct bm_position used for browsing the bitmap, @@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * pfns that correspond to the start and end of the represented zone. * * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bit chunks - * of type unsigned long each). It also contains the pfns that - * correspond to the start and end of the represented memory area and - * the number of bit chunks in the block. + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. */ #define BM_END_OF_MAP (~0UL) -#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) -#define BM_BITS_PER_CHUNK (sizeof(long) << 3) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { struct bm_block *next; /* next element of the list */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned int size; /* number of bit chunks */ - unsigned long *data; /* chunks of bits representing pages */ + unsigned long *data; /* bitmap representing pages */ }; +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} + struct zone_bitmap { struct zone_bitmap *next; /* next element of the list */ unsigned long start_pfn; /* minimal pfn in this zone */ @@ -257,7 +257,6 @@ struct zone_bitmap { struct bm_position { struct zone_bitmap *zone_bm; struct bm_block *block; - int chunk; int bit; }; @@ -272,12 +271,6 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) -{ - bm->cur.chunk = 0; - bm->cur.bit = -1; -} - static void memory_bm_position_reset(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; @@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) zone_bm = bm->zone_bm_list; bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); @@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) bb->start_pfn = pfn; if (nr >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - bb->size = BM_CHUNKS_PER_BLOCK; nr -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ pfn += nr; - bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); } bb->end_pfn = pfn; bb = bb->next; @@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - *bit_nr = pfn % BM_BITS_PER_CHUNK; - *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + *bit_nr = pfn; + *addr = bb->data; return 0; } @@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } -/* Two auxiliary functions for memory_bm_next_pfn */ - -/* Find the first set bit in the given chunk, if there is one */ - -static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) -{ - bit++; - while (bit < BM_BITS_PER_CHUNK) { - if (test_bit(bit, chunk_p)) - return bit; - - bit++; - } - return -1; -} - -/* Find a chunk containing some bits set in given block of bits */ - -static inline int next_chunk_in_block(int n, struct bm_block *bb) -{ - n++; - while (n < bb->size) { - if (bb->data[n]) - return n; - - n++; - } - return -1; -} - /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is @@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; struct bm_block *bb; - int chunk; int bit; do { bb = bm->cur.block; do { - chunk = bm->cur.chunk; bit = bm->cur.bit; - do { - bit = next_bit_in_chunk(bit, bb->data + chunk); - if (bit >= 0) - goto Return_pfn; - - chunk = next_chunk_in_block(chunk, bb); - bit = -1; - } while (chunk >= 0); + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + bb = bb->next; bm->cur.block = bb; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } while (bb); zone_bm = bm->cur.zone_bm->next; if (zone_bm) { bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } } while (zone_bm); memory_bm_position_reset(bm); return BM_END_OF_MAP; Return_pfn: - bm->cur.chunk = chunk; - bm->cur.bit = bit; - return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; + bm->cur.bit = bit + 1; + return bb->start_pfn + bit; } /** -- cgit v1.1 From c1a220e7acf8ad2c03504891f4a70cd9c32c904b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 23 Jul 2008 21:28:39 -0700 Subject: pm: introduce new interfaces schedule_work_on() and queue_work_on() This interface allows adding a job on a specific cpu. Although a work struct on a cpu will be scheduled to other cpu if the cpu dies, there is a recursion if a work task tries to offline the cpu it's running on. we need to schedule the task to a specific cpu in this case. http://bugzilla.kernel.org/show_bug.cgi?id=10897 [oleg@tv-sign.ru: cleanups] Signed-off-by: Zhang Rui Tested-by: Rus Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a6d3634..6fd158b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -140,7 +140,6 @@ static void insert_work(struct cpu_workqueue_struct *cwq, wake_up(&cwq->more_work); } -/* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { @@ -175,6 +174,31 @@ int queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ +int +queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + BUG_ON(!list_empty(&work->entry)); + __queue_work(wq_per_cpu(wq, cpu), work); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_on); + static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -553,6 +577,19 @@ int schedule_work(struct work_struct *work) } EXPORT_SYMBOL(schedule_work); +/* + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +int schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work_on); + /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done -- cgit v1.1 From 2f15fc4bdf91eb399da3f47a09c55831d9f22826 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 23 Jul 2008 21:28:40 -0700 Subject: pm: schedule sysrq poweroff on boot cpu schedule sysrq poweroff on boot cpu. sysrq poweroff needs to disable nonboot cpus, and we need to run this on boot cpu to avoid any recursion. http://bugzilla.kernel.org/show_bug.cgi?id=10897 [kosaki.motohiro@jp.fujitsu.com: build fix] Signed-off-by: Zhang Rui Tested-by: Rus Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/poweroff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 678ec73..72016f0 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -10,6 +10,7 @@ #include #include #include +#include /* * When the user hits Sys-Rq o to power down the machine this is the @@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { - schedule_work(&poweroff_work); + /* run sysrq poweroff on boot cpu */ + schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { -- cgit v1.1 From f0af566da6e9a4a2f5a83c5a70f3d0a772050e21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 23 Jul 2008 21:28:44 -0700 Subject: pm: fix try_to_freeze_tasks()'s use of do_div() Fix try_to_freeze_tasks()'s use of do_div() on an s64 by making elapsed_csecs64 a u64 instead and dividing that. Possibly this should be guarded lest the interval calculation turn up negative, but the possible negativity of the result of the division is cast away anyway. This was introduced by patch 438e2ce68dfd4af4cfcec2f873564fb921db4bb5. Signed-off-by: David Howells Acked-by: "Rafael J. Wysocki" Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5fb8765..278946a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -149,7 +149,7 @@ static int try_to_freeze_tasks(bool sig_only) unsigned long end_time; unsigned int todo; struct timeval start, end; - s64 elapsed_csecs64; + u64 elapsed_csecs64; unsigned int elapsed_csecs; do_gettimeofday(&start); -- cgit v1.1 From 82736f4d1d2b7063b829cc93171a6e5aea8a9c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 23 Jul 2008 21:28:54 -0700 Subject: generic irqs: handle failure of irqchip->set_type in setup_irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit set_type returns an int indicating success or failure, but up to now setup_irq ignores that. In my case this resulted in a machine hang: gpio-keys requested IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, but arm/ns9xxx can only trigger on one direction so set_type didn't touch the configuration which happens do default to a level sensitiveness and returned -EINVAL. setup_irq ignored that and unmasked the irq. This resulted in an endless triggering of the gpio-key interrupt service routine which effectively killed the machine. With this patch applied setup_irq propagates the error to the caller. Note that before in the case chip && !chip->set_type && !chip->name a NULL pointer was feed to printk. This is fixed, too. Signed-off-by: Uwe Kleine-König Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 64 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3cfc0fe..5bc6e5e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -308,6 +308,30 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) desc->handle_irq = NULL; } +static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, + unsigned long flags) +{ + int ret; + + if (!chip || !chip->set_type) { + /* + * IRQF_TRIGGER_* but the PIC does not support multiple + * flow-types? + */ + pr_warning("No set_type function for IRQ %d (%s)\n", irq, + chip ? (chip->name ? : "unknown") : "unknown"); + return 0; + } + + ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); + + if (ret) + pr_err("setting flow type for irq %u failed (%pF)\n", + irq, chip->set_type); + + return ret; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -319,6 +343,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) const char *old_name = NULL; unsigned long flags; int shared = 0; + int ret; if (irq >= NR_IRQS) return -EINVAL; @@ -376,35 +401,23 @@ int setup_irq(unsigned int irq, struct irqaction *new) shared = 1; } - *p = new; - - /* Exclude IRQ from balancing */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; - if (!shared) { irq_chip_set_defaults(desc->chip); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - if (desc->chip->set_type) - desc->chip->set_type(irq, - new->flags & IRQF_TRIGGER_MASK); - else - /* - * IRQF_TRIGGER_* but the PIC does not support - * multiple flow-types? - */ - printk(KERN_WARNING "No IRQF_TRIGGER set_type " - "function for IRQ %d (%s)\n", irq, - desc->chip->name); + ret = __irq_set_trigger(desc->chip, irq, new->flags); + + if (ret) { + spin_unlock_irqrestore(&desc->lock, flags); + return ret; + } } else compat_irq_chip_set_default_handler(desc); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); @@ -423,6 +436,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Set default affinity mask once everything is setup */ irq_select_affinity(irq); } + + *p = new; + + /* Exclude IRQ from balancing */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; -- cgit v1.1 From aaca0bdca573f3f51ea03139f9c7289541e7bca3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:20 -0700 Subject: flag parameters: paccept This patch is by far the most complex in the series. It adds a new syscall paccept. This syscall differs from accept in that it adds (at the userlevel) two additional parameters: - a signal mask - a flags value The flags parameter can be used to set flag like SOCK_CLOEXEC. This is imlpemented here as well. Some people argued that this is a property which should be inherited from the file desriptor for the server but this is against POSIX. Additionally, we really want the signal mask parameter as well (similar to pselect, ppoll, etc). So an interface change in inevitable. The flag value is the same as for socket and socketpair. I think diverging here will only create confusion. Similar to the filesystem interfaces where the use of the O_* constants differs, it is acceptable here. The signal mask is handled as for pselect etc. The mask is temporarily installed for the thread and removed before the call returns. I modeled the code after pselect. If there is a problem it's likely also in pselect. For architectures which use socketcall I maintained this interface instead of adding a system call. The symmetry shouldn't be broken. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #include #include #include #include #ifndef __NR_paccept # ifdef __x86_64__ # define __NR_paccept 288 # elif defined __i386__ # define SYS_PACCEPT 18 # define USE_SOCKETCALL 1 # else # error "need __NR_paccept" # endif #endif #ifdef USE_SOCKETCALL # define paccept(fd, addr, addrlen, mask, flags) \ ({ long args[6] = { \ (long) fd, (long) addr, (long) addrlen, (long) mask, 8, (long) flags }; \ syscall (__NR_socketcall, SYS_PACCEPT, args); }) #else # define paccept(fd, addr, addrlen, mask, flags) \ syscall (__NR_paccept, fd, addr, addrlen, mask, 8, flags) #endif #define PORT 57392 #define SOCK_CLOEXEC O_CLOEXEC static pthread_barrier_t b; static void * tf (void *arg) { pthread_barrier_wait (&b); int s = socket (AF_INET, SOCK_STREAM, 0); struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); s = socket (AF_INET, SOCK_STREAM, 0); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); pthread_barrier_wait (&b); sleep (2); pthread_kill ((pthread_t) arg, SIGUSR1); return NULL; } static void handler (int s) { } int main (void) { pthread_barrier_init (&b, NULL, 2); struct sockaddr_in sin; pthread_t th; if (pthread_create (&th, NULL, tf, (void *) pthread_self ()) != 0) { puts ("pthread_create failed"); return 1; } int s = socket (AF_INET, SOCK_STREAM, 0); int reuse = 1; setsockopt (s, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof (reuse)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); bind (s, (struct sockaddr *) &sin, sizeof (sin)); listen (s, SOMAXCONN); pthread_barrier_wait (&b); int s2 = paccept (s, NULL, 0, NULL, 0); if (s2 < 0) { puts ("paccept(0) failed"); return 1; } int coe = fcntl (s2, F_GETFD); if (coe & FD_CLOEXEC) { puts ("paccept(0) set close-on-exec-flag"); return 1; } close (s2); pthread_barrier_wait (&b); s2 = paccept (s, NULL, 0, NULL, SOCK_CLOEXEC); if (s2 < 0) { puts ("paccept(SOCK_CLOEXEC) failed"); return 1; } coe = fcntl (s2, F_GETFD); if ((coe & FD_CLOEXEC) == 0) { puts ("paccept(SOCK_CLOEXEC) does not set close-on-exec flag"); return 1; } close (s2); pthread_barrier_wait (&b); struct sigaction sa; sa.sa_handler = handler; sa.sa_flags = 0; sigemptyset (&sa.sa_mask); sigaction (SIGUSR1, &sa, NULL); sigset_t ss; pthread_sigmask (SIG_SETMASK, NULL, &ss); sigaddset (&ss, SIGUSR1); pthread_sigmask (SIG_SETMASK, &ss, NULL); sigdelset (&ss, SIGUSR1); alarm (4); pthread_barrier_wait (&b); errno = 0 ; s2 = paccept (s, NULL, 0, &ss, 0); if (s2 != -1 || errno != EINTR) { puts ("paccept did not fail with EINTR"); return 1; } close (s); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: make it compile] [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: "David S. Miller" Cc: Roland McGrath Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0fea0ee..2f0b8a2 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -31,6 +31,7 @@ cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); +cond_syscall(sys_paccept); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); -- cgit v1.1 From 9deb27baedb79759c3ab9435a7d8b841842d56e9 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:24 -0700 Subject: flag parameters: signalfd This patch adds the new signalfd4 syscall. It extends the old signalfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is SFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name SFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_signalfd4 # ifdef __x86_64__ # define __NR_signalfd4 289 # elif defined __i386__ # define __NR_signalfd4 327 # else # error "need __NR_signalfd4" # endif #endif #define SFD_CLOEXEC O_CLOEXEC int main (void) { sigset_t ss; sigemptyset (&ss); sigaddset (&ss, SIGUSR1); int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0); if (fd == -1) { puts ("signalfd4(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("signalfd4(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC); if (fd == -1) { puts ("signalfd4(SFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2f0b8a2..8627c89 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -156,6 +156,7 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); +cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); cond_syscall(sys_timerfd_create); cond_syscall(sys_timerfd_settime); -- cgit v1.1 From b087498eb5605673b0f260a7620d91818cd72304 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:25 -0700 Subject: flag parameters: eventfd This patch adds the new eventfd2 syscall. It extends the old eventfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_eventfd2 # ifdef __x86_64__ # define __NR_eventfd2 290 # elif defined __i386__ # define __NR_eventfd2 328 # else # error "need __NR_eventfd2" # endif #endif #define EFD_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_eventfd2, 1, 0); if (fd == -1) { puts ("eventfd2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("eventfd2(0) sets close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC); if (fd == -1) { puts ("eventfd2(EFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8627c89..2a361cc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -164,3 +164,4 @@ cond_syscall(sys_timerfd_gettime); cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); +cond_syscall(sys_eventfd2); -- cgit v1.1 From 4006553b06306b34054529477b06b68a1c66249b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:32 -0700 Subject: flag parameters: inotify_init This patch introduces the new syscall inotify_init1 (note: the 1 stands for the one parameter the syscall takes, as opposed to no parameter before). The values accepted for this parameter are function-specific and defined in the inotify.h header. Here the values must match the O_* flags, though. In this patch CLOEXEC support is introduced. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_inotify_init1 # ifdef __x86_64__ # define __NR_inotify_init1 294 # elif defined __i386__ # define __NR_inotify_init1 332 # else # error "need __NR_inotify_init1" # endif #endif #define IN_CLOEXEC O_CLOEXEC int main (void) { int fd; fd = syscall (__NR_inotify_init1, 0); if (fd == -1) { puts ("inotify_init1(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("inotify_init1(0) set close-on-exit"); return 1; } close (fd); fd = syscall (__NR_inotify_init1, IN_CLOEXEC); if (fd == -1) { puts ("inotify_init1(IN_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2a361cc..bd66ac5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -96,6 +96,7 @@ cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); +cond_syscall(sys_inotify_init1); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); -- cgit v1.1 From be61a86d7237dd80510615f38ae21d6e1e98660c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:40 -0700 Subject: flag parameters: NONBLOCK in pipe This patch adds O_NONBLOCK support to pipe2. It is minimally more involved than the patches for eventfd et.al but still trivial. The interfaces of the create_write_pipe and create_read_pipe helper functions were changed and the one other caller as well. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fds[2]; if (syscall (__NR_pipe2, fds, 0) == -1) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { printf ("pipe2(0) set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } if (syscall (__NR_pipe2, fds, O_NONBLOCK) == -1) { puts ("pipe2(O_NONBLOCK) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { printf ("pipe2(O_NONBLOCK) does not set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 90d7af1..2989f67 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -417,12 +417,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, { struct file *f; - f = create_write_pipe(); + f = create_write_pipe(0); if (IS_ERR(f)) return PTR_ERR(f); *filp = f; - f = create_read_pipe(f); + f = create_read_pipe(f, 0); if (IS_ERR(f)) { free_write_pipe(*filp); return PTR_ERR(f); -- cgit v1.1