From bd95b88d9e51fcbf392a7e90338a8fcc3499cbd6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Oct 2008 09:31:27 -0400 Subject: ftrace: release functions from hash The x86 architecture uses a static recording of mcount caller locations and is not affected by this patch. For architectures still using the dynamic ftrace daemon, this patch is critical. It removes the race between the recording of a function that calls mcount, the unloading of a module, and the ftrace daemon updating the call sites. This patch adds the releasing of the hash functions that the daemon uses to update the mcount call sites. When a module is unloaded, not only are the replaced call site table update, but now so is the hash recorded functions that the ftrace daemon will use. Again, architectures that implement MCOUNT_RECORD are not affected by this (which currently only x86 has). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4dda4f6..1f54a94 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -164,10 +164,14 @@ static DEFINE_SPINLOCK(ftrace_hash_lock); #define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) #define ftrace_hash_unlock(flags) \ spin_unlock_irqrestore(&ftrace_hash_lock, flags) +static void ftrace_release_hash(unsigned long start, unsigned long end); #else /* This is protected via the ftrace_lock with MCOUNT_RECORD. */ #define ftrace_hash_lock(flags) do { (void)(flags); } while (0) #define ftrace_hash_unlock(flags) do { } while(0) +static inline void ftrace_release_hash(unsigned long start, unsigned long end) +{ +} #endif /* @@ -347,6 +351,7 @@ void ftrace_release(void *start, unsigned long size) } spin_unlock(&ftrace_lock); + ftrace_release_hash(s, e); } static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) @@ -1659,6 +1664,44 @@ void __init ftrace_init(void) ftrace_disabled = 1; } #else /* CONFIG_FTRACE_MCOUNT_RECORD */ + +static void ftrace_release_hash(unsigned long start, unsigned long end) +{ + struct dyn_ftrace *rec; + struct hlist_node *t, *n; + struct hlist_head *head, temp_list; + unsigned long flags; + int i, cpu; + + preempt_disable_notrace(); + + /* disable incase we call something that calls mcount */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + + ftrace_hash_lock(flags); + + for (i = 0; i < FTRACE_HASHSIZE; i++) { + INIT_HLIST_HEAD(&temp_list); + head = &ftrace_hash[i]; + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry_safe(rec, t, n, head, node) { + if (rec->flags & FTRACE_FL_FREE) + continue; + + if ((rec->ip >= start) && (rec->ip < end)) + ftrace_free_rec(rec); + } + } + + ftrace_hash_unlock(flags); + + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + preempt_enable_notrace(); + +} + static int ftraced(void *ignore) { unsigned long usecs; -- cgit v1.1 From c2db8054c1eaf99983d8deee347876b01c26c2cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:11 -0400 Subject: ftrace: fix depends A lot of tracers have HAVE_FTRACE as a dependent config where it really should not. The HAVE_FTRACE is a misnomer (soon to be fixed) and describes if the architecture has the function tracer (mcount) implemented. The ftrace infrastructure is implemented in all archs. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1cb3e1f..5866edb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,7 +49,6 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING @@ -73,7 +72,6 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE @@ -101,7 +99,6 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER @@ -112,7 +109,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING select MARKERS @@ -122,7 +118,6 @@ config CONTEXT_SWITCH_TRACER config BOOT_TRACER bool "Trace boot initcalls" - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING help -- cgit v1.1 From 606576ce816603d9fe1fb453a88bc6eea16ca709 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:12 -0400 Subject: ftrace: rename FTRACE to FUNCTION_TRACER Due to confusion between the ftrace infrastructure and the gcc profiling tracer "ftrace", this patch renames the config options from FTRACE to FUNCTION_TRACER. The other two names that are offspring from FTRACE DYNAMIC_FTRACE and FTRACE_MCOUNT_RECORD will stay the same. This patch was generated mostly by script, and partially by hand. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/Makefile | 4 ++-- kernel/sysctl.c | 2 +- kernel/trace/Kconfig | 17 +++++++++-------- kernel/trace/Makefile | 6 +++--- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_irqsoff.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 4 ++-- kernel/trace/trace_selftest.c | 4 ++-- 9 files changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 8f9ce7e..85f588a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,7 +13,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ CFLAGS_REMOVE_sched.o = -mno-spe -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg CFLAGS_REMOVE_lockdep_proc.o = -pg @@ -86,7 +86,7 @@ obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e..619eb9f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -464,7 +464,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER { .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5866edb..3533c58 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,11 +1,12 @@ # -# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# Architectures that offer an FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: # config NOP_TRACER bool -config HAVE_FTRACE +config HAVE_FUNCTION_TRACER bool select NOP_TRACER @@ -28,9 +29,9 @@ config TRACING select STACKTRACE select TRACEPOINTS -config FTRACE +config FUNCTION_TRACER bool "Kernel Function Tracer" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL select FRAME_POINTER select TRACING @@ -136,9 +137,9 @@ config BOOT_TRACER config STACK_TRACER bool "Trace max stack" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL - select FTRACE + select FUNCTION_TRACER select STACKTRACE help This special tracer records the maximum stack footprint of the @@ -155,7 +156,7 @@ config STACK_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" - depends on FTRACE + depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE depends on DEBUG_KERNEL default y @@ -165,7 +166,7 @@ config DYNAMIC_FTRACE with a No-Op instruction) as they are called. A table is created to dynamically enable them again. - This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. The changes to the code are done by a kernel thread that diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index a85dfba..c8228b1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,7 +1,7 @@ # Do not instrument the tracer itself: -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) @@ -10,13 +10,13 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif -obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o -obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d64..aeb2f25 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -851,7 +851,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) preempt_enable_notrace(); } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER static void function_trace_call(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f1f9957..6889ca4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -335,7 +335,7 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER void tracing_start_function_trace(void); void tracing_stop_function_trace(void); #else diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a7db7f0..9c74071 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) */ static __cacheline_aligned_in_smp unsigned long max_sequence; -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fe4a252..3ae93f1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock = static void __wakeup_reset(struct trace_array *tr); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -96,7 +96,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 09cf230..95815d2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -70,7 +70,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -226,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) return ret; } -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_IRQSOFF_TRACER int -- cgit v1.1 From 3ce83aea86bf46fd1bff59d2e6d16f48fdce22fc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:13 -0400 Subject: ftrace: rename the ftrace tracer to function To avoid further confusion between the ftrace infrastructure and the function tracer. This patch renames the "ftrace" function tracer to "function". Now in available_tracers, instead of "ftrace" there will be "function". This makes more sense, since people will not know exactly what the "ftrace" tracer does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e90eb0c..0f85a64 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -64,7 +64,7 @@ static void function_trace_ctrl_update(struct trace_array *tr) static struct tracer function_trace __read_mostly = { - .name = "ftrace", + .name = "function", .init = function_trace_init, .reset = function_trace_reset, .ctrl_update = function_trace_ctrl_update, -- cgit v1.1 From 81520a1b0649d0701205b818714a8c1e1cfbbb5b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 21:24:18 -0400 Subject: ftrace: stack tracer only record when on stack The stack trace API does not record if the stack is not on the current task's stack. That is, if the stack is the interrupt stack or NMI stack, the output does not show. Also, the size of those stacks are not consistent with the size of the thread stack, this makes the calculation of the stack size usually bogus. This all confuses the stack tracer. I unfortunately do not have time to fix all these problems, but this patch does record the worst stack when the stack pointer is on the tasks stack (instead of bogus numbers). The patch simply returns if the stack pointer is not on the task's stack. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 74c5d9a..be682b6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -44,6 +44,10 @@ static inline void check_stack(void) if (this_size <= max_stack_size) return; + /* we do not handle interrupt stacks yet */ + if (!object_is_on_stack(&this_size)) + return; + raw_local_irq_save(flags); __raw_spin_lock(&max_stack_lock); -- cgit v1.1 From 3786fc710c32b61464c322e5cd0c3d1d34ae72d0 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Oct 2008 19:49:09 -0400 Subject: irq: make variable static This variable is only used in the source file, so make it static. Signed-off-by: Roel Kluin Signed-off-by: Ingo Molnar --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index fac014a..4d161c7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -220,7 +220,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) } } -void register_default_affinity_proc(void) +static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP proc_create("irq/default_smp_affinity", 0600, NULL, -- cgit v1.1 From 17d80fd07d35ae1d231b3378ee4f00ace54f9d31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Oct 2008 16:31:18 +0200 Subject: tracing: create tracers menu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We seem to have plenty tracers, lets create a menu and not clutter the already cluttered debug menu more. Signed-off-by: Peter Zijlstra Acked-by: Frédéric Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3533c58..bc535cb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -29,6 +29,8 @@ config TRACING select STACKTRACE select TRACEPOINTS +menu "Tracers" + config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER @@ -191,3 +193,5 @@ config FTRACE_STARTUP_TEST a series of tests are made to verify that the tracer is functioning properly. It will do tests on all the configured tracers of ftrace. + +endmenu -- cgit v1.1 From 4ce72a2c063a7fa8e42a9435440ae3364115a58d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Oct 2008 15:25:26 +0800 Subject: sched: add CONFIG_SMP consistency a patch from Henrik Austad did this: >> Do not declare select_task_rq as part of sched_class when CONFIG_SMP is >> not set. Peter observed: > While a proper cleanup, could you do it by re-arranging the methods so > as to not create an additional ifdef? Do not declare select_task_rq and some other methods as part of sched_class when CONFIG_SMP is not set. Also gather those methods to avoid CONFIG_SMP mess. Idea-by: Henrik Austad Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Acked-by: Henrik Austad Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- kernel/sched_idletask.c | 5 ++--- kernel/sched_rt.c | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a0aa38b..8de48a5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1593,9 +1593,6 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_fair, -#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, @@ -1603,6 +1600,8 @@ static const struct sched_class fair_sched_class = { .put_prev_task = put_prev_task_fair, #ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, + .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, #endif diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index dec4cca..8a21a2e 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, -#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_idle, @@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = { .put_prev_task = put_prev_task_idle, #ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, + .load_balance = load_balance_idle, .move_one_task = move_one_task_idle, #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cdf5740..c9aa5be 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1502,9 +1502,6 @@ static const struct sched_class rt_sched_class = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_rt, -#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_rt, @@ -1512,6 +1509,8 @@ static const struct sched_class rt_sched_class = { .put_prev_task = put_prev_task_rt, #ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, + .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, -- cgit v1.1 From 6ae2a0765ab764da11cc305058ee5333810228f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Oct 2008 10:22:06 -0400 Subject: ring-buffer: fix free page The pages of a buffer was originally pointing to the page struct, it now points to the page address. The freeing of the page still uses the page frame free "__free_page" instead of the correct free_page to the address. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 94af1fe..091aeef 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -130,7 +130,7 @@ struct buffer_page { static inline void free_buffer_page(struct buffer_page *bpage) { if (bpage->page) - __free_page(bpage->page); + free_page((unsigned long)bpage->page); kfree(bpage); } -- cgit v1.1 From 593eb8a2d63e95772a5f22d746f18a997c5ee463 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:32:59 -0400 Subject: ftrace: return error on failed modified text. Have the ftrace_modify_code return error values: -EFAULT on error of reading the address -EINVAL if what is read does not match what it expected -EPERM if the write fails to update after a successful match. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1f54a94..b2de8de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -596,22 +596,22 @@ ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; - int failed; + int ret; ip = rec->ip; nop = ftrace_nop_replace(); call = ftrace_call_replace(ip, mcount_addr); - failed = ftrace_modify_code(ip, call, nop); - if (failed) { - switch (failed) { - case 1: + ret = ftrace_modify_code(ip, call, nop); + if (ret) { + switch (ret) { + case -EFAULT: WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; - case 2: + case -EINVAL: WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); @@ -620,6 +620,15 @@ ftrace_code_disable(struct dyn_ftrace *rec) print_ip_ins(" replace: ", nop); printk(KERN_CONT "\n"); break; + case -EPERM: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); } rec->flags |= FTRACE_FL_FAILED; -- cgit v1.1 From 81adbdc029ecc416d56563e7f159100181dd711d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:02 -0400 Subject: ftrace: only have ftrace_kill atomic When an anomaly is detected, we need a way to completely disable ftrace. Right now we have two functions: ftrace_kill and ftrace_kill_atomic. The ftrace_kill tries to do it in a "nice" way by converting everything back to a nop. The "nice" way is dangerous itself, so this patch removes it and only has the "atomic" version, which is all that is needed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 42 ++---------------------------------------- kernel/trace/trace.c | 2 +- 2 files changed, 3 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b2de8de..93245ae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1549,22 +1549,6 @@ int ftrace_force_update(void) return ret; } -static void ftrace_force_shutdown(void) -{ - struct task_struct *task; - int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; - - mutex_lock(&ftraced_lock); - task = ftraced_task; - ftraced_task = NULL; - ftraced_suspend = -1; - ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); - - if (task) - kthread_stop(task); -} - static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1795,17 +1779,16 @@ core_initcall(ftrace_dynamic_init); # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -# define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** - * ftrace_kill_atomic - kill ftrace from critical sections + * ftrace_kill - kill ftrace * * This function should be used by panic code. It stops ftrace * but in a not so nice way. If you need to simply kill ftrace * from a non-atomic section, use ftrace_kill. */ -void ftrace_kill_atomic(void) +void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; @@ -1816,27 +1799,6 @@ void ftrace_kill_atomic(void) } /** - * ftrace_kill - totally shutdown ftrace - * - * This is a safety measure. If something was detected that seems - * wrong, calling this function will keep ftrace from doing - * any more modifications, and updates. - * used when something went wrong. - */ -void ftrace_kill(void) -{ - mutex_lock(&ftrace_sysctl_lock); - ftrace_disabled = 1; - ftrace_enabled = 0; - - clear_ftrace_function(); - mutex_unlock(&ftrace_sysctl_lock); - - /* Try to totally disable ftrace */ - ftrace_force_shutdown(); -} - -/** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aeb2f25..333a516 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3097,7 +3097,7 @@ void ftrace_dump(void) dump_ran = 1; /* No turning back! */ - ftrace_kill_atomic(); + ftrace_kill(); for_each_tracing_cpu(cpu) { atomic_inc(&global_trace.data[cpu]->disabled); -- cgit v1.1 From 6912896e994ddaf06cc0f6d3f2098bc4b59bdd84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:03 -0400 Subject: ftrace: add ftrace warn on to disable ftrace Add ftrace warn on to disable ftrace as well as report a warning. [ Thanks to Andrew Morton for suggesting using the WARN_ON return value ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 93245ae..b08996c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,18 @@ #include "trace.h" +#define FTRACE_WARN_ON(cond) \ + do { \ + if (WARN_ON(cond)) \ + ftrace_kill(); \ + } while (0) + +#define FTRACE_WARN_ON_ONCE(cond) \ + do { \ + if (WARN_ON_ONCE(cond)) \ + ftrace_kill(); \ + } while (0) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -363,10 +375,8 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) rec = ftrace_free_records; if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); ftrace_free_records = NULL; - ftrace_disabled = 1; - ftrace_enabled = 0; return NULL; } @@ -415,7 +425,7 @@ ftrace_record_ip(unsigned long ip) key = hash_long(ip, FTRACE_HASHBITS); - WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + FTRACE_WARN_ON_ONCE(key >= FTRACE_HASHSIZE); if (ftrace_ip_in_hash(ip, key)) goto out; @@ -607,12 +617,12 @@ ftrace_code_disable(struct dyn_ftrace *rec) if (ret) { switch (ret) { case -EFAULT: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; case -EINVAL: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" expected: ", call); @@ -621,12 +631,12 @@ ftrace_code_disable(struct dyn_ftrace *rec) printk(KERN_CONT "\n"); break; case -EPERM: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); print_ip_sym(ip); break; default: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } @@ -1722,8 +1732,7 @@ static int ftraced(void *ignore) ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); } } mutex_unlock(&ftraced_lock); -- cgit v1.1 From cb7be3b2fc2cf089ee52b16f0fd9ebb29e9944e1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:05 -0400 Subject: ftrace: remove daemon The ftrace daemon is complex and error prone. This patch strips it out of the code. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 322 ++++-------------------------------------- kernel/trace/trace_selftest.c | 14 -- 2 files changed, 28 insertions(+), 308 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b08996c..e758cab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -165,25 +165,8 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } #ifdef CONFIG_DYNAMIC_FTRACE - #ifndef CONFIG_FTRACE_MCOUNT_RECORD -/* - * The hash lock is only needed when the recording of the mcount - * callers are dynamic. That is, by the caller themselves and - * not recorded via the compilation. - */ -static DEFINE_SPINLOCK(ftrace_hash_lock); -#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) -#define ftrace_hash_unlock(flags) \ - spin_unlock_irqrestore(&ftrace_hash_lock, flags) -static void ftrace_release_hash(unsigned long start, unsigned long end); -#else -/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ -#define ftrace_hash_lock(flags) do { (void)(flags); } while (0) -#define ftrace_hash_unlock(flags) do { } while(0) -static inline void ftrace_release_hash(unsigned long start, unsigned long end) -{ -} +# error Dynamic ftrace depends on MCOUNT_RECORD #endif /* @@ -194,8 +177,6 @@ static inline void ftrace_release_hash(unsigned long start, unsigned long end) */ static unsigned long mcount_addr = MCOUNT_ADDR; -static struct task_struct *ftraced_task; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -212,7 +193,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_MUTEX(ftraced_lock); static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { @@ -230,10 +210,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static int ftraced_trigger; -static int ftraced_suspend; -static int ftraced_stop; - static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; @@ -398,7 +374,6 @@ static void ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; - unsigned long flags; unsigned long key; int resched; int cpu; @@ -430,24 +405,18 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out; - ftrace_hash_lock(flags); - /* This ip may have hit the hash before the lock */ if (ftrace_ip_in_hash(ip, key)) - goto out_unlock; + goto out; node = ftrace_alloc_dyn_node(ip); if (!node) - goto out_unlock; + goto out; node->ip = ip; ftrace_add_hash(node, key); - ftraced_trigger = 1; - - out_unlock: - ftrace_hash_unlock(flags); out: per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -647,7 +616,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } -static int __ftrace_update_code(void *ignore); +static int ftrace_update_code(void *ignore); static int __ftrace_modify_code(void *data) { @@ -659,7 +628,7 @@ static int __ftrace_modify_code(void *data) * Update any recorded ips now that we have the * machine stopped */ - __ftrace_update_code(NULL); + ftrace_update_code(NULL); ftrace_replace_code(1); tracing_on = 1; } else if (*command & FTRACE_DISABLE_CALLS) { @@ -686,26 +655,9 @@ static void ftrace_run_update_code(int command) stop_machine(__ftrace_modify_code, &command, NULL); } -void ftrace_disable_daemon(void) -{ - /* Stop the daemon from calling kstop_machine */ - mutex_lock(&ftraced_lock); - ftraced_stop = 1; - mutex_unlock(&ftraced_lock); - - ftrace_force_update(); -} - -void ftrace_enable_daemon(void) -{ - mutex_lock(&ftraced_lock); - ftraced_stop = 0; - mutex_unlock(&ftraced_lock); - - ftrace_force_update(); -} - static ftrace_func_t saved_ftrace_func; +static int ftrace_start; +static DEFINE_MUTEX(ftrace_start_lock); static void ftrace_startup(void) { @@ -714,9 +666,9 @@ static void ftrace_startup(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); - ftraced_suspend++; - if (ftraced_suspend == 1) + mutex_lock(&ftrace_start_lock); + ftrace_start++; + if (ftrace_start == 1) command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -729,7 +681,7 @@ static void ftrace_startup(void) ftrace_run_update_code(command); out: - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown(void) @@ -739,9 +691,9 @@ static void ftrace_shutdown(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); - ftraced_suspend--; - if (!ftraced_suspend) + mutex_lock(&ftrace_start_lock); + ftrace_start--; + if (!ftrace_start) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -754,7 +706,7 @@ static void ftrace_shutdown(void) ftrace_run_update_code(command); out: - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static void ftrace_startup_sysctl(void) @@ -764,15 +716,15 @@ static void ftrace_startup_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); + mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftraced_suspend is true if we want ftrace running */ - if (ftraced_suspend) + /* ftrace_start is true if we want ftrace running */ + if (ftrace_start) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown_sysctl(void) @@ -782,20 +734,20 @@ static void ftrace_shutdown_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); - /* ftraced_suspend is true if ftrace is running */ - if (ftraced_suspend) + mutex_lock(&ftrace_start_lock); + /* ftrace_start is true if ftrace is running */ + if (ftrace_start) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int __ftrace_update_code(void *ignore) +static int ftrace_update_code(void *ignore) { int i, save_ftrace_enabled; cycle_t start, stop; @@ -869,7 +821,6 @@ static int __ftrace_update_code(void *ignore) stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - ftraced_trigger = 0; ftrace_enabled = save_ftrace_enabled; ftrace_record_suspend--; @@ -877,17 +828,6 @@ static int __ftrace_update_code(void *ignore) return 0; } -static int ftrace_update_code(void) -{ - if (unlikely(ftrace_disabled) || - !ftrace_enabled || !ftraced_trigger) - return 0; - - stop_machine(__ftrace_update_code, NULL, NULL); - - return 1; -} - static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) { struct ftrace_page *pg; @@ -1425,10 +1365,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (iter->filtered && ftraced_suspend && ftrace_enabled) + mutex_lock(&ftrace_start_lock); + if (iter->filtered && ftrace_start && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); kfree(iter); @@ -1448,55 +1388,6 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } -static ssize_t -ftraced_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - /* don't worry about races */ - char *buf = ftraced_stop ? "disabled\n" : "enabled\n"; - int r = strlen(buf); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -ftraced_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - long val; - int ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - if (strncmp(buf, "enable", 6) == 0) - val = 1; - else if (strncmp(buf, "disable", 7) == 0) - val = 0; - else { - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - val = !!val; - } - - if (val) - ftrace_enable_daemon(); - else - ftrace_disable_daemon(); - - filp->f_pos += cnt; - - return cnt; -} - static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1527,38 +1418,6 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; -static struct file_operations ftraced_fops = { - .open = tracing_open_generic, - .read = ftraced_read, - .write = ftraced_write, -}; - -/** - * ftrace_force_update - force an update to all recording ftrace functions - */ -int ftrace_force_update(void) -{ - int ret = 0; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - - /* - * If ftraced_trigger is not set, then there is nothing - * to update. - */ - if (ftraced_trigger && !ftrace_update_code()) - ret = -EBUSY; - - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - return ret; -} - static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1589,17 +1448,11 @@ static __init int ftrace_init_debugfs(void) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); - entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, - NULL, &ftraced_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ftraced_enabled' entry\n"); return 0; } fs_initcall(ftrace_init_debugfs); -#ifdef CONFIG_FTRACE_MCOUNT_RECORD static int ftrace_convert_nops(unsigned long *start, unsigned long *end) { @@ -1619,7 +1472,7 @@ static int ftrace_convert_nops(unsigned long *start, /* p is ignored */ local_irq_save(flags); - __ftrace_update_code(p); + ftrace_update_code(p); local_irq_restore(flags); return 0; @@ -1666,122 +1519,6 @@ void __init ftrace_init(void) failed: ftrace_disabled = 1; } -#else /* CONFIG_FTRACE_MCOUNT_RECORD */ - -static void ftrace_release_hash(unsigned long start, unsigned long end) -{ - struct dyn_ftrace *rec; - struct hlist_node *t, *n; - struct hlist_head *head, temp_list; - unsigned long flags; - int i, cpu; - - preempt_disable_notrace(); - - /* disable incase we call something that calls mcount */ - cpu = raw_smp_processor_id(); - per_cpu(ftrace_shutdown_disable_cpu, cpu)++; - - ftrace_hash_lock(flags); - - for (i = 0; i < FTRACE_HASHSIZE; i++) { - INIT_HLIST_HEAD(&temp_list); - head = &ftrace_hash[i]; - - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(rec, t, n, head, node) { - if (rec->flags & FTRACE_FL_FREE) - continue; - - if ((rec->ip >= start) && (rec->ip < end)) - ftrace_free_rec(rec); - } - } - - ftrace_hash_unlock(flags); - - per_cpu(ftrace_shutdown_disable_cpu, cpu)--; - preempt_enable_notrace(); - -} - -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - FTRACE_WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dynamic_init(void) -{ - struct task_struct *p; - unsigned long addr; - int ret; - - addr = (unsigned long)ftrace_record_ip; - - stop_machine(ftrace_dyn_arch_init, &addr, NULL); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) { - ret = (int)addr; - goto failed; - } - - ret = ftrace_dyn_table_alloc(NR_TO_INIT); - if (ret) - goto failed; - - p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) { - ret = -1; - goto failed; - } - - last_ftrace_enabled = ftrace_enabled = 1; - ftraced_task = p; - - return 0; - - failed: - ftrace_disabled = 1; - return ret; -} - -core_initcall(ftrace_dynamic_init); -#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ #else # define ftrace_startup() do { } while (0) @@ -1801,9 +1538,6 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; -#ifdef CONFIG_DYNAMIC_FTRACE - ftraced_suspend = -1; -#endif clear_ftrace_function(); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 95815d2..90bc752 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -99,13 +99,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* passed in by parameter to fool gcc from optimizing */ func(); - /* update the records */ - ret = ftrace_force_update(); - if (ret) { - printk(KERN_CONT ".. ftraced failed .. "); - return ret; - } - /* * Some archs *cough*PowerPC*cough* add charachters to the * start of the function names. We simply put a '*' to @@ -183,13 +176,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* make sure msleep has been recorded */ msleep(1); - /* force the recorded functions to be traced */ - ret = ftrace_force_update(); - if (ret) { - printk(KERN_CONT ".. ftraced failed .. "); - return ret; - } - /* start the tracing */ ftrace_enabled = 1; tracer_enabled = 1; -- cgit v1.1 From 4d296c24326783bff1282ac72f310d8bac8df413 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:06 -0400 Subject: ftrace: remove mcount set The arch dependent function ftrace_mcount_set was only used by the daemon start up code. This patch removes it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e758cab..226fd91 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -620,7 +620,6 @@ static int ftrace_update_code(void *ignore); static int __ftrace_modify_code(void *data) { - unsigned long addr; int *command = data; if (*command & FTRACE_ENABLE_CALLS) { @@ -639,14 +638,6 @@ static int __ftrace_modify_code(void *data) if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); - if (*command & FTRACE_ENABLE_MCOUNT) { - addr = (unsigned long)ftrace_record_ip; - ftrace_mcount_set(&addr); - } else if (*command & FTRACE_DISABLE_MCOUNT) { - addr = (unsigned long)ftrace_stub; - ftrace_mcount_set(&addr); - } - return 0; } -- cgit v1.1 From 08f5ac906d2c0faf96d608c54a0b03177376da8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:07 -0400 Subject: ftrace: remove ftrace hash The ftrace hash was used by the ftrace_daemon code. The record ip function would place the calling address (ip) into the hash. The daemon would later read the hash and modify that code. The hash complicates the code. This patch removes it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 243 ++++++++------------------------------------------ kernel/trace/trace.c | 3 - 2 files changed, 35 insertions(+), 211 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 226fd91..07762c0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -189,9 +188,7 @@ static int ftrace_filtered; static int tracing_on; static int frozen_record_count; -static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; - -static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); +static LIST_HEAD(ftrace_new_addrs); static DEFINE_MUTEX(ftrace_regex_lock); @@ -210,8 +207,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static int ftrace_record_suspend; - static struct dyn_ftrace *ftrace_free_records; @@ -242,72 +237,6 @@ static inline int record_frozen(struct dyn_ftrace *rec) # define record_frozen(rec) ({ 0; }) #endif /* CONFIG_KPROBES */ -int skip_trace(unsigned long ip) -{ - unsigned long fl; - struct dyn_ftrace *rec; - struct hlist_node *t; - struct hlist_head *head; - - if (frozen_record_count == 0) - return 0; - - head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; - hlist_for_each_entry_rcu(rec, t, head, node) { - if (rec->ip == ip) { - if (record_frozen(rec)) { - if (rec->flags & FTRACE_FL_FAILED) - return 1; - - if (!(rec->flags & FTRACE_FL_CONVERTED)) - return 1; - - if (!tracing_on || !ftrace_enabled) - return 1; - - if (ftrace_filtered) { - fl = rec->flags & (FTRACE_FL_FILTER | - FTRACE_FL_NOTRACE); - if (!fl || (fl & FTRACE_FL_NOTRACE)) - return 1; - } - } - break; - } - } - - return 0; -} - -static inline int -ftrace_ip_in_hash(unsigned long ip, unsigned long key) -{ - struct dyn_ftrace *p; - struct hlist_node *t; - int found = 0; - - hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { - if (p->ip == ip) { - found = 1; - break; - } - } - - return found; -} - -static inline void -ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) -{ - hlist_add_head_rcu(&node->node, &ftrace_hash[key]); -} - -/* called from kstop_machine */ -static inline void ftrace_del_hash(struct dyn_ftrace *node) -{ - hlist_del(&node->node); -} - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->ip = (unsigned long)ftrace_free_records; @@ -362,69 +291,36 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) } if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; + if (!ftrace_pages->next) { + /* allocate another page */ + ftrace_pages->next = + (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return NULL; + } ftrace_pages = ftrace_pages->next; } return &ftrace_pages->records[ftrace_pages->index++]; } -static void +static struct dyn_ftrace * ftrace_record_ip(unsigned long ip) { - struct dyn_ftrace *node; - unsigned long key; - int resched; - int cpu; + struct dyn_ftrace *rec; if (!ftrace_enabled || ftrace_disabled) - return; - - resched = need_resched(); - preempt_disable_notrace(); - - /* - * We simply need to protect against recursion. - * Use the the raw version of smp_processor_id and not - * __get_cpu_var which can call debug hooks that can - * cause a recursive crash here. - */ - cpu = raw_smp_processor_id(); - per_cpu(ftrace_shutdown_disable_cpu, cpu)++; - if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) - goto out; - - if (unlikely(ftrace_record_suspend)) - goto out; - - key = hash_long(ip, FTRACE_HASHBITS); - - FTRACE_WARN_ON_ONCE(key >= FTRACE_HASHSIZE); - - if (ftrace_ip_in_hash(ip, key)) - goto out; - - /* This ip may have hit the hash before the lock */ - if (ftrace_ip_in_hash(ip, key)) - goto out; - - node = ftrace_alloc_dyn_node(ip); - if (!node) - goto out; + return NULL; - node->ip = ip; + rec = ftrace_alloc_dyn_node(ip); + if (!rec) + return NULL; - ftrace_add_hash(node, key); + rec->ip = ip; - out: - per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + list_add(&rec->list, &ftrace_new_addrs); - /* prevent recursion with scheduler */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + return rec; } #define FTRACE_ADDR ((long)(ftrace_caller)) @@ -543,7 +439,6 @@ static void ftrace_replace_code(int enable) rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { - ftrace_del_hash(rec); ftrace_free_rec(rec); } } @@ -551,15 +446,6 @@ static void ftrace_replace_code(int enable) } } -static void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -616,18 +502,11 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } -static int ftrace_update_code(void *ignore); - static int __ftrace_modify_code(void *data) { int *command = data; if (*command & FTRACE_ENABLE_CALLS) { - /* - * Update any recorded ips now that we have the - * machine stopped - */ - ftrace_update_code(NULL); ftrace_replace_code(1); tracing_on = 1; } else if (*command & FTRACE_DISABLE_CALLS) { @@ -738,84 +617,34 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void *ignore) +static int ftrace_update_code(void) { - int i, save_ftrace_enabled; + struct dyn_ftrace *p, *t; cycle_t start, stop; - struct dyn_ftrace *p; - struct hlist_node *t, *n; - struct hlist_head *head, temp_list; - - /* Don't be recording funcs now */ - ftrace_record_suspend++; - save_ftrace_enabled = ftrace_enabled; - ftrace_enabled = 0; start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - /* No locks needed, the machine is stopped! */ - for (i = 0; i < FTRACE_HASHSIZE; i++) { - INIT_HLIST_HEAD(&temp_list); - head = &ftrace_hash[i]; + list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(p, t, n, head, node) { - /* Skip over failed records which have not been - * freed. */ - if (p->flags & FTRACE_FL_FAILED) - continue; + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - /* Unconverted records are always at the head of the - * hash bucket. Once we encounter a converted record, - * simply skip over to the next bucket. Saves ftraced - * some processor cycles (ftrace does its bid for - * global warming :-p ). */ - if (p->flags & (FTRACE_FL_CONVERTED)) - break; + list_del_init(&p->list); - /* Ignore updates to this record's mcount site. - * Reintroduce this record at the head of this - * bucket to attempt to "convert" it again if - * the kprobe on it is unregistered before the - * next run. */ - if (get_kprobe((void *)p->ip)) { - ftrace_del_hash(p); - INIT_HLIST_NODE(&p->node); - hlist_add_head(&p->node, &temp_list); - freeze_record(p); - continue; - } else { - unfreeze_record(p); - } - - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else { - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(p->ip)) { - ftrace_del_hash(p); - ftrace_free_rec(p); - } - } - } - - hlist_for_each_entry_safe(p, t, n, &temp_list, node) { - hlist_del(&p->node); - INIT_HLIST_NODE(&p->node); - hlist_add_head(&p->node, head); - } + /* convert record (i.e, patch mcount-call with NOP) */ + if (ftrace_code_disable(p)) { + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + } else + ftrace_free_rec(p); } stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - ftrace_enabled = save_ftrace_enabled; - ftrace_record_suspend--; - return 0; } @@ -847,7 +676,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) pg = ftrace_pages = ftrace_pages_start; cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld hash entries in %d pages\n", + pr_info("ftrace: allocating %ld entries in %d pages\n", num_to_init, cnt); for (i = 0; i < cnt; i++) { @@ -1451,20 +1280,18 @@ static int ftrace_convert_nops(unsigned long *start, unsigned long addr; unsigned long flags; + mutex_lock(&ftrace_start_lock); p = start; while (p < end) { addr = ftrace_call_adjust(*p++); - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); ftrace_record_ip(addr); - spin_unlock(&ftrace_lock); - ftrace_shutdown_replenish(); } - /* p is ignored */ + /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(p); + ftrace_update_code(); local_irq_restore(flags); + mutex_unlock(&ftrace_start_lock); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 333a516..06951e2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -865,9 +865,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_function_enabled)) return; - if (skip_trace(ip)) - return; - pc = preempt_count(); resched = need_resched(); preempt_disable_notrace(); -- cgit v1.1 From 66b0de3569b00f61978782b9f97aa4803dbec0fb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Oct 2008 16:11:03 +0200 Subject: ftrace: fix build failure fix: kernel/trace/ftrace.c: In function 'ftrace_release': kernel/trace/ftrace.c:271: error: implicit declaration of function 'ftrace_release_hash' release_hash is not needed without dftraced. Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 07762c0..2721232 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -267,8 +267,6 @@ void ftrace_release(void *start, unsigned long size) } } spin_unlock(&ftrace_lock); - - ftrace_release_hash(s, e); } static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -- cgit v1.1 From acff181d3574244e651913df77332e897b88bff4 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Oct 2008 19:49:09 -0400 Subject: printk: remove unused code from kernel/printk.c both log_buf_copy() and log_buf_len are unused. Signed-off-by: Ingo Molnar --- kernel/printk.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6341af7..f492f15 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -233,45 +233,6 @@ static inline void boot_delay_msec(void) #endif /* - * Return the number of unread characters in the log buffer. - */ -static int log_buf_get_len(void) -{ - return logged_chars; -} - -/* - * Copy a range of characters from the log buffer. - */ -int log_buf_copy(char *dest, int idx, int len) -{ - int ret, max; - bool took_lock = false; - - if (!oops_in_progress) { - spin_lock_irq(&logbuf_lock); - took_lock = true; - } - - max = log_buf_get_len(); - if (idx < 0 || idx >= max) { - ret = -1; - } else { - if (len > max) - len = max; - ret = len; - idx += (log_end - max); - while (len-- > 0) - dest[len] = LOG_BUF(idx + len); - } - - if (took_lock) - spin_unlock_irq(&logbuf_lock); - - return ret; -} - -/* * Commands to do_syslog: * * 0 -- Close the log. Currently a NOP. -- cgit v1.1 From bea92112415635ecb7e681355834413c7c048f67 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 22 Oct 2008 19:31:11 +0900 Subject: kernel/resource: fix reserve_region_with_split() section mismatch Impact: cleanup, small kernel text size reduction, no functionality changed reserve_region_with_split() calls in to __reserve_region_with_split(), which is an __init function. The only caller of reserve_region_with_split() is an __init function, so make it __init too. Signed-off-by: Paul Mundt Signed-off-by: Ingo Molnar --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4089d12..7fec0e4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -571,7 +571,7 @@ static void __init __reserve_region_with_split(struct resource *root, } -void reserve_region_with_split(struct resource *root, +void __init reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { -- cgit v1.1 From 01c8c57d668d94f1036d9ab11a22aa24ca16a35d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:12 +0200 Subject: sched: fix a find_busiest_group buglet In one of the group load balancer patches: commit 408ed066b11cf9ee4536573b4269ee3613bd735e Author: Peter Zijlstra Date: Fri Jun 27 13:41:28 2008 +0200 Subject: sched: hierarchical load vs find_busiest_group The following change: - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + if (max_load - this_load + 2*busiest_load_per_task >= busiest_load_per_task * imbn) { made the condition always true, because imbn is [1,2]. Therefore, remove the 2*, and give the it a fair chance. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6625c3c..12bc367 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3344,7 +3344,7 @@ small_imbalance: } else this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (max_load - this_load + 2*busiest_load_per_task >= + if (max_load - this_load + busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; -- cgit v1.1 From 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:13 +0200 Subject: sched: more accurate min_vruntime accounting Mike noticed the current min_vruntime tracking can go wrong and skip the current task. If the only remaining task in the tree is a nice 19 task with huge vruntime, new tasks will be inserted too far to the right too, causing some interactibity issues. min_vruntime can only change due to the leftmost entry disappearing (dequeue_entity()), or by the leftmost entry being incremented past the next entry, which elects a new leftmost (__update_curr()) Due to the current entry not being part of the actual tree, we have to compare the leftmost tree entry with the current entry, and take the leftmost of these two. So create a update_min_vruntime() function that takes computes the leftmost vruntime in the system (either tree of current) and increases the cfs_rq->min_vruntime if the computed value is larger than the previously found min_vruntime. And call this from the two sites we've identified that can change min_vruntime. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42d211f..b27ccc5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -223,6 +223,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) return se->vruntime - cfs_rq->min_vruntime; } +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ + u64 vruntime = cfs_rq->min_vruntime; + + if (cfs_rq->curr) + vruntime = cfs_rq->curr->vruntime; + + if (cfs_rq->rb_leftmost) { + struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, + struct sched_entity, + run_node); + + if (vruntime == cfs_rq->min_vruntime) + vruntime = se->vruntime; + else + vruntime = min_vruntime(vruntime, se->vruntime); + } + + cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +} + /* * Enqueue an entity into the rb-tree: */ @@ -256,15 +277,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) { + if (leftmost) cfs_rq->rb_leftmost = &se->run_node; - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. - */ - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, se->vruntime); - } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -274,18 +288,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) { struct rb_node *next_node; - struct sched_entity *next; next_node = rb_next(&se->run_node); cfs_rq->rb_leftmost = next_node; - - if (next_node) { - next = rb_entry(next_node, - struct sched_entity, run_node); - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, - next->vruntime); - } } if (cfs_rq->next == se) @@ -424,6 +429,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; + update_min_vruntime(cfs_rq); } static void update_curr(struct cfs_rq *cfs_rq) @@ -613,13 +619,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 vruntime; - - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(cfs_rq->min_vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = cfs_rq->min_vruntime; + u64 vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, @@ -696,6 +696,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); + update_min_vruntime(cfs_rq); } /* -- cgit v1.1 From 0d13033bc9257fe65c1aa25e84568b1608da0901 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 24 Oct 2008 11:06:14 +0200 Subject: sched: weaken sync hint Mysql+oltp and pgsql+oltp peaks are still shifted right. The below puts the peaks back to 1 client/server pair per core. Use the avg_overlap information to weaken the sync hint. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b27ccc5..b71ee2c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1123,10 +1123,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; - if (!sync && sched_feat(SYNC_WAKEUPS) && - curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - sync = 1; + if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; /* * If sync wakeup then subtract the (maximum possible) -- cgit v1.1 From 464b75273f64be7c81fee975bd6ca9593df3427b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:15 +0200 Subject: sched: re-instate vruntime based wakeup preemption The advantage is that vruntime based wakeup preemption has a better conceptual model. Here wakeup_gran = 0 means: preempt when 'fair'. Therefore wakeup_gran is the granularity of unfairness we allow in order to make progress. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b71ee2c..7af17e0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -143,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + +static void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ + int se_depth, pse_depth; + + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. + */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(*se); + pse_depth = depth_se(*pse); + + while (se_depth > pse_depth) { + se_depth--; + *se = parent_entity(*se); + } + + while (pse_depth > se_depth) { + pse_depth--; + *pse = parent_entity(*pse); + } + + while (!is_same_group(*se, *pse)) { + *se = parent_entity(*se); + *pse = parent_entity(*pse); + } +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -193,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return NULL; } +static inline void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1244,13 +1292,42 @@ static unsigned long wakeup_gran(struct sched_entity *se) * More easily preempt - nice tasks, while not making it harder for * + nice tasks. */ - if (sched_feat(ASYM_GRAN)) - gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); + if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } /* + * Should 'se' preempt 'curr'. + * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff <= 0) + return -1; + + gran = wakeup_gran(curr); + if (vdiff > gran) + return 1; + + return 0; +} + +/* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) @@ -1258,7 +1335,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1296,9 +1372,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; - if (delta_exec > wakeup_gran(pse)) - resched_task(curr); + find_matching_se(&se, &pse); + + while (se) { + BUG_ON(!pse); + + if (wakeup_preempt_entity(se, pse) == 1) { + resched_task(curr); + break; + } + + se = parent_entity(se); + pse = parent_entity(pse); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.1 From 3f3a490480d8ab96e0fe30a41f80f14e6a0c579d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:16 +0200 Subject: sched: virtual time buddy preemption Since we moved wakeup preemption back to virtual time, it makes sense to move the buddy stuff back as well. The purpose of the buddy scheduling is to allow a quickly scheduling pair of tasks to run away from the group as far as a regular busy task would be allowed under wakeup preemption. This has the advantage that the pair can ping-pong for a while, enjoying cache-hotness. Without buddy scheduling other tasks would interleave destroying the cache. Also, it saves a word in cfs_rq. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - kernel/sched_fair.c | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 12bc367..e8819bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -386,7 +386,6 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; - u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7af17e0..ce514af 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -791,16 +791,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rq *rq = rq_of(cfs_rq); - u64 pair_slice = rq->clock - cfs_rq->pair_start; - - if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) { - cfs_rq->pair_start = rq->clock; + if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1) return se; - } return cfs_rq->next; } -- cgit v1.1 From f17845e5d97ead8fbdadfd40039e058ec7cf4a42 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Oct 2008 12:47:10 +0200 Subject: ftrace: warning in kernel/trace/ftrace.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/trace/ftrace.c:189: warning: ‘frozen_record_count’ defined but not used triggers because frozen_record_count is only used in the KCONFIG_MARKERS case. Move the variable it there. Alas, this frozen-record facility seems to have little use. The frozen_record_count variable is not used by anything, nor the flags. So this section might need a bit of dead-code-removal care as well. Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2721232..7618c52 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -186,7 +186,6 @@ enum { static int ftrace_filtered; static int tracing_on; -static int frozen_record_count; static LIST_HEAD(ftrace_new_addrs); @@ -211,6 +210,9 @@ static struct dyn_ftrace *ftrace_free_records; #ifdef CONFIG_KPROBES + +static int frozen_record_count; + static inline void freeze_record(struct dyn_ftrace *rec) { if (!(rec->flags & FTRACE_FL_FROZEN)) { @@ -1443,3 +1445,4 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_sysctl_lock); return ret; } + -- cgit v1.1 From e2862c9470beb842d3f1c1965b03a2112114c160 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 27 Oct 2008 17:43:28 +1100 Subject: trace: fix printk warning for u64 A powerpc ppc64_defconfig build produces these warnings: kernel/trace/ring_buffer.c: In function 'rb_add_time_stamp': kernel/trace/ring_buffer.c:969: warning: format '%llu' expects type 'long long unsigned int', but argument 2 has type 'u64' kernel/trace/ring_buffer.c:969: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'u64' kernel/trace/ring_buffer.c:969: warning: format '%llu' expects type 'long long unsigned int', but argument 4 has type 'u64' Just cast the u64s to unsigned long long like we do everywhere else. Signed-off-by: Stephen Rothwell Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 091aeef..cedf4e2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -966,7 +966,9 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(*delta > (1ULL << 59) && !once++)) { printk(KERN_WARNING "Delta way too big! %llu" " ts=%llu write stamp = %llu\n", - *delta, *ts, cpu_buffer->write_stamp); + (unsigned long long)*delta, + (unsigned long long)*ts, + (unsigned long long)cpu_buffer->write_stamp); WARN_ON(1); } -- cgit v1.1 From f66af459a931f25807e1df7915b2b66bb5978d82 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Oct 2008 19:14:55 +0200 Subject: tracepoint: check if the probe has been registered Impact: fix kernel crash that can trigger during tracing If we try to remove a probe that has not been already registered, the tracepoint_entry_remove_probe() function will dereference a NULL pointer. Check the probe before removing it to avoid crashes. Signed-off-by: Frederic Weisbecker Acked-by: Mathieu Desnoyers Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f2b7c28..af8c856 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -131,6 +131,9 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; + if (!old) + return NULL; + debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ for (nr_probes = 0; old[nr_probes]; nr_probes++) { @@ -388,6 +391,11 @@ int tracepoint_probe_unregister(const char *name, void *probe) if (entry->rcu_pending) rcu_barrier_sched(); old = tracepoint_entry_remove_probe(entry, probe); + if (!old) { + printk(KERN_WARNING "Warning: Trying to unregister a probe" + "that doesn't exist\n"); + goto end; + } mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ mutex_lock(&tracepoints_mutex); -- cgit v1.1 From ea31e72d753e5817a97de552f152d0cb55c7defc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Oct 2008 19:26:23 +0200 Subject: tracing/ftrace: make boot tracer select the sched_switch tracer Impact: build fix If the boot tracer is selected but not the sched_switch, there will be a build failure: kernel/built-in.o: In function `boot_trace_init': trace_boot.c:(.text+0x5ee38): undefined reference to `sched_switch_trace' kernel/built-in.o: In function `disable_boot_trace': (.text+0x5eee1): undefined reference to `tracing_stop_cmdline_record' kernel/built-in.o: In function `enable_boot_trace': (.text+0x5ef11): undefined reference to `tracing_start_cmdline_record' This patch fixes it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bc535cb..e0cea28 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -123,6 +123,7 @@ config BOOT_TRACER bool "Trace boot initcalls" depends on DEBUG_KERNEL select TRACING + select CONTEXT_SWITCH_TRACER help This tracer helps developers to optimize boot times: it records the timings of the initcalls and traces key events and the identity -- cgit v1.1 From 21798a84ab383cdac0e7ee3368e0792b718b867d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 28 Oct 2008 09:43:26 +0100 Subject: tracing: fix a build error on alpha Impact: build fix on Alpha When tracing is enabled, some arch have included on their but others like alpha or m68k don't. Build error on alpha: kernel/trace/trace.c: In function 'tracing_cpumask_write': kernel/trace/trace.c:2145: error: implicit declaration of function 'raw_local_irq_disable' kernel/trace/trace.c:2162: error: implicit declaration of function 'raw_local_irq_enable' Tested on Alpha through a cross-compiler (should correct a similar issue on m68k). Reported-by: Alexey Dobriyan Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06951e2..bc577dc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -34,6 +34,7 @@ #include #include +#include #include "trace.h" -- cgit v1.1 From 46fec7ac40e452a2ea5e63648d98b6bb2b5898f9 Mon Sep 17 00:00:00 2001 From: qinghuang feng Date: Tue, 28 Oct 2008 17:24:28 +0800 Subject: lockdep: minor fix for debug_show_all_locks() When we failed to get tasklist_lock eventually (count equals 0), we should only print " ignoring it.\n", and not print " locked it.\n" needlessly. Signed-off-by: Qinghuang Feng Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index dbda475..11832ac 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3417,9 +3417,10 @@ retry: } printk(" ignoring it.\n"); unlock = 0; + } else { + if (count != 10) + printk(KERN_CONT " locked it.\n"); } - if (count != 10) - printk(" locked it.\n"); do_each_thread(g, p) { /* -- cgit v1.1 From 6afe40b4dace385d7ba2faf24b352f066f3b71bf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 28 Oct 2008 11:14:58 +0100 Subject: lockdep: fix irqs on/off ip tracing Impact: fix lockdep lock-api-caller output when irqsoff tracing is enabled 81d68a96 "ftrace: trace irq disabled critical timings" added wrappers around trace_hardirqs_on/off_caller. However these functions use __builtin_return_address(0) to figure out which function actually disabled or enabled irqs. The result is that we save the ips of trace_hardirqs_on/off instead of the real caller. Not very helpful. However since the patch from Steven the ip already gets passed. So use that and get rid of __builtin_return_address(0) in these two functions. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 11832ac..06e1571 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2169,12 +2169,11 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on_caller(unsigned long a0) +void trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; - unsigned long ip; - time_hardirqs_on(CALLER_ADDR0, a0); + time_hardirqs_on(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2188,7 +2187,6 @@ void trace_hardirqs_on_caller(unsigned long a0) } /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; - ip = (unsigned long) __builtin_return_address(0); if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2224,11 +2222,11 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long a0) +void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_off(CALLER_ADDR0, a0); + time_hardirqs_off(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2241,7 +2239,7 @@ void trace_hardirqs_off_caller(unsigned long a0) * We have done an ON -> OFF transition: */ curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = _RET_IP_; + curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); } else -- cgit v1.1 From 60063a66236c15f5613f91390631e06718689782 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2008 10:44:24 -0400 Subject: ftrace: fix current_tracer error return The commit (in linux-tip) c2931e05ec5965597cbfb79ad332d4a29aeceb23 ( ftrace: return an error when setting a nonexistent tracer ) added useful code that would error when a bad tracer was written into the current_tracer file. But this had a bug if the amount written was more than the amount read by that code. The first iteration would set the tracer correctly, but since it did not consume the rest of what was written (usually whitespace), the userspace utility would continue to write what was not consumed. This second iteration would fail to find a tracer and return -EINVAL. Funny thing is that the tracer would have already been set. This patch just consumes all the data that is written to the file. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc577dc..a610ca7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2377,9 +2377,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, int i; size_t ret; + ret = cnt; + if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; - ret = cnt; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2412,8 +2413,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); - if (ret == cnt) - filp->f_pos += cnt; + if (ret > 0) + filp->f_pos += ret; return ret; } -- cgit v1.1 From 0b6e4d56bf71866a2b58daa8323cf747988ce7e4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 28 Oct 2008 20:17:38 +0100 Subject: ftrace: perform an initialization for ftrace to enable it Impact: corrects a bug which made the non-dyn function tracer not functional With latest git, the non-dynamic function tracer didn't get any trace. The problem was the fact that ftrace_enabled wasn't initialized to 1 because ftrace hasn't any init function when DYNAMIC_FTRACE is disabled. So when a tracer tries to register an ftrace_ops struct, __register_ftrace_function failed to set the hook. This patch corrects it by setting an init function to initialize ftrace during the boot. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7618c52..4a39d24 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1339,6 +1339,14 @@ void __init ftrace_init(void) } #else + +static int __init ftrace_nodyn_init(void) +{ + ftrace_enabled = 1; + return 0; +} +device_initcall(ftrace_nodyn_init); + # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) -- cgit v1.1 From d68612b257b5f4ea2e6535859c5a26b10011a9df Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 28 Oct 2008 11:45:42 -0700 Subject: resources: fix x86info results ioremap.c:226 __ioremap_caller+0xf2/0x2d6() WARNINGs Impact: avoid false-positive WARN_ON() Andi Kleen reported: > When running x86info on a 2.6.27-git8 system I get > > resource map sanity check conflict: 0x9e000 0x9efff 0x10000 0x9e7ff System RAM > ------------[ cut here ]------------ > WARNING: at /home/lsrc/linux/arch/x86/mm/ioremap.c:226 __ioremap_caller+0xf2/0x2d6() > ... Some of the pages below the 1MB ISA addresses will be shared typically by both BIOS and system usable RAM. For example: BIOS-e820: 0000000000000000 - 000000000009f800 (usable) BIOS-e820: 000000000009f800 - 00000000000a0000 (reserved) x86info reads the low physical address using /dev/mem, which internally uses ioremap() for accessing non RAM pages. ioremap() of such low pages conflicts with multiple resource entities leading to the above warning. Change the iomem_map_sanity_check() to allow mapping a page spanning multiple resource entities (minimum granularity that one can map is a page anyhow). Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/resource.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7fec0e4..6aac5c6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -849,7 +850,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) continue; if (p->end < addr) continue; - if (p->start <= addr && (p->end >= addr + size - 1)) + if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && + PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) continue; printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", -- cgit v1.1 From f3384b28a05624783b53836ccfed95ecde66a7ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Oct 2008 11:15:57 -0400 Subject: ftrace: fix trace_nop config select Impact: build fix on non-function-tracing architectures The trace_nop is the tracer that is defined when no tracer is set in the ftrace infrastructure. The trace_nop was mistakenly selected by HAVE_FTRACE due to the confusion between ftrace infrastructure and the ftrace function tracer (which has been solved by renaming the function tracer). This patch changes the select to the approriate TRACING. This patch should fix compile errors on architectures that do not define the FUNCTION_TRACER. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e0cea28..b58f43b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -8,7 +8,6 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool - select NOP_TRACER config HAVE_DYNAMIC_FTRACE bool @@ -28,6 +27,7 @@ config TRACING select RING_BUFFER select STACKTRACE select TRACEPOINTS + select NOP_TRACER menu "Tracers" -- cgit v1.1 From a9cf4ddb3b2ce03c3027929b22a920aeff933009 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:34 +0800 Subject: sched: change sched_debug's mode to 0444 Impact: change /proc/sched/debug from rw-r--r-- to r--r--r-- /proc/sched_debug is read-only. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ad958c1..5ae1776 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -319,7 +319,7 @@ static int __init init_sched_debug_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); + pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); if (!pe) return -ENOMEM; return 0; -- cgit v1.1 From 7ccb97437bcc818d0ba6067513475f6ee8177a15 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:51 -0700 Subject: freezer_cg: fix improper BUG_ON() causing oops The BUG_ON() should be protected by freezer->lock, otherwise it can be triggered easily when a task has been unfreezed but the corresponding cgroup hasn't been changed to FROZEN state. Signed-off-by: Li Zefan Acked-by: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e950569..7f54d1c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -190,8 +190,9 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) freezer = task_freezer(task); task_unlock(task); - BUG_ON(freezer->state == CGROUP_FROZEN); spin_lock_irq(&freezer->lock); + BUG_ON(freezer->state == CGROUP_FROZEN); + /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) freeze_task(task, true); -- cgit v1.1 From 80a6a2cf3bebcf20285cf05373b9c5ec96816577 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:52 -0700 Subject: freezer_cg: remove redundant check in freezer_can_attach() It is sufficient to check if @task is frozen, and no need to check if the original freezer is frozen. Signed-off-by: Li Zefan Acked-by: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7f54d1c..e9c856a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -162,9 +162,13 @@ static int freezer_can_attach(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; - int retval; - /* Anything frozen can't move or be moved to/from */ + /* + * Anything frozen can't move or be moved to/from. + * + * Since orig_freezer->state == FROZEN means that @task has been + * frozen, so it's sufficient to check the latter condition. + */ if (is_task_frozen_enough(task)) return -EBUSY; @@ -173,13 +177,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; - retval = 0; - task_lock(task); - freezer = task_freezer(task); - if (freezer->state == CGROUP_FROZEN) - retval = -EBUSY; - task_unlock(task); - return retval; + return 0; } static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) -- cgit v1.1 From 00c2e63c31d0f431952ff2a671c5c6997dd4f8b2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:53 -0700 Subject: freezer_cg: use thaw_process() in unfreeze_cgroup() Don't duplicate the implementation of thaw_process(). [akpm@linux-foundation.org: make __thaw_process() static] Signed-off-by: Li Zefan Cc: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 15 ++++----------- kernel/freezer.c | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e9c856a..5e6d26b 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -275,25 +275,18 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) return num_cant_freeze_now ? -EBUSY : 0; } -static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - int do_wake; - - task_lock(task); - do_wake = __thaw_process(task); - task_unlock(task); - if (do_wake) - wake_up_process(task); + thaw_process(task); } cgroup_iter_end(cgroup, &it); - freezer->state = CGROUP_THAWED; - return 0; + freezer->state = CGROUP_THAWED; } static int freezer_change_state(struct cgroup *cgroup, @@ -320,7 +313,7 @@ static int freezer_change_state(struct cgroup *cgroup, } /* state == FREEZING and goal_state == THAWED, so unfreeze */ case CGROUP_FROZEN: - retval = unfreeze_cgroup(cgroup, freezer); + unfreeze_cgroup(cgroup, freezer); break; default: break; diff --git a/kernel/freezer.c b/kernel/freezer.c index ba6248b..2f4936c 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -121,16 +121,7 @@ void cancel_freezing(struct task_struct *p) } } -/* - * Wake up a frozen process - * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. - */ -int __thaw_process(struct task_struct *p) +static int __thaw_process(struct task_struct *p) { if (frozen(p)) { p->flags &= ~PF_FROZEN; @@ -140,6 +131,15 @@ int __thaw_process(struct task_struct *p) return 0; } +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ int thaw_process(struct task_struct *p) { task_lock(p); -- cgit v1.1 From 51308ee59dee1136ed599d875ea8968d7be55c91 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:54 -0700 Subject: freezer_cg: simplify freezer_change_state() Just call unfreeze_cgroup() if goal_state == THAWED, and call try_to_freeze_cgroup() if goal_state == FROZEN. No behavior has been changed. Signed-off-by: Li Zefan Acked-by: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e6d26b..7fa476f 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -296,27 +296,22 @@ static int freezer_change_state(struct cgroup *cgroup, int retval = 0; freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + update_freezer_state(cgroup, freezer); if (goal_state == freezer->state) goto out; - switch (freezer->state) { + + switch (goal_state) { case CGROUP_THAWED: - retval = try_to_freeze_cgroup(cgroup, freezer); + unfreeze_cgroup(cgroup, freezer); break; - case CGROUP_FREEZING: - if (goal_state == CGROUP_FROZEN) { - /* Userspace is retrying after - * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ - retval = try_to_freeze_cgroup(cgroup, freezer); - break; - } - /* state == FREEZING and goal_state == THAWED, so unfreeze */ case CGROUP_FROZEN: - unfreeze_cgroup(cgroup, freezer); + retval = try_to_freeze_cgroup(cgroup, freezer); break; default: - break; + BUG(); } out: spin_unlock_irq(&freezer->lock); -- cgit v1.1 From ce05fcc30ea41c85f9d50bee1ce289f7cb7fb223 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 29 Oct 2008 14:01:07 -0700 Subject: kernel/profile: fix profile_init() section mismatch profile_init() calls in to alloc_bootmem() on early initialization. While alloc_bootmem() is __init, the reference itself is safe in that it is tucked below a !slab_is_available() check. So, flag profile_init() as __ref. Signed-off-by: Paul Mundt Cc: Dave Hansen Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index a9e422d..9830a03 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -102,7 +102,7 @@ int profile_setup(char *str) __setup("profile=", profile_setup); -int profile_init(void) +int __ref profile_init(void) { int buffer_bytes; if (!prof_on) -- cgit v1.1 From d25141a818383b3c3b09f065698c544a7a0ec6e7 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 29 Oct 2008 14:01:11 -0700 Subject: 'kill sig -1' must only apply to caller's namespace Currently "kill -1" kills processes in all namespaces and breaks the isolation of namespaces. Earlier attempt to fix this was discussed at: http://lkml.org/lkml/2008/7/23/148 As suggested by Oleg Nesterov in that thread, use "task_pid_vnr() > 1" check since task_pid_vnr() returns 0 if process is outside the caller's namespace. Signed-off-by: Sukadev Bhattiprolu Acked-by: Eric W. Biederman Tested-by: Daniel Hokka Zakrisson Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 105217d..4530fc6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1144,7 +1144,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) struct task_struct * p; for_each_process(p) { - if (p->pid > 1 && !same_thread_group(p, current)) { + if (task_pid_vnr(p) > 1 && + !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); ++count; if (err != -EPERM) -- cgit v1.1 From 9244489a7b69fe0746dc7cb3957f02e05bd1ceb0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Oct 2008 09:42:59 -0400 Subject: ftrace: handle archs that do not support irqs_disabled_flags Impact: build fix on non-lockdep architectures Some architectures do not support a way to read the irq flags that is set from "local_irq_save(flags)" to determine if interrupts were disabled or enabled. Ftrace uses this information to display to the user if the trace occurred with interrupts enabled or disabled. Besides the fact that those archs that do not support this will fail to compile, unless they fix it, we do not want to have the trace simply say interrupts were not disabled or they were enabled, without knowing the real answer. This patch adds a 'X' in the output to let the user know that the architecture they are running on does not support a way for the tracer to determine if interrupts were enabled or disabled. It also lets those same archs compile with tracing enabled. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 7 ++++++- kernel/trace/trace.h | 20 +++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a610ca7..8a499e2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -656,7 +656,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; entry->flags = +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | +#else + TRACE_FLAG_IRQS_NOSUPPORT | +#endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); @@ -1244,7 +1248,8 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); trace_seq_printf(s, "%3d", cpu); trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6889ca4..8465ad0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -120,18 +120,20 @@ struct trace_boot { /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * NEED_RESCED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - * CONT - multiple entries hold the trace item + * IRQS_OFF - interrupts were disabled + * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + * CONT - multiple entries hold the trace item */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_NEED_RESCHED = 0x02, - TRACE_FLAG_HARDIRQ = 0x04, - TRACE_FLAG_SOFTIRQ = 0x08, - TRACE_FLAG_CONT = 0x10, + TRACE_FLAG_IRQS_NOSUPPORT = 0x02, + TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_CONT = 0x20, }; #define TRACE_BUF_SIZE 1024 -- cgit v1.1 From 42c0202363194007a1ac377d047a95aa39246eb0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Nov 2008 09:53:58 -0700 Subject: reserve_region_with_split: Fix GFP_KERNEL usage under spinlock This one apparently doesn't generate any warnings, because the function is only used during system bootup, when the warnings are disabled. But it's still very wrong. The __reserve_region_with_split() function is called with the resource_lock held for writing, so it must only ever do GFP_ATOMIC allocations. Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 6aac5c6..4337063 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -523,7 +523,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); if (!res) return; -- cgit v1.1 From 28959742c14918f153c1de641bb12b4ea44315a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Nov 2008 18:20:09 +0000 Subject: PM_TEST_SUSPEND should depend on RTC_CLASS, not RTC_LIB Insufficient dependency - we really want CONFIG_RTC_CLASS=y there. That will give us CONFIG_RTC_LIB=y, so the old dependency can be simply replaced. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index dcd165f..23bd4da 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -96,7 +96,7 @@ config SUSPEND config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_LIB=y + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y ---help--- This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. -- cgit v1.1 From c2c80529460095035752bf0ecc1af82c1e0f6e0f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 Oct 2008 19:50:41 +0000 Subject: tracing, alpha: undefined reference to `save_stack_trace' Impact: build fix on !stacktrace architectures only select STACKTRACE on architectures that have STACKTRACE_SUPPORT ... since we also need to ifdef out the guts of ftrace_trace_stack(). We also want to disallow setting TRACE_ITER_STACKTRACE in trace_flags on such configs, but that can wait. Signed-off-by: Al Viro Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 +- kernel/trace/trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b58f43b..33dbefd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -25,7 +25,7 @@ config TRACING bool select DEBUG_FS select RING_BUFFER - select STACKTRACE + select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a499e2..85bee77 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -705,6 +705,7 @@ static void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { +#ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -730,6 +731,7 @@ static void ftrace_trace_stack(struct trace_array *tr, save_stack_trace(&trace); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif } void __trace_stack(struct trace_array *tr, -- cgit v1.1 From b3aa557722b3d5858f14ca559e03461c24125aaf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 15:44:07 -0400 Subject: ftrace: use kretprobe trampoline name to test in output Impact: ia64+tracing build fix When a function is kprobed, the return address is set to the kprobe_trampoline, or something similar. This caused the output of the trace to look confusing when the parent seemed to be this "kprobe_trampoline" function. To fix this, Abhishek Sagar added a test of the instruction pointer of the parent to see if it matched the kprobe_trampoline. If it did, the output would print a "[unknown/kretprobe'd]" instead. Unfortunately, not all archs do this the same way, and the trampoline function may not be exported, which causes failures in builds. This patch will compare the name instead of the pointer to see if it matches. This prevents us from depending on a function from being exported, and should work on all archs. The worst that can happen is that an arch might use a different name and then we go back to the confusing output. At least the arch will still build. Reported-by: Abhishek Sagar Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Tested-by: Abhishek Sagar Acked-by: Abhishek Sagar --- kernel/trace/trace.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85bee77..9f3b478 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1088,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -#define KRETPROBE_MSG "[unknown/kretprobe'd]" - #ifdef CONFIG_KRETPROBES -static inline int kretprobed(unsigned long addr) +static inline const char *kretprobed(const char *name) { - return addr == (unsigned long)kretprobe_trampoline; + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; } #else -static inline int kretprobed(unsigned long addr) +static inline const char *kretprobed(const char *name) { - return 0; + return name; } #endif /* CONFIG_KRETPROBES */ @@ -1107,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; + const char *name; kallsyms_lookup(address, NULL, NULL, NULL, str); - return trace_seq_printf(s, fmt, str); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); #endif return 1; } @@ -1121,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; + const char *name; sprint_symbol(str, address); - return trace_seq_printf(s, fmt, str); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); #endif return 1; } @@ -1377,10 +1386,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_puts(s, " ("); - if (kretprobed(field->parent_ip)) - trace_seq_puts(s, KRETPROBE_MSG); - else - seq_print_ip_sym(s, field->parent_ip, sym_flags); + seq_print_ip_sym(s, field->parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; } @@ -1496,12 +1502,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) ret = trace_seq_printf(s, " <-"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (kretprobed(field->parent_ip)) - ret = trace_seq_puts(s, KRETPROBE_MSG); - else - ret = seq_print_ip_sym(s, - field->parent_ip, - sym_flags); + ret = seq_print_ip_sym(s, + field->parent_ip, + sym_flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v1.1 From 818e3dd30a4ff34fff6d90e87ae59c73f6a53691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 09:58:35 -0400 Subject: tracing, ring-buffer: add paranoid checks for loops While writing a new tracer, I had a bug where I caused the ring-buffer to recurse in a bad way. The bug was with the tracer I was writing and not the ring-buffer itself. But it took a long time to find the problem. This patch adds paranoid checks into the ring-buffer infrastructure that will catch bugs of this nature. Note: I put the bug back in the tracer and this patch showed the error nicely and prevented the lockup. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cedf4e2..3f33806 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1022,8 +1022,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; u64 ts, delta; int commit = 0; + int nr_loops = 0; again: + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (unlikely(++nr_loops > 1000)) { + RB_WARN_ON(cpu_buffer, 1); + return NULL; + } + ts = ring_buffer_time_stamp(cpu_buffer->cpu); /* @@ -1532,10 +1547,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long flags; + int nr_loops = 0; spin_lock_irqsave(&cpu_buffer->lock, flags); again: + /* + * This should normally only loop twice. But because the + * start of the reader inserts an empty page, it causes + * a case where we will loop three times. There should be no + * reason to loop four times (that I know of). + */ + if (unlikely(++nr_loops > 3)) { + RB_WARN_ON(cpu_buffer, 1); + reader = NULL; + goto out; + } + reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ @@ -1665,6 +1693,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; + int nr_loops = 0; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; @@ -1672,6 +1701,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) cpu_buffer = buffer->buffers[cpu]; again: + /* + * We repeat when a timestamp is encountered. It is possible + * to get multiple timestamps from an interrupt entering just + * as one timestamp is about to be written. The max times + * that this can happen is the number of nested interrupts we + * can have. Nesting 10 deep of interrupts is clearly + * an anomaly. + */ + if (unlikely(++nr_loops > 10)) { + RB_WARN_ON(cpu_buffer, 1); + return NULL; + } + reader = rb_get_reader_page(cpu_buffer); if (!reader) return NULL; @@ -1722,6 +1764,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; + int nr_loops = 0; if (ring_buffer_iter_empty(iter)) return NULL; @@ -1730,6 +1773,19 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) buffer = cpu_buffer->buffer; again: + /* + * We repeat when a timestamp is encountered. It is possible + * to get multiple timestamps from an interrupt entering just + * as one timestamp is about to be written. The max times + * that this can happen is the number of nested interrupts we + * can have. Nesting 10 deep of interrupts is clearly + * an anomaly. + */ + if (unlikely(++nr_loops > 10)) { + RB_WARN_ON(cpu_buffer, 1); + return NULL; + } + if (rb_per_cpu_empty(cpu_buffer)) return NULL; -- cgit v1.1 From 072ba49838b42c873c496d72c91bb237914cf3b6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sun, 26 Oct 2008 15:26:57 -0700 Subject: ftrace: fix breakage in bin_fmt results In 777e208d40d0953efc6fb4ab58590da3f7d8f02d we changed from outputting field->cpu (a char) to iter->cpu (unsigned int), increasing the resulting structure size by 3 bytes. Signed-off-by: Eric Anholt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478..974973e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1755,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); + SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { -- cgit v1.1 From f4b6755fb37595da3630d1d6fc130ea6888cd48f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:07 +0100 Subject: sched: cleanup fair task selection Impact: cleanup Clean up task selection Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ce514af..6167336 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -347,17 +347,17 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rb_leftmost; -} - static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) { - return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); + struct rb_node *left = cfs_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_entity, run_node); } -static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -794,28 +794,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); -static struct sched_entity * -pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1) return se; return cfs_rq->next; } -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = NULL; - - if (first_fair(cfs_rq)) { - se = __pick_next_entity(cfs_rq); - se = pick_next(cfs_rq, se); - set_next_entity(cfs_rq, se); - } - - return se; -} - static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -1396,6 +1384,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.1 From d95f98d0691d3aba5e35850011946a08c9b36428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:08 +0100 Subject: sched: fix fair preempt check Impact: fix cross-class preemption Inter-class wakeup preemptions should go on class order. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6167336..ebd6de8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1329,6 +1329,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } + if (unlikely(p->sched_class != &fair_sched_class)) + return; + if (unlikely(se == pse)) return; -- cgit v1.1 From 4793241be408b3926ee00c704d7da3b3faf3a05f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:09 +0100 Subject: sched: backward looking buddy Impact: improve/change/fix wakeup-buddy scheduling Currently we only have a forward looking buddy, that is, we prefer to schedule to the task we last woke up, under the presumption that its going to consume the data we just produced, and therefore will have cache hot benefits. This allows co-waking producer/consumer task pairs to run ahead of the pack for a little while, keeping their cache warm. Without this, we would interleave all pairs, utterly trashing the cache. This patch introduces a backward looking buddy, that is, suppose that in the above scenario, the consumer preempts the producer before it can go to sleep, we will therefore miss the wakeup from consumer to producer (its already running, after all), breaking the cycle and reverting to the cache-trashing interleaved schedule pattern. The backward buddy will try to schedule back to the task that woke us up in case the forward buddy is not available, under the assumption that the last task will be the one with the most cache hot task around barring current. This will basically allow a task to continue after it got preempted. In order to avoid starvation, we allow either buddy to get wakeup_gran ahead of the pack. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- kernel/sched_fair.c | 32 +++++++++++++++++++++++++------- kernel/sched_features.h | 1 + 3 files changed, 30 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc..82cc839 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -397,7 +397,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next; + struct sched_entity *curr, *next, *last; unsigned long nr_spread_over; @@ -1805,7 +1805,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) + if (sched_feat(CACHE_HOT_BUDDY) && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) return 1; if (p->sched_class != &fair_sched_class) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ebd6de8..a6b1db8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -341,9 +341,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->rb_leftmost = next_node; } - if (cfs_rq->next == se) - cfs_rq->next = NULL; - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -741,6 +738,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) #endif } + if (cfs_rq->last == se) + cfs_rq->last = NULL; + + if (cfs_rq->next == se) + cfs_rq->next = NULL; + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); @@ -798,10 +801,13 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1) - return se; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) + return cfs_rq->next; - return cfs_rq->next; + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) + return cfs_rq->last; + + return se; } static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) @@ -1319,10 +1325,11 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; if (unlikely(rt_prio(p->prio))) { + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_rq_clock(rq); update_curr(cfs_rq); resched_task(curr); @@ -1335,6 +1342,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (unlikely(se == pse)) return; + /* + * Only set the backward buddy when the current task is still on the + * rq. This can happen when a wakeup gets interleaved with schedule on + * the ->pre_schedule() or idle_balance() point, either of which can + * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, for + * obvious reasons its a bad idea to schedule back to the idle thread. + */ + if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) + cfs_rq_of(se)->last = se; cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index fda0162..da5d93b 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) +SCHED_FEAT(LAST_BUDDY, 1) -- cgit v1.1 From 02479099c286894644f8e96c6bbb535ab64662fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:10 +0100 Subject: sched: fix buddies for group scheduling Impact: scheduling order fix for group scheduling For each level in the hierarchy, set the buddy to point to the right entity. Therefore, when we do the hierarchical schedule, we have a fair chance of ending up where we meant to. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a6b1db8..51aa3e1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1319,6 +1319,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) return 0; } +static void set_last_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; +} + +static void set_next_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1352,8 +1364,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) * obvious reasons its a bad idea to schedule back to the idle thread. */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - cfs_rq_of(se)->last = se; - cfs_rq_of(pse)->next = pse; + set_last_buddy(se); + set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task -- cgit v1.1 From 561920a0d2bb6d63343e83acfd784c0a77bd28d1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 30 Oct 2008 18:28:41 +0100 Subject: generic-ipi: fix the smp_mb() placement smp_mb() is needed (to make the memory operations visible globally) before sending the ipi on the sender and the receiver (on Alpha atleast) needs smp_read_barrier_depends() in the handler before reading the call_single_queue list in a lock-free fashion. On x86, x2apic mode register accesses for sending IPI's don't have serializing semantics. So the need for smp_mb() before sending the IPI becomes more critical in x2apic mode. Remove the unnecessary smp_mb() in csd_flag_wait(), as the presence of that smp_mb() doesn't mean anything on the sender, when the ipi receiver is not doing any thing special (like memory fence) after clearing the CSD_FLAG_WAIT. Signed-off-by: Suresh Siddha Signed-off-by: Jens Axboe --- kernel/smp.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index f362a85..75c8dde 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data) { /* Wait for response */ do { - /* - * We need to see the flags store in the IPI handler - */ - smp_mb(); if (!(data->flags & CSD_FLAG_WAIT)) break; cpu_relax(); @@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data) list_add_tail(&data->list, &dst->list); spin_unlock_irqrestore(&dst->lock, flags); + /* + * Make the list addition visible before sending the ipi. + */ + smp_mb(); + if (ipi) arch_send_call_function_single_ipi(cpu); @@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void) * Need to see other stores to list head for checking whether * list is empty without holding q->lock */ - smp_mb(); + smp_read_barrier_depends(); while (!list_empty(&q->list)) { unsigned int data_flags; @@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void) /* * See comment on outer loop */ - smp_mb(); + smp_read_barrier_depends(); } } @@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, list_add_tail_rcu(&data->csd.list, &call_function_queue); spin_unlock_irqrestore(&call_function_lock, flags); + /* + * Make the list addition visible before sending the ipi. + */ + smp_mb(); + /* Send a message to all CPUs in the map */ arch_send_call_function_ipi(mask); -- cgit v1.1 From 9c133c469d38043d5aadaa03f2fb840d88d1cf4f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 6 Nov 2008 08:42:48 +0100 Subject: Add round_jiffies_up and related routines This patch (as1158b) adds round_jiffies_up() and friends. These routines work like the analogous round_jiffies() functions, except that they will never round down. The new routines will be useful for timeouts where we don't care exactly when the timer expires, provided it doesn't expire too soon. Signed-off-by: Alan Stern Signed-off-by: Jens Axboe --- kernel/timer.c | 129 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 99 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 56becf3..dbd50fa 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base) tbase_get_deferrable(timer->base)); } -/** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) { int rem; unsigned long original = j; @@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu) * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. */ - if (rem < HZ/4) /* round down */ + if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; @@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu) return original; return j; } + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} EXPORT_SYMBOL_GPL(__round_jiffies); /** @@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies); */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { - /* - * In theory the following code can skip a jiffy in case jiffies - * increments right between the addition and the later subtraction. - * However since the entire point of this function is to use approximate - * timeouts, it's entirely ok to not handle that. - */ - return __round_jiffies(j + jiffies, cpu) - jiffies; + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); @@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); */ unsigned long round_jiffies(unsigned long j) { - return __round_jiffies(j, raw_smp_processor_id()); + return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); @@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_relative); +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) -- cgit v1.1 From 2d3854a37e8b767a51aba38ed6d22817b0631e33 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 5 Nov 2008 13:39:10 +1100 Subject: cpumask: introduce new API, without changing anything Impact: introduce new APIs We want to deprecate cpumasks on the stack, as we are headed for gynormous numbers of CPUs. Eventually, we want to head towards an undefined 'struct cpumask' so they can never be declared on stack. 1) New cpumask functions which take pointers instead of copies. (cpus_* -> cpumask_*) 2) Several new helpers to reduce requirements for temporary cpumasks (cpumask_first_and, cpumask_next_and, cpumask_any_and) 3) Helpers for declaring cpumasks on or offstack for large NR_CPUS (cpumask_var_t, alloc_cpumask_var and free_cpumask_var) 4) 'struct cpumask' for explicitness and to mark new-style code. 5) Make iterator functions stop at nr_cpu_ids (a runtime constant), not NR_CPUS for time efficiency and for smaller dynamic allocations in future. 6) cpumask_copy() so we can allocate less than a full cpumask eventually (for alloc_cpumask_var), and so we can eliminate the 'struct cpumask' definition eventually. 7) work_on_cpu() helper for doing task on a CPU, rather than saving old cpumask for current thread and manipulating it. 8) smp_call_function_many() which is smp_call_function_mask() except taking a cpumask pointer. Note that this patch simply introduces the new functions and leaves the obsolescent ones in place. This is to simplify the transition patches. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/cpu.c | 3 +++ kernel/workqueue.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 86d4904..5a732c5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); + +const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; +EXPORT_SYMBOL(cpu_all_bits); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a..d4dc69d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -970,6 +970,51 @@ undo: return ret; } +#ifdef CONFIG_SMP +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void do_work_for_cpu(struct work_struct *w) +{ + struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * This will return -EINVAL in the cpu is not online, or the return value + * of @fn otherwise. + */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc; + + INIT_WORK(&wfc.work, do_work_for_cpu); + wfc.fn = fn; + wfc.arg = arg; + get_online_cpus(); + if (unlikely(!cpu_online(cpu))) + wfc.ret = -EINVAL; + else { + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + } + put_online_cpus(); + + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + void __init init_workqueues(void) { cpu_populated_map = cpu_online_map; -- cgit v1.1 From 24eb089950ce44603b30a3145a2c8520e2b55bb1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 6 Nov 2008 12:53:32 -0800 Subject: cgroups: fix invalid cgrp->dentry before cgroup has been completely removed This fixes an oops when reading /proc/sched_debug. A cgroup won't be removed completely until finishing cgroup_diput(), so we shouldn't invalidate cgrp->dentry in cgroup_rmdir(). Otherwise, when a group is being removed while cgroup_path() gets called, we may trigger NULL dereference BUG. The bug can be reproduced: # cat test.sh #!/bin/sh mount -t cgroup -o cpu xxx /mnt for (( ; ; )) { mkdir /mnt/sub rmdir /mnt/sub } # ./test.sh & # cat /proc/sched_debug BUG: unable to handle kernel NULL pointer dereference at 00000038 IP: [] cgroup_path+0x39/0x90 ... Call Trace: [] ? print_cfs_rq+0x6e/0x75d [] ? sched_debug_show+0x72d/0xc1e ... Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Peter Zijlstra Cc: Ingo Molnar Cc: [2.6.26.x, 2.6.27.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35eebd5..358e775 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2497,7 +2497,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) list_del(&cgrp->sibling); spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); - cgrp->dentry = NULL; spin_unlock(&d->d_lock); cgroup_d_remove_dir(d); -- cgit v1.1 From f29c9b1ccb52904ee442a933cf3dee628f9f4e62 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 6 Nov 2008 09:45:16 +0800 Subject: sched: fix a bug in sched domain degenerate Impact: re-add incorrectly eliminated sched domain layers (1) on i386 with SCHED_SMT and SCHED_MC enabled # mount -t cgroup -o cpuset xxx /mnt # echo 0 > /mnt/cpuset.sched_load_balance # mkdir /mnt/0 # echo 0 > /mnt/0/cpuset.cpus # dmesg CPU0 attaching sched-domain: domain 0: span 0 level CPU groups: 0 (2) on i386 with SCHED_MC enabled but SCHED_SMT disabled # same with (1) # dmesg CPU0 attaching NULL sched-domain. The bug is that some sched domains may be skipped unintentionally when degenerating (optimizing) sched domains. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 82cc839..4c7e2bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6877,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; tmp = tmp->parent) { + for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; if (!parent) break; + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; - } + } else + tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { -- cgit v1.1 From ca3273f9646694e0419cfb9d6c12deb1c9aff27c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 14:47:21 +0800 Subject: sched: fix memory leak in a failure path Impact: fix rare memory leak in the sched-domains manual reconfiguration code In the failure path, rd is not attached to a sched domain, so it causes a leak. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4c7e2bc..57c933f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7676,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, error: free_sched_groups(cpu_map, tmpmask); SCHED_CPUMASK_FREE((void *)allmasks); + kfree(rd); return -ENOMEM; #endif } -- cgit v1.1 From 5ac5c4d604bf894ef672a7971d03fefdc7ea7e49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2008 10:46:32 +0100 Subject: sched: clean up debug info Impact: clean up and fix debug info printout While looking over the sched_debug code I noticed that we printed the rq schedstats for every cfs_rq, ammend this. Also change nr_spead_over into an int, and fix a little buglet in min_vruntime printing. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_debug.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 57c933f..f314924 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,7 +399,7 @@ struct cfs_rq { */ struct sched_entity *curr, *next, *last; - unsigned long nr_spread_over; + unsigned int nr_spread_over; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae1776..48ecc51 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; - min_vruntime = rq->cfs.min_vruntime; + min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", @@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - - P(yld_exp_empty); - P(yld_act_empty); - P(yld_both_empty); - P(yld_count); - P(sched_switch); - P(sched_count); - P(sched_goidle); - - P(ttwu_count); - P(ttwu_local); - - P(bkl_count); - -#undef P -#endif - SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #undef PN +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P +#endif print_cfs_stats(m, cpu); print_rt_stats(m, cpu); -- cgit v1.1 From ee5f80a993539490a07477ff2526bf62c503fbb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 11:06:00 +0100 Subject: irq: call __irq_enter() before calling the tick_idle_check Impact: avoid spurious ksoftirqd wakeups The tick idle check which is called from irq_enter() was run before the call to __irq_enter() which did not set the in_interrupt() bits in preempt_count. That way the raise of a softirq woke up softirqd for nothing as the softirq was handled on return from interrupt. Call __irq_enter() before calling into the tick idle check code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/softirq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 7110dae..e7c69a7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -269,10 +269,11 @@ void irq_enter(void) { int cpu = smp_processor_id(); - if (idle_cpu(cpu) && !in_interrupt()) + if (idle_cpu(cpu) && !in_interrupt()) { + __irq_enter(); tick_check_idle(cpu); - - __irq_enter(); + } else + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -- cgit v1.1 From ae99286b4f1be7788f2d6947c66a91dbd6351eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Nov 2008 13:20:23 +0100 Subject: nohz: disable tick_nohz_kick_tick() for now Impact: nohz powersavings and wakeup regression commit fb02fbc14d17837b4b7b02dbb36142c16a7bf208 (NOHZ: restart tick device from irq_enter()) causes a serious wakeup regression. While the patch is correct it does not take into account that spurious wakeups happen on x86. A fix for this issue is available, but we just revert to the .27 behaviour and let long running softirqs screw themself. Disable it for now. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5bbb104..342fc9c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void) */ static void tick_nohz_kick_tick(int cpu) { +#if 0 + /* Switch back to 2.6.27 behaviour */ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta, now; @@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu) return; tick_nohz_restart(ts, now); +#endif } #else -- cgit v1.1 From bf5e6519b85b3853f2d0bb4f17a4e2eaeffeb574 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 21:46:00 -0500 Subject: ftrace: disable tracing on resize Impact: fix for bug on resize This patch addresses the bug found here: http://bugzilla.kernel.org/show_bug.cgi?id=11996 When ftrace converted to the new unified trace buffer, the resizing of the buffer was not protected as much as it was originally. If tracing is performed while the resize occurs, then the buffer can be corrupted. This patch disables all ftrace buffer modifications before a resize takes place. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478..abfa810 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2676,7 +2676,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret; + int ret, cpu; struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) @@ -2704,6 +2704,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, goto out; } + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { @@ -2735,6 +2743,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) cnt = -ENOMEM; out: + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); -- cgit v1.1 From 4143c5cb36331155a1823af8b3a8c761a59fed71 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 21:46:01 -0500 Subject: ring-buffer: prevent infinite looping on time stamping Impact: removal of unnecessary looping The lockless part of the ring buffer allows for reentry into the code from interrupts. A timestamp is taken, a test is preformed and if it detects that an interrupt occurred that did tracing, it tries again. The problem arises if the timestamp code itself causes a trace. The detection will detect this and loop again. The difference between this and an interrupt doing tracing, is that this will fail every time, and cause an infinite loop. Currently, we test if the loop happens 1000 times, and if so, it will produce a warning and disable the ring buffer. The problem with this approach is that it makes it difficult to perform some types of tracing (tracing the timestamp code itself). Each trace entry has a delta timestamp from the previous entry. If a trace entry is reserved but and interrupt occurs and traces before the previous entry is commited, the delta timestamp for that entry will be zero. This actually makes sense in terms of tracing, because the interrupt entry happened before the preempted entry was commited, so one may consider the two happening at the same time. The order is still preserved in the buffer. With this idea, instead of trying to get a new timestamp if an interrupt made it in between the timestamp and the test, the entry could simply make the delta zero and continue. This will prevent interrupts or tracers in the timer code from causing the above loop. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3f33806..2f76193 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1060,7 +1060,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Did the write stamp get updated already? */ if (unlikely(ts < cpu_buffer->write_stamp)) - goto again; + delta = 0; if (test_time_stamp(delta)) { -- cgit v1.1 From ad474caca3e2a0550b7ce0706527ad5ab389a4d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 10 Nov 2008 15:39:30 +0100 Subject: fix for account_group_exec_runtime(), make sure ->signal can't be freed under rq->lock Impact: fix hang/crash on ia64 under high load This is ugly, but the simplest patch by far. Unlike other similar routines, account_group_exec_runtime() could be called "implicitly" from within scheduler after exit_notify(). This means we can race with the parent doing release_task(), we can't just check ->signal != NULL. Change __exit_signal() to do spin_unlock_wait(&task_rq(tsk)->lock) before __cleanup_signal() to make sure ->signal can't be freed under task_rq(tsk)->lock. Note that task_rq_unlock_wait() doesn't care about the case when tsk changes cpu/rq under us, this should be OK. Thanks to Ingo who nacked my previous buggy patch. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reported-by: Doug Chapman --- kernel/exit.c | 5 +++++ kernel/sched.c | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5..ae2b92b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -141,6 +141,11 @@ static void __exit_signal(struct task_struct *tsk) if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } diff --git a/kernel/sched.c b/kernel/sched.c index f314924..50a21f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -969,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void task_rq_unlock_wait(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ + spin_unlock_wait(&rq->lock); +} + static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { -- cgit v1.1 From 5d5254f0d3b9bebc47d97e357374c0ad0c291a7d Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sat, 25 Oct 2008 10:22:38 +0530 Subject: timers: handle HRTIMER_CB_IRQSAFE_UNLOCKED correctly from softirq context Impact: fix incorrect locking triggered during hotplug-intense stress-tests While migrating the the CB_IRQSAFE_UNLOCKED timers during a cpu-offline, we queue them on the cb_pending list, so that they won't go stale. Thus, when the callbacks of the timers run from the softirq context, they could run into potential deadlocks, since these callbacks assume that they're running with irq's disabled, thereby annoying lockdep! Fix this by emulating hardirq context while running these callbacks from the hrtimer softirq. ================================= [ INFO: inconsistent lock state ] 2.6.27 #2 -------------------------------- inconsistent {in-hardirq-W} -> {hardirq-on-W} usage. ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes: (&rq->lock){++..}, at: [] sched_rt_period_timer+0x9e/0x1fc {in-hardirq-W} state was registered at: [] __lock_acquire+0x549/0x121e [] native_sched_clock+0x88/0x99 [] clocksource_get_next+0x39/0x3f [] update_wall_time+0x616/0x7df [] lock_acquire+0x5a/0x74 [] scheduler_tick+0x3a/0x18d [] _spin_lock+0x1c/0x45 [] scheduler_tick+0x3a/0x18d [] scheduler_tick+0x3a/0x18d [] update_process_times+0x3a/0x44 [] tick_periodic+0x63/0x6d [] tick_handle_periodic+0x14/0x5e [] timer_interrupt+0x44/0x4a [] handle_IRQ_event+0x13/0x3d [] handle_level_irq+0x79/0xbd [] do_IRQ+0x69/0x7d [] common_interrupt+0x28/0x30 [] aac_probe_one+0x1a3/0x3f3 [] _spin_unlock_irqrestore+0x36/0x39 [] setup_irq+0x1be/0x1f9 [] start_kernel+0x259/0x2c5 [] 0xffffffff irq event stamp: 50102 hardirqs last enabled at (50102): [] _spin_unlock_irq+0x20/0x23 hardirqs last disabled at (50101): [] _spin_lock_irq+0xa/0x4b softirqs last enabled at (50088): [] do_softirq+0x37/0x4d softirqs last disabled at (50099): [] do_softirq+0x37/0x4d other info that might help us debug this: no locks held by ksoftirqd/0/4. stack backtrace: Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27 #2 [] print_usage_bug+0x13e/0x147 [] mark_lock+0x493/0x797 [] __lock_acquire+0x5be/0x121e [] lock_acquire+0x5a/0x74 [] sched_rt_period_timer+0x9e/0x1fc [] _spin_lock+0x1c/0x45 [] sched_rt_period_timer+0x9e/0x1fc [] sched_rt_period_timer+0x9e/0x1fc [] finish_task_switch+0x41/0xbd [] native_sched_clock+0x88/0x99 [] sched_rt_period_timer+0x0/0x1fc [] run_hrtimer_pending+0x54/0xe5 [] sched_rt_period_timer+0x0/0x1fc [] __do_softirq+0x7b/0xef [] do_softirq+0x37/0x4d [] ksoftirqd+0x56/0xc5 [] ksoftirqd+0x0/0xc5 [] kthread+0x38/0x5d [] kthread+0x0/0x5d [] kernel_thread_helper+0x7/0x10 ======================= Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Acked-by: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2b465df..95d3949 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1209,6 +1209,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; int restart; + int emulate_hardirq_ctx = 0; timer = list_entry(cpu_base->cb_pending.next, struct hrtimer, cb_entry); @@ -1217,10 +1218,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) timer_stats_account_hrtimer(timer); fn = timer->function; + /* + * A timer might have been added to the cb_pending list + * when it was migrated during a cpu-offline operation. + * Emulate hardirq context for such timers. + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) + emulate_hardirq_ctx = 1; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); spin_unlock_irq(&cpu_base->lock); - restart = fn(timer); + if (unlikely(emulate_hardirq_ctx)) { + local_irq_disable(); + restart = fn(timer); + local_irq_enable(); + } else + restart = fn(timer); spin_lock_irq(&cpu_base->lock); -- cgit v1.1 From 2002c69595a092518107f7e3c1294c9710bc92ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Nov 2008 11:52:33 +0100 Subject: sched: release buddies on yield Clear buddies on yield, so that the buddy rules don't schedule them despite them being placed right-most. This fixed a performance regression with yield-happy binary JVMs. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Tested-by: Lin Ming --- kernel/sched_fair.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 51aa3e1..98345e4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -716,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->last == se) + cfs_rq->last = NULL; + + if (cfs_rq->next == se) + cfs_rq->next = NULL; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -738,11 +747,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) #endif } - if (cfs_rq->last == se) - cfs_rq->last = NULL; - - if (cfs_rq->next == se) - cfs_rq->next = NULL; + clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); @@ -977,6 +982,8 @@ static void yield_task_fair(struct rq *rq) if (unlikely(cfs_rq->nr_running == 1)) return; + clear_buddies(cfs_rq, se); + if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* -- cgit v1.1