From 412d0bb553c0227191f1bfd06100f561600bff22 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 24 Dec 2008 01:43:25 +0100
Subject: tracing/function-graph-tracer: strip ending newlines on comments

Impact: tracer output improvement

Ending newlines are appended automatically on comments by the function
graph tracer because the newline needs to be placed after the "*/"
comment characters.

So if the user puts an ending newline, we want to strip it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4bf39fc..bc7d908 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -592,6 +592,12 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	if (ent->flags & TRACE_FLAG_CONT)
 		trace_seq_print_cont(s, iter);
 
+	/* Strip ending newline */
+	if (s->buffer[s->len - 1] == '\n') {
+		s->buffer[s->len - 1] = '\0';
+		s->len--;
+	}
+
 	ret = trace_seq_printf(s, " */\n");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.1


From c47956d9ae3341d2d1998bff26620fa3338c01e4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 23:24:11 -0500
Subject: ftrace: remove obsolete print continue functionality

Impact: cleanup, remove obsolete code

Now that the ring buffer used by ftrace allows for variable length
entries, we do not need the 'cont' feature of the buffer.  This code
makes other parts of ftrace more complex and by removing this it
simplifies the ftrace code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 | 61 ------------------------------------
 kernel/trace/trace.h                 |  7 -----
 kernel/trace/trace_functions_graph.c |  3 --
 kernel/trace/trace_mmiotrace.c       |  3 --
 4 files changed, 74 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f4bb380..fca0233 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1765,43 +1765,6 @@ static int task_state_char(unsigned long state)
 	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
 }
 
-/*
- * The message is supposed to contain an ending newline.
- * If the printing stops prematurely, try to add a newline of our own.
- */
-void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
-{
-	struct trace_entry *ent;
-	struct trace_field_cont *cont;
-	bool ok = true;
-
-	ent = peek_next_entry(iter, iter->cpu, NULL);
-	if (!ent || ent->type != TRACE_CONT) {
-		trace_seq_putc(s, '\n');
-		return;
-	}
-
-	do {
-		cont = (struct trace_field_cont *)ent;
-		if (ok)
-			ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
-
-		ftrace_disable_cpu();
-
-		if (iter->buffer_iter[iter->cpu])
-			ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
-		else
-			ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
-
-		ftrace_enable_cpu();
-
-		ent = peek_next_entry(iter, iter->cpu, NULL);
-	} while (ent && ent->type == TRACE_CONT);
-
-	if (!ok)
-		trace_seq_putc(s, '\n');
-}
-
 static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1834,9 +1797,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	int S, T;
 	int i;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	test_cpu_buff_start(iter);
 
 	next_entry = find_next_entry(iter, NULL, &next_ts);
@@ -1922,8 +1882,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 
 		seq_print_ip_sym(s, field->ip, sym_flags);
 		trace_seq_printf(s, ": %s", field->buf);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
 		break;
 	}
 	case TRACE_BRANCH: {
@@ -1968,9 +1926,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	test_cpu_buff_start(iter);
 
 	comm = trace_find_cmdline(iter->ent->pid);
@@ -2076,8 +2031,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 
 		seq_print_ip_sym(s, field->ip, sym_flags);
 		trace_seq_printf(s, ": %s", field->buf);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
 		break;
 	}
 	case TRACE_GRAPH_RET: {
@@ -2124,9 +2077,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	ret = trace_seq_printf(s, "%d %d %llu ",
 		entry->pid, iter->cpu, iter->ts);
 	if (!ret)
@@ -2187,8 +2137,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 
 		trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
 		break;
 	}
 	}
@@ -2217,9 +2165,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
 	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
 	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
@@ -2283,9 +2228,6 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (entry->flags & TRACE_FLAG_CONT)
-		trace_seq_print_cont(s, iter);
-
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -2296,9 +2238,6 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	SEQ_PUT_FIELD_RET(s, entry->pid);
 	SEQ_PUT_FIELD_RET(s, entry->cpu);
 	SEQ_PUT_FIELD_RET(s, iter->ts);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f8..3a35738 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -16,7 +16,6 @@ enum trace_type {
 	TRACE_FN,
 	TRACE_CTX,
 	TRACE_WAKE,
-	TRACE_CONT,
 	TRACE_STACK,
 	TRACE_PRINT,
 	TRACE_SPECIAL,
@@ -178,7 +177,6 @@ struct trace_power {
  *  NEED_RESCED		- reschedule is requested
  *  HARDIRQ		- inside an interrupt handler
  *  SOFTIRQ		- inside a softirq handler
- *  CONT		- multiple entries hold the trace item
  */
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
@@ -186,7 +184,6 @@ enum trace_flag_type {
 	TRACE_FLAG_NEED_RESCHED		= 0x04,
 	TRACE_FLAG_HARDIRQ		= 0x08,
 	TRACE_FLAG_SOFTIRQ		= 0x10,
-	TRACE_FLAG_CONT			= 0x20,
 };
 
 #define TRACE_BUF_SIZE		1024
@@ -262,7 +259,6 @@ extern void __ftrace_bad_type(void);
 	do {								\
 		IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN);	\
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
-		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
@@ -489,9 +485,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 
 extern void *head_page(struct trace_array_cpu *data);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
-extern void trace_seq_print_cont(struct trace_seq *s,
-				 struct trace_iterator *iter);
-
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index bc7d908..f261966 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -589,9 +589,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (ent->flags & TRACE_FLAG_CONT)
-		trace_seq_print_cont(s, iter);
-
 	/* Strip ending newline */
 	if (s->buffer[s->len - 1] == '\n') {
 		s->buffer[s->len - 1] = '\0';
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb06..83f20ae 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -262,9 +262,6 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (entry->flags & TRACE_FLAG_CONT)
-		trace_seq_print_cont(s, iter);
-
 	return TRACE_TYPE_HANDLED;
 }
 
-- 
cgit v1.1


From f0868d1e23a8efec33beb3aa688aab7fdb1ae093 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 23:24:12 -0500
Subject: ftrace: set up trace event hash infrastructure

Impact: simplify/generalize/refactor trace.c

The trace.c file is becoming more difficult to maintain due to the
growing number of events. There is several formats that an event may
be printed. This patch sets up the infrastructure of an event hash to
allow for events to register how they should be printed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile                |   1 +
 kernel/trace/trace.c                 | 275 +-------------------------
 kernel/trace/trace.h                 |   8 +-
 kernel/trace/trace_boot.c            |   1 +
 kernel/trace/trace_functions_graph.c |   1 +
 kernel/trace/trace_hw_branches.c     |   1 +
 kernel/trace/trace_mmiotrace.c       |   1 +
 kernel/trace/trace_output.c          | 365 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h          |  43 +++++
 kernel/trace/trace_power.c           |   1 +
 10 files changed, 416 insertions(+), 281 deletions(-)
 create mode 100644 kernel/trace/trace_output.c
 create mode 100644 kernel/trace/trace_output.h

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a9..549f93c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fca0233..90ce0c1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -38,6 +38,7 @@
 #include <linux/irqflags.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 #define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE)
 
@@ -330,132 +331,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(current);
 }
 
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
-	int len = (PAGE_SIZE - 1) - s->len;
-	va_list ap;
-	int ret;
-
-	if (!len)
-		return 0;
-
-	va_start(ap, fmt);
-	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
-	va_end(ap);
-
-	/* If we can't write it all, don't bother writing anything */
-	if (ret >= len)
-		return 0;
-
-	s->len += ret;
-
-	return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-static int
-trace_seq_puts(struct trace_seq *s, const char *str)
-{
-	int len = strlen(str);
-
-	if (len > ((PAGE_SIZE - 1) - s->len))
-		return 0;
-
-	memcpy(s->buffer + s->len, str, len);
-	s->len += len;
-
-	return len;
-}
-
-static int
-trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
-	if (s->len >= (PAGE_SIZE - 1))
-		return 0;
-
-	s->buffer[s->len++] = c;
-
-	return 1;
-}
-
-static int
-trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
-{
-	if (len > ((PAGE_SIZE - 1) - s->len))
-		return 0;
-
-	memcpy(s->buffer + s->len, mem, len);
-	s->len += len;
-
-	return len;
-}
-
-#define MAX_MEMHEX_BYTES	8
-#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-
-static int
-trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
-{
-	unsigned char hex[HEX_CHARS];
-	unsigned char *data = mem;
-	int i, j;
-
-#ifdef __BIG_ENDIAN
-	for (i = 0, j = 0; i < len; i++) {
-#else
-	for (i = len-1, j = 0; i >= 0; i--) {
-#endif
-		hex[j++] = hex_asc_hi(data[i]);
-		hex[j++] = hex_asc_lo(data[i]);
-	}
-	hex[j++] = ' ';
-
-	return trace_seq_putmem(s, hex, j);
-}
-
-static int
-trace_seq_path(struct trace_seq *s, struct path *path)
-{
-	unsigned char *p;
-
-	if (s->len >= (PAGE_SIZE - 1))
-		return 0;
-	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
-	if (!IS_ERR(p)) {
-		p = mangle_path(s->buffer + s->len, p, "\n");
-		if (p) {
-			s->len = p - s->buffer;
-			return 1;
-		}
-	} else {
-		s->buffer[s->len++] = '?';
-		return 1;
-	}
-
-	return 0;
-}
-
 static void
 trace_seq_reset(struct trace_seq *s)
 {
@@ -1473,154 +1348,6 @@ static void s_stop(struct seq_file *m, void *p)
 	mutex_unlock(&trace_types_lock);
 }
 
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
-{
-	static const char tramp_name[] = "kretprobe_trampoline";
-	int size = sizeof(tramp_name);
-
-	if (strncmp(tramp_name, name, size) == 0)
-		return "[unknown/kretprobe'd]";
-	return name;
-}
-#else
-static inline const char *kretprobed(const char *name)
-{
-	return name;
-}
-#endif /* CONFIG_KRETPROBES */
-
-static int
-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
-	char str[KSYM_SYMBOL_LEN];
-	const char *name;
-
-	kallsyms_lookup(address, NULL, NULL, NULL, str);
-
-	name = kretprobed(str);
-
-	return trace_seq_printf(s, fmt, name);
-#endif
-	return 1;
-}
-
-static int
-seq_print_sym_offset(struct trace_seq *s, const char *fmt,
-		     unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
-	char str[KSYM_SYMBOL_LEN];
-	const char *name;
-
-	sprint_symbol(str, address);
-	name = kretprobed(str);
-
-	return trace_seq_printf(s, fmt, name);
-#endif
-	return 1;
-}
-
-#ifndef CONFIG_64BIT
-# define IP_FMT "%08lx"
-#else
-# define IP_FMT "%016lx"
-#endif
-
-int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
-{
-	int ret;
-
-	if (!ip)
-		return trace_seq_printf(s, "0");
-
-	if (sym_flags & TRACE_ITER_SYM_OFFSET)
-		ret = seq_print_sym_offset(s, "%s", ip);
-	else
-		ret = seq_print_sym_short(s, "%s", ip);
-
-	if (!ret)
-		return 0;
-
-	if (sym_flags & TRACE_ITER_SYM_ADDR)
-		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
-	return ret;
-}
-
-static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-				    unsigned long ip, unsigned long sym_flags)
-{
-	struct file *file = NULL;
-	unsigned long vmstart = 0;
-	int ret = 1;
-
-	if (mm) {
-		const struct vm_area_struct *vma;
-
-		down_read(&mm->mmap_sem);
-		vma = find_vma(mm, ip);
-		if (vma) {
-			file = vma->vm_file;
-			vmstart = vma->vm_start;
-		}
-		if (file) {
-			ret = trace_seq_path(s, &file->f_path);
-			if (ret)
-				ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
-		}
-		up_read(&mm->mmap_sem);
-	}
-	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
-		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
-	return ret;
-}
-
-static int
-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
-		      unsigned long sym_flags)
-{
-	struct mm_struct *mm = NULL;
-	int ret = 1;
-	unsigned int i;
-
-	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
-		struct task_struct *task;
-		/*
-		 * we do the lookup on the thread group leader,
-		 * since individual threads might have already quit!
-		 */
-		rcu_read_lock();
-		task = find_task_by_vpid(entry->ent.tgid);
-		if (task)
-			mm = get_task_mm(task);
-		rcu_read_unlock();
-	}
-
-	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		unsigned long ip = entry->caller[i];
-
-		if (ip == ULONG_MAX || !ret)
-			break;
-		if (i && ret)
-			ret = trace_seq_puts(s, " <- ");
-		if (!ip) {
-			if (ret)
-				ret = trace_seq_puts(s, "??");
-			continue;
-		}
-		if (!ret)
-			break;
-		if (ret)
-			ret = seq_print_user_ip(s, mm, ip, sym_flags);
-	}
-
-	if (mm)
-		mmput(mm);
-	return ret;
-}
-
 static void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                  _------=> CPU#            \n");
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3a35738..6bd71fa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -30,7 +30,7 @@ enum trace_type {
 	TRACE_HW_BRANCHES,
 	TRACE_POWER,
 
-	__TRACE_LAST_TYPE
+	__TRACE_LAST_TYPE,
 };
 
 /*
@@ -484,12 +484,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
-extern int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
-		unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-				 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
 extern int
 trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3ccebde..cb2ff3e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -11,6 +11,7 @@
 #include <linux/kallsyms.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 static struct trace_array *boot_trace;
 static bool pre_initcalls_finished;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f261966..f8ac541 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 #define TRACE_GRAPH_INDENT	2
 
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index b6a3e20..879752b 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -14,6 +14,7 @@
 #include <asm/ds.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 
 #define SIZEOF_BTS (1 << 13)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 83f20ae..fcec59f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 struct header_iter {
 	struct pci_dev *dev;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 0000000..1f3f800
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,365 @@
+/*
+ * trace_output.c
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_output.h"
+
+/* must be a power of 2 */
+#define EVENT_HASHSIZE	128
+
+static DEFINE_MUTEX(trace_event_mutex);
+static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+
+static int next_event_type = __TRACE_LAST_TYPE + 1;
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	va_list ap;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	va_end(ap);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	int len = strlen(str);
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, str, len);
+	s->len += len;
+
+	return len;
+}
+
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+
+	s->buffer[s->len++] = c;
+
+	return 1;
+}
+
+int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, mem, len);
+	s->len += len;
+
+	return len;
+}
+
+int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+	unsigned char hex[HEX_CHARS];
+	unsigned char *data = mem;
+	int i, j;
+
+#ifdef __BIG_ENDIAN
+	for (i = 0, j = 0; i < len; i++) {
+#else
+	for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+		hex[j++] = hex_asc_hi(data[i]);
+		hex[j++] = hex_asc_lo(data[i]);
+	}
+	hex[j++] = ' ';
+
+	return trace_seq_putmem(s, hex, j);
+}
+
+int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+	unsigned char *p;
+
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+	if (!IS_ERR(p)) {
+		p = mangle_path(s->buffer + s->len, p, "\n");
+		if (p) {
+			s->len = p - s->buffer;
+			return 1;
+		}
+	} else {
+		s->buffer[s->len++] = '?';
+		return 1;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_KRETPROBES
+static inline const char *kretprobed(const char *name)
+{
+	static const char tramp_name[] = "kretprobe_trampoline";
+	int size = sizeof(tramp_name);
+
+	if (strncmp(tramp_name, name, size) == 0)
+		return "[unknown/kretprobe'd]";
+	return name;
+}
+#else
+static inline const char *kretprobed(const char *name)
+{
+	return name;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+	const char *name;
+
+	kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+	name = kretprobed(str);
+
+	return trace_seq_printf(s, fmt, name);
+#endif
+	return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+		     unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+	const char *name;
+
+	sprint_symbol(str, address);
+	name = kretprobed(str);
+
+	return trace_seq_printf(s, fmt, name);
+#endif
+	return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+		      unsigned long ip, unsigned long sym_flags)
+{
+	struct file *file = NULL;
+	unsigned long vmstart = 0;
+	int ret = 1;
+
+	if (mm) {
+		const struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, ip);
+		if (vma) {
+			file = vma->vm_file;
+			vmstart = vma->vm_start;
+		}
+		if (file) {
+			ret = trace_seq_path(s, &file->f_path);
+			if (ret)
+				ret = trace_seq_printf(s, "[+0x%lx]",
+						       ip - vmstart);
+		}
+		up_read(&mm->mmap_sem);
+	}
+	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+		      unsigned long sym_flags)
+{
+	struct mm_struct *mm = NULL;
+	int ret = 1;
+	unsigned int i;
+
+	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+		struct task_struct *task;
+		/*
+		 * we do the lookup on the thread group leader,
+		 * since individual threads might have already quit!
+		 */
+		rcu_read_lock();
+		task = find_task_by_vpid(entry->ent.tgid);
+		if (task)
+			mm = get_task_mm(task);
+		rcu_read_unlock();
+	}
+
+	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		unsigned long ip = entry->caller[i];
+
+		if (ip == ULONG_MAX || !ret)
+			break;
+		if (i && ret)
+			ret = trace_seq_puts(s, " <- ");
+		if (!ip) {
+			if (ret)
+				ret = trace_seq_puts(s, "??");
+			continue;
+		}
+		if (!ret)
+			break;
+		if (ret)
+			ret = seq_print_user_ip(s, mm, ip, sym_flags);
+	}
+
+	if (mm)
+		mmput(mm);
+	return ret;
+}
+
+int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+	int ret;
+
+	if (!ip)
+		return trace_seq_printf(s, "0");
+
+	if (sym_flags & TRACE_ITER_SYM_OFFSET)
+		ret = seq_print_sym_offset(s, "%s", ip);
+	else
+		ret = seq_print_sym_short(s, "%s", ip);
+
+	if (!ret)
+		return 0;
+
+	if (sym_flags & TRACE_ITER_SYM_ADDR)
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+/**
+ * ftrace_find_event - find a registered event
+ * @type: the type of event to look for
+ *
+ * Returns an event of type @type otherwise NULL
+ */
+struct trace_event *ftrace_find_event(int type)
+{
+	struct trace_event *event;
+	struct hlist_node *n;
+	unsigned key;
+
+	key = type & (EVENT_HASHSIZE - 1);
+
+	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+		if (event->type == type)
+			return event;
+	}
+
+	return NULL;
+}
+
+/**
+ * register_ftrace_event - register output for an event type
+ * @event: the event type to register
+ *
+ * Event types are stored in a hash and this hash is used to
+ * find a way to print an event. If the @event->type is set
+ * then it will use that type, otherwise it will assign a
+ * type to use.
+ *
+ * If you assign your own type, please make sure it is added
+ * to the trace_type enum in trace.h, to avoid collisions
+ * with the dynamic types.
+ *
+ * Returns the event type number or zero on error.
+ */
+int register_ftrace_event(struct trace_event *event)
+{
+	unsigned key;
+	int ret = 0;
+
+	mutex_lock(&trace_event_mutex);
+
+	if (!event->type)
+		event->type = next_event_type++;
+	else if (event->type > __TRACE_LAST_TYPE) {
+		printk(KERN_WARNING "Need to add type to trace.h\n");
+		WARN_ON(1);
+	}
+
+	if (ftrace_find_event(event->type))
+		goto out;
+
+	key = event->type & (EVENT_HASHSIZE - 1);
+
+	hlist_add_head_rcu(&event->node, &event_hash[key]);
+
+	ret = event->type;
+ out:
+	mutex_unlock(&trace_event_mutex);
+
+	return ret;
+}
+
+/**
+ * unregister_ftrace_event - remove a no longer used event
+ * @event: the event to remove
+ */
+int unregister_ftrace_event(struct trace_event *event)
+{
+	mutex_lock(&trace_event_mutex);
+	hlist_del(&event->node);
+	mutex_unlock(&trace_event_mutex);
+
+	return 0;
+}
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 0000000..1fcc76e
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,43 @@
+#ifndef __TRACE_EVENTS_H
+#define __TRACE_EVENTS_H
+
+#include "trace.h"
+
+typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry,
+				int flags);
+
+struct trace_event {
+	struct hlist_node	node;
+	int			type;
+	trace_print_func	trace;
+	trace_print_func	latency_trace;
+	trace_print_func	raw;
+	trace_print_func	hex;
+	trace_print_func	binary;
+};
+
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+		unsigned long sym_flags);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+int trace_seq_puts(struct trace_seq *s, const char *str);
+int trace_seq_putc(struct trace_seq *s, unsigned char c);
+int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
+int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+int trace_seq_path(struct trace_seq *s, struct path *path);
+int seq_print_userip_objs(const struct userstack_entry *entry,
+			  struct trace_seq *s, unsigned long sym_flags);
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+		      unsigned long ip, unsigned long sym_flags);
+
+struct trace_event *ftrace_find_event(int type);
+int register_ftrace_event(struct trace_event *event);
+int unregister_ftrace_event(struct trace_event *event);
+
+#define MAX_MEMHEX_BYTES	8
+#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
+
+#endif
+
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a3..b9b13c3 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 static struct trace_array *power_trace;
 static int __read_mostly trace_power_enabled;
-- 
cgit v1.1


From f633cef0200bbaec539e2dbb0bc4bed7f022f98b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 23:24:13 -0500
Subject: ftrace: change trace.c to use registered events

Impact: rework trace.c to use new event register API

Almost every ftrace event has to implement its output display in
trace.c through a different function. Some events did not handle
all the formats (trace, latency-trace, raw, hex, binary), and
this method does not scale well.

This patch converts the format functions to use the event API to
find the event and and print its format. Currently, we have
a print function for trace, latency_trace, raw, hex and binary.
A trace_nop_print is available if the event wants to avoid output
on a particular format.

Perhaps other tracers could use this in the future (like mmiotrace and
function_graph).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 402 ++++----------------------------------
 kernel/trace/trace_branch.c |  53 +++++
 kernel/trace/trace_output.c | 467 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h |  16 ++
 4 files changed, 574 insertions(+), 364 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 90ce0c1..3f03175 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1483,15 +1483,6 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
 		trace_seq_puts(s, " : ");
 }
 
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
-	int bit = state ? __ffs(state) + 1 : 0;
-
-	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
 static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1515,14 +1506,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *next_entry;
+	struct trace_event *event;
 	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
 	struct trace_entry *entry = iter->ent;
 	unsigned long abs_usecs;
 	unsigned long rel_usecs;
 	u64 next_ts;
 	char *comm;
-	int S, T;
-	int i;
+	int ret;
 
 	test_cpu_buff_start(iter);
 
@@ -1547,94 +1538,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		lat_print_generic(s, entry, cpu);
 		lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		seq_print_ip_sym(s, field->ip, sym_flags);
-		trace_seq_puts(s, " (");
-		seq_print_ip_sym(s, field->parent_ip, sym_flags);
-		trace_seq_puts(s, ")\n");
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = task_state_char(field->prev_state);
-		comm = trace_find_cmdline(field->next_pid);
-		trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
-				 field->prev_pid,
-				 field->prev_prio,
-				 S, entry->type == TRACE_CTX ? "==>" : "  +",
-				 field->next_cpu,
-				 field->next_pid,
-				 field->next_prio,
-				 T, comm);
-		break;
-	}
-	case TRACE_SPECIAL: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		trace_seq_printf(s, "# %ld %ld %ld\n",
-				 field->arg1,
-				 field->arg2,
-				 field->arg3);
-		break;
-	}
-	case TRACE_STACK: {
-		struct stack_entry *field;
-
-		trace_assign_type(field, entry);
-
-		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-			if (i)
-				trace_seq_puts(s, " <= ");
-			seq_print_ip_sym(s, field->caller[i], sym_flags);
-		}
-		trace_seq_puts(s, "\n");
-		break;
-	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
-
-		trace_assign_type(field, entry);
 
-		seq_print_ip_sym(s, field->ip, sym_flags);
-		trace_seq_printf(s, ": %s", field->buf);
-		break;
-	}
-	case TRACE_BRANCH: {
-		struct trace_branch *field;
-
-		trace_assign_type(field, entry);
-
-		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? "  ok  " : " MISS ",
-				 field->func,
-				 field->file,
-				 field->line);
-		break;
+	event = ftrace_find_event(entry->type);
+	if (event && event->latency_trace) {
+		ret = event->latency_trace(s, entry, sym_flags);
+		if (ret)
+			return ret;
+		return TRACE_TYPE_HANDLED;
 	}
-	case TRACE_USER_STACK: {
-		struct userstack_entry *field;
 
-		trace_assign_type(field, entry);
-
-		seq_print_userip_objs(field, s, sym_flags);
-		trace_seq_putc(s, '\n');
-		break;
-	}
-	default:
-		trace_seq_printf(s, "Unknown type %d\n", entry->type);
-	}
+	trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -1643,13 +1556,12 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
+	struct trace_event *event;
 	unsigned long usec_rem;
 	unsigned long long t;
 	unsigned long secs;
 	char *comm;
 	int ret;
-	int S, T;
-	int i;
 
 	entry = iter->ent;
 
@@ -1671,127 +1583,17 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = seq_print_ip_sym(s, field->ip, sym_flags);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
-						field->parent_ip) {
-			ret = trace_seq_printf(s, " <-");
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-			ret = seq_print_ip_sym(s,
-					       field->parent_ip,
-					       sym_flags);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-		ret = trace_seq_printf(s, "\n");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = task_state_char(field->prev_state);
-		ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
-				       field->prev_pid,
-				       field->prev_prio,
-				       S,
-				       entry->type == TRACE_CTX ? "==>" : "  +",
-				       field->next_cpu,
-				       field->next_pid,
-				       field->next_prio,
-				       T);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_SPECIAL: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
-				 field->arg1,
-				 field->arg2,
-				 field->arg3);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_STACK: {
-		struct stack_entry *field;
-
-		trace_assign_type(field, entry);
-
-		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-			if (i) {
-				ret = trace_seq_puts(s, " <= ");
-				if (!ret)
-					return TRACE_TYPE_PARTIAL_LINE;
-			}
-			ret = seq_print_ip_sym(s, field->caller[i],
-					       sym_flags);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-		ret = trace_seq_puts(s, "\n");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
-
-		trace_assign_type(field, entry);
-
-		seq_print_ip_sym(s, field->ip, sym_flags);
-		trace_seq_printf(s, ": %s", field->buf);
-		break;
-	}
-	case TRACE_GRAPH_RET: {
-		return print_graph_function(iter);
-	}
-	case TRACE_GRAPH_ENT: {
-		return print_graph_function(iter);
-	}
-	case TRACE_BRANCH: {
-		struct trace_branch *field;
-
-		trace_assign_type(field, entry);
-
-		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? "  ok  " : " MISS ",
-				 field->func,
-				 field->file,
-				 field->line);
-		break;
+	event = ftrace_find_event(entry->type);
+	if (event && event->trace) {
+		ret = event->trace(s, entry, sym_flags);
+		if (ret)
+			return ret;
+		return TRACE_TYPE_HANDLED;
 	}
-	case TRACE_USER_STACK: {
-		struct userstack_entry *field;
-
-		trace_assign_type(field, entry);
+	ret = trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = seq_print_userip_objs(field, s, sym_flags);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		ret = trace_seq_putc(s, '\n');
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	}
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -1799,8 +1601,8 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
+	struct trace_event *event;
 	int ret;
-	int S, T;
 
 	entry = iter->ent;
 
@@ -1809,86 +1611,26 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = trace_seq_printf(s, "%x %x\n",
-					field->ip,
-					field->parent_ip);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = entry->type == TRACE_WAKE ? '+' :
-			task_state_char(field->prev_state);
-		ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
-				       field->prev_pid,
-				       field->prev_prio,
-				       S,
-				       field->next_cpu,
-				       field->next_pid,
-				       field->next_prio,
-				       T);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_SPECIAL:
-	case TRACE_USER_STACK:
-	case TRACE_STACK: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
-				 field->arg1,
-				 field->arg2,
-				 field->arg3);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
+	event = ftrace_find_event(entry->type);
+	if (event && event->raw) {
+		ret = event->raw(s, entry, 0);
+		if (ret)
+			return ret;
+		return TRACE_TYPE_HANDLED;
 	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
-
-		trace_assign_type(field, entry);
+	ret = trace_seq_printf(s, "%d ?\n", entry->type);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
-		break;
-	}
-	}
 	return TRACE_TYPE_HANDLED;
 }
 
-#define SEQ_PUT_FIELD_RET(s, x)				\
-do {							\
-	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
-		return 0;				\
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
-do {							\
-	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\
-	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
-		return 0;				\
-} while (0)
-
 static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned char newline = '\n';
 	struct trace_entry *entry;
-	int S, T;
+	struct trace_event *event;
 
 	entry = iter->ent;
 
@@ -1896,47 +1638,10 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
 	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		SEQ_PUT_HEX_FIELD_RET(s, field->ip);
-		SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = entry->type == TRACE_WAKE ? '+' :
-			task_state_char(field->prev_state);
-		SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
-		SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
-		SEQ_PUT_HEX_FIELD_RET(s, S);
-		SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
-		SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
-		SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
-		SEQ_PUT_HEX_FIELD_RET(s, T);
-		break;
-	}
-	case TRACE_SPECIAL:
-	case TRACE_USER_STACK:
-	case TRACE_STACK: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
+	event = ftrace_find_event(entry->type);
+	if (event && event->hex)
+		event->hex(s, entry, 0);
 
-		SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
-		SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
-		SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
-		break;
-	}
-	}
 	SEQ_PUT_FIELD_RET(s, newline);
 
 	return TRACE_TYPE_HANDLED;
@@ -1962,6 +1667,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
+	struct trace_event *event;
 
 	entry = iter->ent;
 
@@ -1969,43 +1675,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 	SEQ_PUT_FIELD_RET(s, entry->cpu);
 	SEQ_PUT_FIELD_RET(s, iter->ts);
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
+	event = ftrace_find_event(entry->type);
+	if (event && event->binary)
+		event->binary(s, entry, 0);
 
-		SEQ_PUT_FIELD_RET(s, field->ip);
-		SEQ_PUT_FIELD_RET(s, field->parent_ip);
-		break;
-	}
-	case TRACE_CTX: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		SEQ_PUT_FIELD_RET(s, field->prev_pid);
-		SEQ_PUT_FIELD_RET(s, field->prev_prio);
-		SEQ_PUT_FIELD_RET(s, field->prev_state);
-		SEQ_PUT_FIELD_RET(s, field->next_pid);
-		SEQ_PUT_FIELD_RET(s, field->next_prio);
-		SEQ_PUT_FIELD_RET(s, field->next_state);
-		break;
-	}
-	case TRACE_SPECIAL:
-	case TRACE_USER_STACK:
-	case TRACE_STACK: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		SEQ_PUT_FIELD_RET(s, field->arg1);
-		SEQ_PUT_FIELD_RET(s, field->arg2);
-		SEQ_PUT_FIELD_RET(s, field->arg3);
-		break;
-	}
-	}
-	return 1;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_empty(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb..c15222a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,7 +14,9 @@
 #include <linux/hash.h>
 #include <linux/fs.h>
 #include <asm/local.h>
+
 #include "trace.h"
+#include "trace_output.h"
 
 #ifdef CONFIG_BRANCH_TRACER
 
@@ -142,6 +144,49 @@ static void branch_trace_reset(struct trace_array *tr)
 	stop_branch_trace(tr);
 }
 
+static int
+trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (trace_seq_printf(s, ": %s", field->buf))
+		goto partial;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct trace_branch *field;
+
+	trace_assign_type(field, entry);
+
+	if (trace_seq_printf(s, "[%s] %s:%s:%d\n",
+			     field->correct ? "  ok  " : " MISS ",
+			     field->func,
+			     field->file,
+			     field->line))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static struct trace_event trace_branch_event = {
+	.type	 	= TRACE_BRANCH,
+	.trace		= trace_branch_print,
+	.latency_trace	= trace_branch_print,
+	.raw		= trace_nop_print,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
 struct tracer branch_trace __read_mostly =
 {
 	.name		= "branch",
@@ -154,6 +199,14 @@ struct tracer branch_trace __read_mostly =
 
 __init static int init_branch_trace(void)
 {
+	int ret;
+
+	ret = register_ftrace_event(&trace_branch_event);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register branch events\n");
+		return 1;
+	}
+
 	return register_tracer(&branch_trace);
 }
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1f3f800..df0c25c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -286,6 +286,15 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int task_state_char(unsigned long state)
+{
+	int bit = state ? __ffs(state) + 1 : 0;
+
+	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
 /**
  * ftrace_find_event - find a registered event
  * @type: the type of event to look for
@@ -363,3 +372,461 @@ int unregister_ftrace_event(struct trace_event *event)
 
 	return 0;
 }
+
+/*
+ * Standard events
+ */
+
+int
+trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return 0;
+}
+
+/* TRACE_FN */
+static int
+trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+	if (!trace_seq_puts(s, " ("))
+		goto partial;
+	if (!seq_print_ip_sym(s, field->parent_ip, flags))
+		goto partial;
+	if (!trace_seq_puts(s, ")\n"))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
+		if (!trace_seq_printf(s, " <-"))
+			goto partial;
+		if (!seq_print_ip_sym(s,
+				      field->parent_ip,
+				      flags))
+			goto partial;
+	}
+	if (!trace_seq_printf(s, "\n"))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (trace_seq_printf(s, "%x %x\n",
+			     field->ip,
+			     field->parent_ip))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_HEX_FIELD_RET(s, field->ip);
+	SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+
+	return 0;
+}
+
+static int
+trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_FIELD_RET(s, field->ip);
+	SEQ_PUT_FIELD_RET(s, field->parent_ip);
+
+	return 0;
+}
+
+static struct trace_event trace_fn_event = {
+	.type	 	= TRACE_FN,
+	.trace		= trace_fn_trace,
+	.latency_trace	= trace_fn_latency,
+	.raw		= trace_fn_raw,
+	.hex		= trace_fn_hex,
+	.binary		= trace_fn_bin,
+};
+
+/* TRACE_CTX an TRACE_WAKE */
+static int
+trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
+		    char *delim)
+{
+	struct ctx_switch_entry *field;
+	char *comm;
+	int S, T;
+
+	trace_assign_type(field, entry);
+
+	T = task_state_char(field->next_state);
+	S = task_state_char(field->prev_state);
+	comm = trace_find_cmdline(field->next_pid);
+	if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+			     field->prev_pid,
+			     field->prev_prio,
+			     S, delim,
+			     field->next_cpu,
+			     field->next_pid,
+			     field->next_prio,
+			     T, comm))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_print(s, entry, flags, "==>");
+}
+
+static int
+trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_print(s, entry, flags, "  +");
+}
+
+static int
+trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
+		  char S)
+{
+	struct ctx_switch_entry *field;
+	int T;
+
+	trace_assign_type(field, entry);
+
+	if (!S)
+		task_state_char(field->prev_state);
+	T = task_state_char(field->next_state);
+	if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+			     field->prev_pid,
+			     field->prev_prio,
+			     S,
+			     field->next_cpu,
+			     field->next_pid,
+			     field->next_prio,
+			     T))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_raw(s, entry, flags, 0);
+}
+
+static int
+trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_raw(s, entry, flags, '+');
+}
+
+
+static int
+trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
+		  char S)
+{
+	struct ctx_switch_entry *field;
+	int T;
+
+	trace_assign_type(field, entry);
+
+	if (!S)
+		task_state_char(field->prev_state);
+	T = task_state_char(field->next_state);
+
+	SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
+	SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
+	SEQ_PUT_HEX_FIELD_RET(s, S);
+	SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
+	SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
+	SEQ_PUT_HEX_FIELD_RET(s, T);
+
+	return 0;
+}
+
+static int
+trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_hex(s, entry, flags, 0);
+}
+
+static int
+trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_hex(s, entry, flags, '+');
+}
+
+static int
+trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ctx_switch_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_FIELD_RET(s, field->prev_pid);
+	SEQ_PUT_FIELD_RET(s, field->prev_prio);
+	SEQ_PUT_FIELD_RET(s, field->prev_state);
+	SEQ_PUT_FIELD_RET(s, field->next_pid);
+	SEQ_PUT_FIELD_RET(s, field->next_prio);
+	SEQ_PUT_FIELD_RET(s, field->next_state);
+
+	return 0;
+}
+
+static struct trace_event trace_ctx_event = {
+	.type	 	= TRACE_CTX,
+	.trace		= trace_ctx_print,
+	.latency_trace	= trace_ctx_print,
+	.raw		= trace_ctx_raw,
+	.hex		= trace_ctx_hex,
+	.binary		= trace_ctxwake_bin,
+};
+
+static struct trace_event trace_wake_event = {
+	.type	 	= TRACE_WAKE,
+	.trace		= trace_wake_print,
+	.latency_trace	= trace_wake_print,
+	.raw		= trace_wake_raw,
+	.hex		= trace_wake_hex,
+	.binary		= trace_ctxwake_bin,
+};
+
+/* TRACE_SPECIAL */
+static int
+trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct special_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (trace_seq_printf(s, "# %ld %ld %ld\n",
+			     field->arg1,
+			     field->arg2,
+			     field->arg3))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct special_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
+	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
+	SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
+
+	return 0;
+}
+
+static int
+trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct special_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_FIELD_RET(s, field->arg1);
+	SEQ_PUT_FIELD_RET(s, field->arg2);
+	SEQ_PUT_FIELD_RET(s, field->arg3);
+
+	return 0;
+}
+
+static struct trace_event trace_special_event = {
+	.type	 	= TRACE_SPECIAL,
+	.trace		= trace_special_print,
+	.latency_trace	= trace_special_print,
+	.raw		= trace_special_print,
+	.hex		= trace_special_hex,
+	.binary		= trace_special_bin,
+};
+
+/* TRACE_STACK */
+
+static int
+trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct stack_entry *field;
+	int i;
+
+	trace_assign_type(field, entry);
+
+	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		if (i) {
+			if (trace_seq_puts(s, " <= "))
+				goto partial;
+
+			if (seq_print_ip_sym(s, field->caller[i], flags))
+				goto partial;
+		}
+		if (trace_seq_puts(s, "\n"))
+			goto partial;
+	}
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_stack_event = {
+	.type	 	= TRACE_STACK,
+	.trace		= trace_stack_print,
+	.latency_trace	= trace_stack_print,
+	.raw		= trace_special_print,
+	.hex		= trace_special_hex,
+	.binary		= trace_special_bin,
+};
+
+/* TRACE_USER_STACK */
+static int
+trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
+		       int flags)
+{
+	struct userstack_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_userip_objs(field, s, flags))
+		goto partial;
+
+	if (trace_seq_putc(s, '\n'))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_user_stack_event = {
+	.type	 	= TRACE_USER_STACK,
+	.trace		= trace_user_stack_print,
+	.latency_trace	= trace_user_stack_print,
+	.raw		= trace_special_print,
+	.hex		= trace_special_hex,
+	.binary		= trace_special_bin,
+};
+
+/* TRACE_PRINT */
+static int
+trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (trace_seq_printf(s, ": %s", field->buf))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_print_event = {
+	.type	 	= TRACE_PRINT,
+	.trace		= trace_print_print,
+	.latency_trace	= trace_print_print,
+	.raw		= trace_print_raw,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
+static struct trace_event *events[] __initdata = {
+	&trace_fn_event,
+	&trace_ctx_event,
+	&trace_wake_event,
+	&trace_special_event,
+	&trace_stack_event,
+	&trace_user_stack_event,
+	&trace_print_event,
+	NULL
+};
+
+__init static int init_events(void)
+{
+	struct trace_event *event;
+	int i, ret;
+
+	for (i = 0; events[i]; i++) {
+		event = events[i];
+
+		ret = register_ftrace_event(event);
+		if (!ret) {
+			printk(KERN_WARNING "event %d failed to register\n",
+			       event->type);
+			WARN_ON_ONCE(1);
+		}
+	}
+
+	return 0;
+}
+device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 1fcc76e..ecab4ea 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -36,8 +36,24 @@ struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
 
+int
+trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
+
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
 
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
 #endif
 
-- 
cgit v1.1


From dbd0b4b33074aa6b7832a9d9a5bd985eca5c1aa2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 28 Dec 2008 20:44:51 -0800
Subject: tracing/ftrace: provide the base infrastructure for histogram tracing

Impact: extend the tracing API

The goal of this patch is to normalize and make more easy the
implementation of statistical (histogram) tracing.

It implements a trace_stat file into the /debugfs/tracing directory where
one can print a one-shot output of statistics/histogram entries.

A tracer has to provide two basic iterator callbacks:

  stat_start() => the first entry
  stat_next(prev, idx) => the next one.

Note that it is adapted for arrays or hash tables or lists.... since it
provides a pointer to the previous entry and the current index of the
iterator.

These two callbacks are called to get a snapshot of the statistics at each
opening of the trace_stat file because. The values are so updated between
two "cat trace_stat". And the tracer is free to lock its datas during the
iteration to keep consistent values.

Since it is almost always interesting to sort statisticals values to
address the problems by priority, this infrastructure provides a "sorting"
of the stat entries too if desired. A tracer has just to provide a
stat_cmp callback to compare two entries and the stat tracing
infrastructure will build a sorted list of the given entries.

A last callback, called stat_headers, can be implemented by a tracer to
output headers on its trace.

If one of these callbacks is changed on runtime, it just have to signal it
to the stat tracing API by calling the init_tracer_stat() helper.

Changes in V2:

- Fix a memory leak if the user opens multiple times the trace_stat file
  without closing it. Now we always free our list before rebuilding it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile     |   1 +
 kernel/trace/trace.c      |   3 +-
 kernel/trace/trace.h      |  17 ++++
 kernel/trace/trace_stat.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 kernel/trace/trace_stat.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 549f93c..31cd5fb 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3f03175..b789c01 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2354,6 +2354,7 @@ static int tracing_set_tracer(char *buf)
 		if (ret)
 			goto out;
 	}
+	init_tracer_stat(t);
 
 	trace_branch_enable(tr);
  out:
@@ -3206,7 +3207,7 @@ __init static int tracer_alloc_buffers(void)
 #else
 	current_trace = &nop_trace;
 #endif
-
+	init_tracer_stat(current_trace);
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6bd71fa..05fa804 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,6 +336,21 @@ struct tracer {
 	struct tracer		*next;
 	int			print_max;
 	struct tracer_flags 	*flags;
+
+	/*
+	 * If you change one of the following on tracing runtime, recall
+	 * init_tracer_stat()
+	 */
+
+	/* Iteration over statistic entries */
+	void			*(*stat_start)(void);
+	void			*(*stat_next)(void *prev, int idx);
+	/* Compare two entries for sorting (optional) for stats */
+	int			(*stat_cmp)(void *p1, void *p2);
+	/* Print a stat entry */
+	int			(*stat_show)(struct seq_file *s, void *p);
+	/* Print the headers of your stat entries */
+	int			(*stat_headers)(struct seq_file *s);
 };
 
 struct trace_seq {
@@ -421,6 +436,8 @@ void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
+void init_tracer_stat(struct tracer *trace);
+
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_max_latency;
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 0000000..6f194a3
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,251 @@
+/*
+ * Infrastructure for statistic tracing (histogram output).
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Based on the code from trace_branch.c which is
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include "trace.h"
+
+
+/* List of stat entries from a tracer */
+struct trace_stat_list {
+	struct list_head list;
+	void *stat;
+};
+
+static struct trace_stat_list stat_list;
+
+/*
+ * This is a copy of the current tracer to avoid racy
+ * and dangerous output while the current tracer is
+ * switched.
+ */
+static struct tracer current_tracer;
+
+/*
+ * Protect both the current tracer and the global
+ * stat list.
+ */
+static DEFINE_MUTEX(stat_list_mutex);
+
+
+static void reset_stat_list(void)
+{
+	struct trace_stat_list *node;
+	struct list_head *next;
+
+	if (list_empty(&stat_list.list))
+		return;
+
+	node = list_entry(stat_list.list.next, struct trace_stat_list, list);
+	next = node->list.next;
+
+	while (&node->list != next) {
+		kfree(node);
+		node = list_entry(next, struct trace_stat_list, list);
+	}
+	kfree(node);
+
+	INIT_LIST_HEAD(&stat_list.list);
+}
+
+void init_tracer_stat(struct tracer *trace)
+{
+	mutex_lock(&stat_list_mutex);
+	current_tracer = *trace;
+	mutex_unlock(&stat_list_mutex);
+}
+
+/*
+ * For tracers that don't provide a stat_cmp callback.
+ * This one will force an immediate insertion on tail of
+ * the list.
+ */
+static int dummy_cmp(void *p1, void *p2)
+{
+	return 1;
+}
+
+/*
+ * Initialize the stat list at each trace_stat file opening.
+ * All of these copies and sorting are required on all opening
+ * since the stats could have changed between two file sessions.
+ */
+static int stat_seq_init(void)
+{
+	struct trace_stat_list *iter_entry, *new_entry;
+	void *prev_stat;
+	int ret = 0;
+	int i;
+
+	mutex_lock(&stat_list_mutex);
+	reset_stat_list();
+
+	if (!current_tracer.stat_start || !current_tracer.stat_next ||
+					!current_tracer.stat_show)
+		goto exit;
+
+	if (!current_tracer.stat_cmp)
+		current_tracer.stat_cmp = dummy_cmp;
+
+	/*
+	 * The first entry. Actually this is the second, but the first
+	 * one (the stat_list head) is pointless.
+	 */
+	new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+	if (!new_entry) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	INIT_LIST_HEAD(&new_entry->list);
+	list_add(&new_entry->list, &stat_list.list);
+	new_entry->stat = current_tracer.stat_start();
+
+	prev_stat = new_entry->stat;
+
+	/*
+	 * Iterate over the tracer stat entries and store them in a sorted
+	 * list.
+	 */
+	for (i = 1; ; i++) {
+		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+		if (!new_entry) {
+			ret = -ENOMEM;
+			goto exit_free_list;
+		}
+
+		INIT_LIST_HEAD(&new_entry->list);
+		new_entry->stat = current_tracer.stat_next(prev_stat, i);
+
+		/* End of insertion */
+		if (!new_entry->stat)
+			break;
+
+		list_for_each_entry(iter_entry, &stat_list.list, list) {
+			/* Insertion with a descendent sorting */
+			if (current_tracer.stat_cmp(new_entry->stat,
+						iter_entry->stat) > 0) {
+
+				list_add_tail(&new_entry->list,
+						&iter_entry->list);
+				break;
+
+			/* The current smaller value */
+			} else if (list_is_last(&iter_entry->list,
+						&stat_list.list)) {
+				list_add(&new_entry->list, &iter_entry->list);
+				break;
+			}
+		}
+
+		prev_stat = new_entry->stat;
+	}
+exit:
+	mutex_unlock(&stat_list_mutex);
+	return ret;
+
+exit_free_list:
+	reset_stat_list();
+	mutex_unlock(&stat_list_mutex);
+	return ret;
+}
+
+
+static void *stat_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+
+	/* Prevent from tracer switch or stat_list modification */
+	mutex_lock(&stat_list_mutex);
+
+	/* If we are in the beginning of the file, print the headers */
+	if (!*pos && current_tracer.stat_headers)
+		current_tracer.stat_headers(s);
+
+	return seq_list_start(&l->list, *pos);
+}
+
+static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
+{
+	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+
+	return seq_list_next(p, &l->list, pos);
+}
+
+static void stat_seq_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&stat_list_mutex);
+}
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+	return current_tracer.stat_show(s, l->stat);
+}
+
+static const struct seq_operations trace_stat_seq_ops = {
+	.start = stat_seq_start,
+	.next = stat_seq_next,
+	.stop = stat_seq_stop,
+	.show = stat_seq_show
+};
+
+static int tracing_stat_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &trace_stat_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = &stat_list;
+		ret = stat_seq_init();
+	}
+
+	return ret;
+}
+
+
+/*
+ * Avoid consuming memory with our now useless list.
+ */
+static int tracing_stat_release(struct inode *i, struct file *f)
+{
+	mutex_lock(&stat_list_mutex);
+	reset_stat_list();
+	mutex_unlock(&stat_list_mutex);
+	return 0;
+}
+
+static const struct file_operations tracing_stat_fops = {
+	.open		= tracing_stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_stat_release
+};
+
+static int __init tracing_stat_init(void)
+{
+	struct dentry *d_tracing;
+	struct dentry *entry;
+
+	INIT_LIST_HEAD(&stat_list.list);
+	d_tracing = tracing_init_dentry();
+
+	entry = debugfs_create_file("trace_stat", 0444, d_tracing,
+					NULL,
+				    &tracing_stat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'trace_stat' entry\n");
+	return 0;
+}
+fs_initcall(tracing_stat_init);
-- 
cgit v1.1


From e302cf3f961ceb54c1dd0aff7ba8531df83be07a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 27 Dec 2008 23:25:38 +0100
Subject: tracing/branch-tracer: adapt to the stat tracing API

Impact: refactor the branch tracer

This patch adapts the branch tracer to the tracing API.

This is a proof of concept because the branch tracer implements two
"stat tracing" that were split in two files.

So I added an option to the branch tracer: stat_all_branch.
If it is set, then trace_stat will output all of the branches
entries stats. Otherwise, it will print the annotated branches.

Its is a kind of quick trick, waiting for a better solution.

By default, the annotated branches stat are sorted by incorrect branch
prediction percentage.

Ie:

 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       0        1 100 native_smp_prepare_cpus        smpboot.c            1228
       0        1 100 hpet_rtc_timer_reinit          hpet.c               1057
       0    18032 100 sched_info_queued              sched_stats.h        223
       0      684 100 yield_task_fair                sched_fair.c         984
       0      282 100 pre_schedule_rt                sched_rt.c           1263
       0    13414 100 sched_info_dequeued            sched_stats.h        178
       0    21724 100 sched_info_switch              sched_stats.h        270
       0        1 100 get_signal_to_deliver          signal.c             1820
       0        8 100 __cancel_work_timer            workqueue.c          560
       0      212 100 verify_export_symbols          module.c             1509
       0       17 100 __rmqueue_fallback             page_alloc.c         793
       0       43 100 clear_page_mlock               internal.h           129
       0      124 100 try_to_unmap_anon              rmap.c               1021
       0       53 100 try_to_unmap_anon              rmap.c               1013
       0        6 100 vma_address                    rmap.c               232
       0     3301 100 try_to_unmap_file              rmap.c               1082
       0      466 100 try_to_unmap_file              rmap.c               1077
       0        1 100 mem_cgroup_create              memcontrol.c         1090
       0        3 100 inotify_find_update_watch      inotify.c            726
       2    30163  99 perf_counter_task_sched_out    perf_counter.c       385
       1     2935  99 percpu_free                    allocpercpu.c        138
    1544   297672  99 dentry_lru_del_init            dcache.c             153
       8     1074  99 input_pass_event               input.c              86
    1390    76781  98 mapping_unevictable            pagemap.h            50
     280     6665  95 pick_next_task_rt              sched_rt.c           889
     750     4826  86 next_pidmap                    pid.c                194
       2        8  80 blocking_notifier_chain_regist notifier.c           220
      36      130  78 ioremap_pte_range              ioremap.c            22
    1093     3247  74 IS_ERR                         err.h                34
    1023     2908  73 sched_slice                    sched_fair.c         445
      22       60  73 disk_put_part                  genhd.h              206
[...]

It enables a developer to quickly address the source of incorrect branch
predictions.  Note that this sorting would be better with a second sort on
the number of incorrect predictions.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 268 +++++++++++++++++++++++---------------------
 1 file changed, 142 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c15222a..4785a3b 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -18,10 +18,13 @@
 #include "trace.h"
 #include "trace_output.h"
 
+static struct tracer branch_trace;
+
 #ifdef CONFIG_BRANCH_TRACER
 
 static int branch_tracing_enabled __read_mostly;
 static DEFINE_MUTEX(branch_tracing_mutex);
+
 static struct trace_array *branch_tracer;
 
 static void
@@ -178,6 +181,7 @@ trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
+
 static struct trace_event trace_branch_event = {
 	.type	 	= TRACE_BRANCH,
 	.trace		= trace_branch_print,
@@ -187,30 +191,6 @@ static struct trace_event trace_branch_event = {
 	.binary		= trace_nop_print,
 };
 
-struct tracer branch_trace __read_mostly =
-{
-	.name		= "branch",
-	.init		= branch_trace_init,
-	.reset		= branch_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_branch,
-#endif
-};
-
-__init static int init_branch_trace(void)
-{
-	int ret;
-
-	ret = register_ftrace_event(&trace_branch_event);
-	if (!ret) {
-		printk(KERN_WARNING "Warning: could not register branch events\n");
-		return 1;
-	}
-
-	return register_tracer(&branch_trace);
-}
-
-device_initcall(init_branch_trace);
 #else
 static inline
 void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -236,66 +216,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
 }
 EXPORT_SYMBOL(ftrace_likely_update);
 
-struct ftrace_pointer {
-	void		*start;
-	void		*stop;
-	int		hit;
-};
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
 
-static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+static int annotated_branch_stat_headers(struct seq_file *m)
 {
-	const struct ftrace_pointer *f = m->private;
-	struct ftrace_branch_data *p = v;
-
-	(*pos)++;
-
-	if (v == (void *)1)
-		return f->start;
-
-	++p;
-
-	if ((void *)p >= (void *)f->stop)
-		return NULL;
-
-	return p;
+	seq_printf(m, " correct incorrect  %% ");
+	seq_printf(m, "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+	return 0;
 }
 
-static void *t_start(struct seq_file *m, loff_t *pos)
+static inline long get_incorrect_percent(struct ftrace_branch_data *p)
 {
-	void *t = (void *)1;
-	loff_t l = 0;
-
-	for (; t && l < *pos; t = t_next(m, t, &l))
-		;
+	long percent;
 
-	return t;
-}
+	if (p->correct) {
+		percent = p->incorrect * 100;
+		percent /= p->correct + p->incorrect;
+	} else
+		percent = p->incorrect ? 100 : -1;
 
-static void t_stop(struct seq_file *m, void *p)
-{
+	return percent;
 }
 
-static int t_show(struct seq_file *m, void *v)
+static int branch_stat_show(struct seq_file *m, void *v)
 {
-	const struct ftrace_pointer *fp = m->private;
 	struct ftrace_branch_data *p = v;
 	const char *f;
 	long percent;
 
-	if (v == (void *)1) {
-		if (fp->hit)
-			seq_printf(m, "   miss      hit    %% ");
-		else
-			seq_printf(m, " correct incorrect  %% ");
-		seq_printf(m, "       Function                "
-			      "  File              Line\n"
-			      " ------- ---------  - "
-			      "       --------                "
-			      "  ----              ----\n");
-		return 0;
-	}
-
 	/* Only print the file, not the path */
 	f = p->file + strlen(p->file);
 	while (f >= p->file && *f != '/')
@@ -305,11 +258,7 @@ static int t_show(struct seq_file *m, void *v)
 	/*
 	 * The miss is overlayed on correct, and hit on incorrect.
 	 */
-	if (p->correct) {
-		percent = p->incorrect * 100;
-		percent /= p->correct + p->incorrect;
-	} else
-		percent = p->incorrect ? 100 : -1;
+	percent = get_incorrect_percent(p);
 
 	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
 	if (percent < 0)
@@ -320,76 +269,143 @@ static int t_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations tracing_likely_seq_ops = {
-	.start		= t_start,
-	.next		= t_next,
-	.stop		= t_stop,
-	.show		= t_show,
-};
+static void *annotated_branch_stat_start(void)
+{
+	return __start_annotated_branch_profile;
+}
 
-static int tracing_branch_open(struct inode *inode, struct file *file)
+static void *
+annotated_branch_stat_next(void *v, int idx)
 {
-	int ret;
+	struct ftrace_branch_data *p = v;
 
-	ret = seq_open(file, &tracing_likely_seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = (void *)inode->i_private;
-	}
+	++p;
 
-	return ret;
+	if ((void *)p >= (void *)__stop_annotated_branch_profile)
+		return NULL;
+
+	return p;
 }
 
-static const struct file_operations tracing_branch_fops = {
-	.open		= tracing_branch_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-};
+static int annotated_branch_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_branch_data *a = p1;
+	struct ftrace_branch_data *b = p2;
+
+	long percent_a, percent_b;
+
+	percent_a = get_incorrect_percent(a);
+	percent_b = get_incorrect_percent(b);
+
+	if (percent_a < percent_b)
+		return -1;
+	if (percent_a > percent_b)
+		return 1;
+	else
+		return 0;
+}
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
-extern unsigned long __start_branch_profile[];
-extern unsigned long __stop_branch_profile[];
+enum {
+	TRACE_BRANCH_OPT_ALL = 0x1
+};
 
-static const struct ftrace_pointer ftrace_branch_pos = {
-	.start			= __start_branch_profile,
-	.stop			= __stop_branch_profile,
-	.hit			= 1,
+static struct tracer_opt branch_opts[] = {
+	{ TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) },
+	{ }
 };
 
-#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+static struct tracer_flags branch_flags = {
+	.val = 0,
+	.opts = branch_opts
+};
 
-extern unsigned long __start_annotated_branch_profile[];
-extern unsigned long __stop_annotated_branch_profile[];
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
 
-static const struct ftrace_pointer ftrace_annotated_branch_pos = {
-	.start			= __start_annotated_branch_profile,
-	.stop			= __stop_annotated_branch_profile,
-};
+static int all_branch_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "   miss      hit    %% ");
+	seq_printf(m, "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+	return 0;
+}
 
-static __init int ftrace_branch_init(void)
+static void *all_branch_stat_start(void)
 {
-	struct dentry *d_tracer;
-	struct dentry *entry;
+	return __start_branch_profile;
+}
+
+static void *
+all_branch_stat_next(void *v, int idx)
+{
+	struct ftrace_branch_data *p = v;
 
-	d_tracer = tracing_init_dentry();
+	++p;
 
-	entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
-				    (void *)&ftrace_annotated_branch_pos,
-				    &tracing_branch_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'profile_annotatet_branch' entry\n");
+	if ((void *)p >= (void *)__stop_branch_profile)
+		return NULL;
 
-#ifdef CONFIG_PROFILE_ALL_BRANCHES
-	entry = debugfs_create_file("profile_branch", 0444, d_tracer,
-				    (void *)&ftrace_branch_pos,
-				    &tracing_branch_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs"
-			   " 'profile_branch' entry\n");
-#endif
+	return p;
+}
 
+static int branch_set_flag(u32 old_flags, u32 bit, int set)
+{
+	if (bit == TRACE_BRANCH_OPT_ALL) {
+		if (set) {
+			branch_trace.stat_headers = all_branch_stat_headers;
+			branch_trace.stat_start = all_branch_stat_start;
+			branch_trace.stat_next = all_branch_stat_next;
+			branch_trace.stat_cmp = NULL;
+		} else {
+			branch_trace.stat_headers =
+				annotated_branch_stat_headers;
+			branch_trace.stat_start = annotated_branch_stat_start;
+			branch_trace.stat_next = annotated_branch_stat_next;
+			branch_trace.stat_cmp = annotated_branch_stat_cmp;
+		}
+		init_tracer_stat(&branch_trace);
+	}
 	return 0;
 }
 
-device_initcall(ftrace_branch_init);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+static struct tracer branch_trace __read_mostly =
+{
+	.name		= "branch",
+#ifdef CONFIG_BRANCH_TRACER
+	.init		= branch_trace_init,
+	.reset		= branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_branch,
+#endif /* CONFIG_FTRACE_SELFTEST */
+#endif /* CONFIG_BRANCH_TRACER */
+	.stat_start	=	annotated_branch_stat_start,
+	.stat_next	= annotated_branch_stat_next,
+	.stat_show	= branch_stat_show,
+	.stat_headers	= annotated_branch_stat_headers,
+	.stat_cmp	= annotated_branch_stat_cmp,
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+	.flags	= &branch_flags,
+	.set_flag	= branch_set_flag,
+#endif
+};
+
+__init static int init_branch_trace(void)
+{
+#ifdef CONFIG_BRANCH_TRACER
+	int ret;
+	ret = register_ftrace_event(&trace_branch_event);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register branch events\n");
+		return 1;
+	}
+#endif
+
+	return register_tracer(&branch_trace);
+}
+device_initcall(init_branch_trace);
-- 
cgit v1.1


From f7d48cbde5c0710008caeaf7dbf14f4a9b064940 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 13:02:17 +0100
Subject: tracing/ftrace: make trace_find_cmdline() generally available

Impact: build fix

On !CONFIG_CONTEXT_SWITCH_TRACER trace_find_cmdline() is not defined:

 kernel/trace/trace_output.c: In function 'trace_ctxwake_print':
 kernel/trace/trace_output.c:499: error: implicit declaration of function 'trace_find_cmdline'
 kernel/trace/trace_output.c:499: warning: assignment makes pointer from integer without a cast

Move it to the generic section in trace.h.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 05fa804..a8b624c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -469,10 +469,10 @@ struct tracer_switch_ops {
 	void				*private;
 	struct tracer_switch_ops	*next;
 };
-
-char *trace_find_cmdline(int pid);
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
+extern char *trace_find_cmdline(int pid);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
 #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
-- 
cgit v1.1


From a103e2ab7377dbbef2506be59c49a3f2ae10b60b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 15:07:47 +0100
Subject: tracing/selftest: remove TRACE_CONT reference

Impact: build fix

TRACE_CONT is gone - fix up the self-test too.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb7..5013812 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,7 +9,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_FN:
 	case TRACE_CTX:
 	case TRACE_WAKE:
-	case TRACE_CONT:
 	case TRACE_STACK:
 	case TRACE_PRINT:
 	case TRACE_SPECIAL:
-- 
cgit v1.1


From 7a51cffbd10886c0557677dd916c090097c691ef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 16:03:40 +0100
Subject: relayfs: replace BUG() with WARN_ON() in relay_late_setup_files()

Impact: turn boot crash into boot warning

This BUG() can trigger:

[   16.684131] initcall fail_page_alloc_debugfs+0x0/0xc1 returned 0 after 0 usecs
[   16.692035] calling  kmemtrace_setup_late+0x0/0xd5 @ 1
[   16.700087] relay_late_setup_files: CPU 1 has no buffer, it must have!
[   16.704044] ------------[ cut here ]------------
[   16.708030] kernel BUG at kernel/relay.c:680!
[   16.708030] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
[   16.708030] last sysfs file:
[   16.708030]
[   16.708030] Pid: 1, comm: swapper Not tainted (2.6.28-tip-03903-g9a39f58-dirty #13207) System Product Name
[   16.708030] EIP: 0060:[<c01604ae>] EFLAGS: 00010246 CPU: 1
[   16.708030] EIP is at relay_late_setup_files+0x8c/0x176

Reduce it to a more reportable WARN_ONCE().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/relay.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac200..d064506 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -675,9 +675,7 @@ int relay_late_setup_files(struct rchan *chan,
 	 */
 	for_each_online_cpu(i) {
 		if (unlikely(!chan->buf[i])) {
-			printk(KERN_ERR "relay_late_setup_files: CPU %u "
-					"has no buffer, it must have!\n", i);
-			BUG();
+			WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
 			err = -EINVAL;
 			break;
 		}
-- 
cgit v1.1


From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 29 Dec 2008 13:42:23 -0800
Subject: tracing/kmemtrace: normalize the raw tracer event to the unified
 tracing API

Impact: new tracer plugin

This patch adapts kmemtrace raw events tracing to the unified tracing API.

To enable and use this tracer, just do the following:

 echo kmemtrace > /debugfs/tracing/current_tracer
 cat /debugfs/tracing/trace

You will have the following output:

 # tracer: kmemtrace
 #
 #
 # ALLOC  TYPE  REQ   GIVEN  FLAGS           POINTER         NODE    CALLER
 # FREE   |      |     |       |              |   |            |        |
 # |

type_id 1 call_site 18446744071565527833 ptr 18446612134395152256
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1
type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584

That was to stay backward compatible with the format output produced in
inux/tracepoint.h.

This is the default ouput, but note that I tried something else.

If you change an option:

echo kmem_minimalistic > /debugfs/trace_options

and then cat /debugfs/trace, you will have the following output:

 # tracer: kmemtrace
 #
 #
 # ALLOC  TYPE  REQ   GIVEN  FLAGS           POINTER         NODE    CALLER
 # FREE   |      |     |       |              |   |            |        |
 # |

   -      C                            0xffff88007c088780          file_free_rcu
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   +      K    240    240   000000d0   0xffff8800790dc780     -1   d_alloc
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   +      K    240    240   000000d0   0xffff8800790dc870     -1   d_alloc
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   +      K    240    240   000000d0   0xffff8800790dc960     -1   d_alloc
   +      K   1304   1312   000000d0   0xffff8800791d7340     -1   reiserfs_alloc_inode
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   -      C                            0xffff88007cad6000          putname
   +      K    992   1000   000000d0   0xffff880079045b58     -1   alloc_inode
   +      K    768   1024   000080d0   0xffff88007c096400     -1   alloc_pipe_info
   +      K    240    240   000000d0   0xffff8800790dca50     -1   d_alloc
   +      K    272    320   000080d0   0xffff88007c088780     -1   get_empty_filp
   +      K    272    320   000080d0   0xffff88007c088000     -1   get_empty_filp

Yeah I shall confess kmem_minimalistic should be: kmem_alternative.

Whatever, I find it more readable but this a personal opinion of course.
We can drop it if you want.

On the ALLOC/FREE column, + means an allocation and - a free.

On the type column, you have K = kmalloc, C = cache, P = page

I would like the flags to be GFP_* strings but that would not be easy to not
break the column with strings....

About the node...it seems to always be -1. I don't know why but that shouldn't
be difficult to find.

I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would
be more easy to find the tracer headers if they are all in their common
directory.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig     |  22 +++
 kernel/trace/Makefile    |   1 +
 kernel/trace/kmemtrace.c | 343 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h     |  25 ++++
 4 files changed, 391 insertions(+)
 create mode 100644 kernel/trace/kmemtrace.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6..27fb74b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -264,6 +264,28 @@ config HW_BRANCH_TRACER
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 
+config KMEMTRACE
+	bool "Trace SLAB allocations"
+	select TRACING
+	depends on RELAY
+	help
+	  kmemtrace provides tracing for slab allocator functions, such as
+	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
+	  data is then fed to the userspace application in order to analyse
+	  allocation hotspots, internal fragmentation and so on, making it
+	  possible to see how well an allocator performs, as well as debug
+	  and profile kernel code.
+
+	  This requires an userspace application to use. See
+	  Documentation/vm/kmemtrace.txt for more information.
+
+	  Saying Y will make the kernel somewhat larger and slower. However,
+	  if you disable kmemtrace at run-time or boot-time, the performance
+	  impact is minimal (depending on the arch the kernel is built for).
+
+	  If unsure, say N.
+
+
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a9..513dc86 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
+obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 0000000..d69cbe3
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,343 @@
+/*
+ * Memory allocator tracing
+ *
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/dcache.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <trace/kmemtrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_KMEM_OPT_MINIMAL	0x1
+
+static struct tracer_opt kmem_opts[] = {
+	/* Default disable the minimalistic output */
+	{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
+	{ }
+};
+
+static struct tracer_flags kmem_tracer_flags = {
+	.val = 0,
+	.opts = kmem_opts
+};
+
+
+static bool kmem_tracing_enabled __read_mostly;
+static struct trace_array *kmemtrace_array;
+
+static int kmem_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	kmemtrace_array = tr;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+
+	kmem_tracing_enabled = true;
+
+	return 0;
+}
+
+static void kmem_trace_reset(struct trace_array *tr)
+{
+	kmem_tracing_enabled = false;
+}
+
+static void kmemtrace_headers(struct seq_file *s)
+{
+	/* Don't need headers for the original kmemtrace output */
+	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+		return;
+
+	seq_printf(s, "#\n");
+	seq_printf(s, "# ALLOC  TYPE  REQ   GIVEN  FLAGS     "
+			"      POINTER         NODE    CALLER\n");
+	seq_printf(s, "# FREE   |      |     |       |       "
+			"       |   |            |        |\n");
+	seq_printf(s, "# |\n\n");
+}
+
+/*
+ * The two following functions give the original output from kmemtrace,
+ * or something close to....perhaps they need some missing things
+ */
+static enum print_line_t
+kmemtrace_print_alloc_original(struct trace_iterator *iter,
+				struct kmemtrace_alloc_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Taken from the old linux/kmemtrace.h */
+	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
+	  "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
+	   entry->type_id, entry->call_site, (unsigned long) entry->ptr,
+	   (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
+	   (unsigned long) entry->gfp_flags, entry->node);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_original(struct trace_iterator *iter,
+				struct kmemtrace_free_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Taken from the old linux/kmemtrace.h */
+	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
+	   entry->type_id, entry->call_site, (unsigned long) entry->ptr);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+
+/* The two other following provide a more minimalistic output */
+static enum print_line_t
+kmemtrace_print_alloc_compress(struct trace_iterator *iter,
+					struct kmemtrace_alloc_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Alloc entry */
+	ret = trace_seq_printf(s, "  +      ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Type */
+	switch (entry->type_id) {
+	case KMEMTRACE_TYPE_KMALLOC:
+		ret = trace_seq_printf(s, "K   ");
+		break;
+	case KMEMTRACE_TYPE_CACHE:
+		ret = trace_seq_printf(s, "C   ");
+		break;
+	case KMEMTRACE_TYPE_PAGES:
+		ret = trace_seq_printf(s, "P   ");
+		break;
+	default:
+		ret = trace_seq_printf(s, "?   ");
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Requested */
+	ret = trace_seq_printf(s, "%4d   ", entry->bytes_req);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Allocated */
+	ret = trace_seq_printf(s, "%4d   ", entry->bytes_alloc);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Flags
+	 * TODO: would be better to see the name of the GFP flag names
+	 */
+	ret = trace_seq_printf(s, "%08x   ", entry->gfp_flags);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Pointer to allocated */
+	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Node */
+	ret = trace_seq_printf(s, "%4d   ", entry->node);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Call site */
+	ret = seq_print_ip_sym(s, entry->call_site, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (!trace_seq_printf(s, "\n"))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_compress(struct trace_iterator *iter,
+				struct kmemtrace_free_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Free entry */
+	ret = trace_seq_printf(s, "  -      ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Type */
+	switch (entry->type_id) {
+	case KMEMTRACE_TYPE_KMALLOC:
+		ret = trace_seq_printf(s, "K     ");
+		break;
+	case KMEMTRACE_TYPE_CACHE:
+		ret = trace_seq_printf(s, "C     ");
+		break;
+	case KMEMTRACE_TYPE_PAGES:
+		ret = trace_seq_printf(s, "P     ");
+		break;
+	default:
+		ret = trace_seq_printf(s, "?     ");
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Skip requested/allocated/flags */
+	ret = trace_seq_printf(s, "                       ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Pointer to allocated */
+	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Skip node */
+	ret = trace_seq_printf(s, "       ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Call site */
+	ret = seq_print_ip_sym(s, entry->call_site, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (!trace_seq_printf(s, "\n"))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+
+	switch (entry->type) {
+	case TRACE_KMEM_ALLOC: {
+		struct kmemtrace_alloc_entry *field;
+		trace_assign_type(field, entry);
+		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
+			return kmemtrace_print_alloc_compress(iter, field);
+		else
+			return kmemtrace_print_alloc_original(iter, field);
+	}
+
+	case TRACE_KMEM_FREE: {
+		struct kmemtrace_free_entry *field;
+		trace_assign_type(field, entry);
+		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
+			return kmemtrace_print_free_compress(iter, field);
+		else
+			return kmemtrace_print_free_original(iter, field);
+	}
+
+	default:
+		return TRACE_TYPE_UNHANDLED;
+	}
+}
+
+/* Trace allocations */
+void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
+			     unsigned long call_site,
+			     const void *ptr,
+			     size_t bytes_req,
+			     size_t bytes_alloc,
+			     gfp_t gfp_flags,
+			     int node)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_alloc_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+	unsigned long irq_flags;
+
+	if (!kmem_tracing_enabled)
+		return;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_ALLOC;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+	entry->bytes_req = bytes_req;
+	entry->bytes_alloc = bytes_alloc;
+	entry->gfp_flags = gfp_flags;
+	entry->node	=	node;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+}
+
+void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
+		       unsigned long call_site,
+		       const void *ptr)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_free_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+	unsigned long irq_flags;
+
+	if (!kmem_tracing_enabled)
+		return;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_FREE;
+	entry->type_id	= type_id;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+}
+
+static struct tracer kmem_tracer __read_mostly = {
+	.name		= "kmemtrace",
+	.init		= kmem_trace_init,
+	.reset		= kmem_trace_reset,
+	.print_line	= kmemtrace_print_line,
+	.print_header = kmemtrace_headers,
+	.flags		= &kmem_tracer_flags
+};
+
+static int __init init_kmem_tracer(void)
+{
+	return register_tracer(&kmem_tracer);
+}
+
+device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f8..534505b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,7 @@
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
+#include <trace/kmemtrace.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -29,6 +30,8 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_HW_BRANCHES,
+	TRACE_KMEM_ALLOC,
+	TRACE_KMEM_FREE,
 	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
@@ -170,6 +173,24 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
+struct kmemtrace_alloc_entry {
+	struct trace_entry	ent;
+	enum kmemtrace_type_id type_id;
+	unsigned long call_site;
+	const void *ptr;
+	size_t bytes_req;
+	size_t bytes_alloc;
+	gfp_t gfp_flags;
+	int node;
+};
+
+struct kmemtrace_free_entry {
+	struct trace_entry	ent;
+	enum kmemtrace_type_id type_id;
+	unsigned long call_site;
+	const void *ptr;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -280,6 +301,10 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
  		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
+			  TRACE_KMEM_ALLOC);	\
+		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
+			  TRACE_KMEM_FREE);	\
 		__ftrace_bad_type();					\
 	} while (0)
 
-- 
cgit v1.1


From 3fd4bc015ef879a7d2b955ce97fb125e3a51ba7e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 30 Dec 2008 12:07:27 +0100
Subject: tracing/kmemtrace: export kmemtrace_mark_alloc_node() /
 kmemtrace_mark_free()

Impact: build fix

Also fix up Kconfig dependencies and include files.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig     | 3 ++-
 kernel/trace/kmemtrace.c | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 27fb74b..cc9f91e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -267,7 +267,8 @@ config HW_BRANCH_TRACER
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select TRACING
-	depends on RELAY
+	select MARKERS
+	select RELAY
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index d69cbe3..2bfdcd3 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -296,6 +296,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 
 	trace_wake_up();
 }
+EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
 
 void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 		       unsigned long call_site,
@@ -325,6 +326,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 
 	trace_wake_up();
 }
+EXPORT_SYMBOL(kmemtrace_mark_free);
 
 static struct tracer kmem_tracer __read_mostly = {
 	.name		= "kmemtrace",
-- 
cgit v1.1


From 723cbe0775514853c22dc45005af59c360916af1 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 5 Jan 2009 22:09:58 +0200
Subject: kmemtrace: Remove the relay version of kmemtrace

Impact: cleanup

kmemtrace now uses ftrace. This patch removes the relay version.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cc9f91e..1c0b750 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -267,8 +267,6 @@ config HW_BRANCH_TRACER
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select TRACING
-	select MARKERS
-	select RELAY
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
-- 
cgit v1.1


From 3e80680208ba6ce9635ca7c21ad0019442ea166a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 6 Jan 2009 10:16:35 +0100
Subject: kmemtrace: add kmemtrace_init()

Impact: build fix

leftover from the relayfs version - but we want to keep it because
this call is the earliest opportunity when we can start kmemtrace
tracing. (after kmem_cache_init()).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 2bfdcd3..faaa5ae 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -337,6 +337,11 @@ static struct tracer kmem_tracer __read_mostly = {
 	.flags		= &kmem_tracer_flags
 };
 
+void kmemtrace_init(void)
+{
+	/* earliest opportunity to start kmem tracing */
+}
+
 static int __init init_kmem_tracer(void)
 {
 	return register_tracer(&kmem_tracer);
-- 
cgit v1.1


From 431aa3fbf5bbe3be79809c7e603c2ed2ac64b015 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 6 Jan 2009 12:43:01 -0500
Subject: ftrace: convert unsigned index to signed

Impact: fix to unsigned compared to less than zero

Roel Kluin pointed out that there is a compare of an unsigned number
to less than zero. A previous clean up had the unsigned index set
to -1 for certain cases, but never converted it to signed.

Frederic Weisbecker noticed that another index is used to compare
the above index to and it also needs to be converted to signed.

[
  Converted ftrace_page->index to int from unsigned long as
  Andrew Morton pointed out that there's no need for it to
  stay a long.
]

Reported-by: Roel Kluin <roel.kluin@gmail.com>
Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969..9e54a6c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -289,7 +289,7 @@ static DEFINE_MUTEX(ftrace_regex_lock);
 
 struct ftrace_page {
 	struct ftrace_page	*next;
-	unsigned long		index;
+	int			index;
 	struct dyn_ftrace	records[];
 };
 
@@ -786,7 +786,7 @@ enum {
 
 struct ftrace_iterator {
 	struct ftrace_page	*pg;
-	unsigned		idx;
+	int			idx;
 	unsigned		flags;
 	unsigned char		buffer[FTRACE_BUFF_MAX+1];
 	unsigned		buffer_idx;
-- 
cgit v1.1


From ff288b274a9b383046fdbda4be3067daba4d5fe8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 6 Jan 2009 21:33:30 +0100
Subject: tracing/ftrace: fix a memory leak in stat tracing

Impact: fix memory leak

This patch fixes a memory leak inside reset_stat_list(). The freeing
loop iterated only once.

Also turn the stat_list into a simple struct list_head, which
simplify the code and avoid an unused static pointer.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 6f194a3..4cb4ff2 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -21,7 +21,7 @@ struct trace_stat_list {
 	void *stat;
 };
 
-static struct trace_stat_list stat_list;
+static LIST_HEAD(stat_list);
 
 /*
  * This is a copy of the current tracer to avoid racy
@@ -39,22 +39,12 @@ static DEFINE_MUTEX(stat_list_mutex);
 
 static void reset_stat_list(void)
 {
-	struct trace_stat_list *node;
-	struct list_head *next;
+	struct trace_stat_list *node, *next;
 
-	if (list_empty(&stat_list.list))
-		return;
-
-	node = list_entry(stat_list.list.next, struct trace_stat_list, list);
-	next = node->list.next;
-
-	while (&node->list != next) {
+	list_for_each_entry_safe(node, next, &stat_list, list)
 		kfree(node);
-		node = list_entry(next, struct trace_stat_list, list);
-	}
-	kfree(node);
 
-	INIT_LIST_HEAD(&stat_list.list);
+	INIT_LIST_HEAD(&stat_list);
 }
 
 void init_tracer_stat(struct tracer *trace)
@@ -107,7 +97,7 @@ static int stat_seq_init(void)
 	}
 
 	INIT_LIST_HEAD(&new_entry->list);
-	list_add(&new_entry->list, &stat_list.list);
+	list_add(&new_entry->list, &stat_list);
 	new_entry->stat = current_tracer.stat_start();
 
 	prev_stat = new_entry->stat;
@@ -130,7 +120,7 @@ static int stat_seq_init(void)
 		if (!new_entry->stat)
 			break;
 
-		list_for_each_entry(iter_entry, &stat_list.list, list) {
+		list_for_each_entry(iter_entry, &stat_list, list) {
 			/* Insertion with a descendent sorting */
 			if (current_tracer.stat_cmp(new_entry->stat,
 						iter_entry->stat) > 0) {
@@ -141,7 +131,7 @@ static int stat_seq_init(void)
 
 			/* The current smaller value */
 			} else if (list_is_last(&iter_entry->list,
-						&stat_list.list)) {
+						&stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
@@ -162,7 +152,7 @@ exit_free_list:
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+	struct list_head *l = (struct list_head *)s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&stat_list_mutex);
@@ -171,14 +161,14 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 	if (!*pos && current_tracer.stat_headers)
 		current_tracer.stat_headers(s);
 
-	return seq_list_start(&l->list, *pos);
+	return seq_list_start(l, *pos);
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+	struct list_head *l = (struct list_head *)s->private;
 
-	return seq_list_next(p, &l->list, pos);
+	return seq_list_next(p, l, pos);
 }
 
 static void stat_seq_stop(struct seq_file *m, void *p)
@@ -188,8 +178,10 @@ static void stat_seq_stop(struct seq_file *m, void *p)
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
-	return current_tracer.stat_show(s, l->stat);
+	struct trace_stat_list *entry =
+		list_entry(v, struct trace_stat_list, list);
+
+	return current_tracer.stat_show(s, entry->stat);
 }
 
 static const struct seq_operations trace_stat_seq_ops = {
@@ -237,7 +229,6 @@ static int __init tracing_stat_init(void)
 	struct dentry *d_tracing;
 	struct dentry *entry;
 
-	INIT_LIST_HEAD(&stat_list.list);
 	d_tracing = tracing_init_dentry();
 
 	entry = debugfs_create_file("trace_stat", 0444, d_tracing,
-- 
cgit v1.1


From e8a9cbf6ae620d9e5ba9cb42001c033287a284a3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 6 Jan 2009 22:02:35 -0500
Subject: trace: clean up funny line breaks in stat_seq_show

Impact: clean up

Andrew Morton pointed out that the entry assignment in stat_seq_show
did not need to be done in the declaration, causing funny line breaks.

This patch makes it a bit more pleasing on the eyes.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 4cb4ff2..f110ce9 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -178,8 +178,9 @@ static void stat_seq_stop(struct seq_file *m, void *p)
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct trace_stat_list *entry =
-		list_entry(v, struct trace_stat_list, list);
+	struct trace_stat_list *entry;
+
+	entry =	list_entry(v, struct trace_stat_list, list);
 
 	return current_tracer.stat_show(s, entry->stat);
 }
-- 
cgit v1.1


From 34a148bf0911a4a1cae85f8ecba57affb4d76aee Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 9 Jan 2009 12:27:09 -0800
Subject: kernel/trace/ring_buffer.c: reduce inlining

text    data     bss     dec     hex filename
before:  11320     228       8   11556    2d24 kernel/trace/ring_buffer.o
after:   10592     228       8   10828    2a4c kernel/trace/ring_buffer.o

Also: free_page(0) is legal.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0..9542990 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -133,7 +133,7 @@ enum {
 };
 
 /* inline for ring buffer fast paths */
-static inline unsigned
+static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
 	unsigned length;
@@ -179,7 +179,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 
 /* inline for ring buffer fast paths */
-static inline void *
+static void *
 rb_event_data(struct ring_buffer_event *event)
 {
 	BUG_ON(event->type != RINGBUF_TYPE_DATA);
@@ -229,10 +229,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
  */
-static inline void free_buffer_page(struct buffer_page *bpage)
+static void free_buffer_page(struct buffer_page *bpage)
 {
-	if (bpage->page)
-		free_page((unsigned long)bpage->page);
+	free_page((unsigned long)bpage->page);
 	kfree(bpage);
 }
 
@@ -811,7 +810,7 @@ rb_event_index(struct ring_buffer_event *event)
 	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static inline int
+static int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	     struct ring_buffer_event *event)
 {
@@ -825,7 +824,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		rb_commit_index(cpu_buffer) == index;
 }
 
-static inline void
+static void
 rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
 		    struct ring_buffer_event *event)
 {
@@ -850,7 +849,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
 	local_set(&cpu_buffer->commit_page->page->commit, index);
 }
 
-static inline void
+static void
 rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	/*
@@ -896,7 +895,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->read = 0;
 }
 
-static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+static void rb_inc_iter(struct ring_buffer_iter *iter)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 
@@ -926,7 +925,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
  * and with this, we can determine what to place into the
  * data field.
  */
-static inline void
+static void
 rb_update_event(struct ring_buffer_event *event,
 			 unsigned type, unsigned length)
 {
@@ -964,7 +963,7 @@ rb_update_event(struct ring_buffer_event *event,
 	}
 }
 
-static inline unsigned rb_calculate_event_length(unsigned length)
+static unsigned rb_calculate_event_length(unsigned length)
 {
 	struct ring_buffer_event event; /* Used only for sizeof array */
 
@@ -1438,7 +1437,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_write);
 
-static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *reader = cpu_buffer->reader_page;
 	struct buffer_page *head = cpu_buffer->head_page;
-- 
cgit v1.1


From 67d347245f76a149c45bffb1a10145d31d61d1da Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 9 Jan 2009 12:27:09 -0800
Subject: kernel/trace/ring_buffer.c: use DIV_ROUND_UP

Instead of open-coding it.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9542990..4832ffa 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -123,8 +123,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
 EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
 #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
-#define RB_ALIGNMENT_SHIFT	2
-#define RB_ALIGNMENT		(1 << RB_ALIGNMENT_SHIFT)
+#define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	28
 
 enum {
@@ -151,7 +150,7 @@ rb_event_length(struct ring_buffer_event *event)
 
 	case RINGBUF_TYPE_DATA:
 		if (event->len)
-			length = event->len << RB_ALIGNMENT_SHIFT;
+			length = event->len * RB_ALIGNMENT;
 		else
 			length = event->array[0];
 		return length + RB_EVNT_HDR_SIZE;
@@ -937,15 +936,11 @@ rb_update_event(struct ring_buffer_event *event,
 		break;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
-		event->len =
-			(RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
-			>> RB_ALIGNMENT_SHIFT;
+		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
 		break;
 
 	case RINGBUF_TYPE_TIME_STAMP:
-		event->len =
-			(RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
-			>> RB_ALIGNMENT_SHIFT;
+		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
 		break;
 
 	case RINGBUF_TYPE_DATA:
@@ -954,9 +949,7 @@ rb_update_event(struct ring_buffer_event *event,
 			event->len = 0;
 			event->array[0] = length;
 		} else
-			event->len =
-				(length + (RB_ALIGNMENT-1))
-				>> RB_ALIGNMENT_SHIFT;
+			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 		break;
 	default:
 		BUG();
-- 
cgit v1.1


From 034939b65ad5ff64b9709210b3469a95153c51a3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 8 Jan 2009 10:03:56 -0800
Subject: tracing/ftrace: handle more than one stat file per tracer

Impact: new API for tracers

Make the stat tracing API reentrant. And also provide the new directory
/debugfs/tracing/trace_stat which will contain all the stat files for the
current active tracer.

Now a tracer will, if desired, want to provide a zero terminated array of
tracer_stat structures.
Each one contains the callbacks necessary for one stat file.
It have to provide at least a name for its stat file, an iterator with
stat_start/start_next callback and an output callback for one stat entry.

Also adapt the branch tracer to this new API.
We create two files "all" and "annotated" inside the /debugfs/tracing/trace_stat
directory, making the both stats simultaneously available instead of needing
to change an option to switch from one stat file to another.

The output of these stats haven't changed.

Changes in v2:

_ Apply the previous memory leak fix (rebase against tip/master)

Changes in v3:

_ Merge the patch that adapted the branch tracer to this Api in this patch to
  not break the kernel build.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h        |  35 ++++---
 kernel/trace/trace_branch.c |  69 ++++++-------
 kernel/trace/trace_stat.c   | 230 ++++++++++++++++++++++++++++++++------------
 3 files changed, 217 insertions(+), 117 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 94ed45e..b3f9ad1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -335,6 +335,25 @@ struct tracer_flags {
 #define TRACER_OPT(s, b)	.name = #s, .bit = b
 
 /*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+	/* The name of your stat file */
+	const char		*name;
+	/* Iteration over statistic entries */
+	void			*(*stat_start)(void);
+	void			*(*stat_next)(void *prev, int idx);
+	/* Compare two entries for sorting (optional) for stats */
+	int			(*stat_cmp)(void *p1, void *p2);
+	/* Print a stat entry */
+	int			(*stat_show)(struct seq_file *s, void *p);
+	/* Print the headers of your stat entries */
+	int			(*stat_headers)(struct seq_file *s);
+};
+
+/*
  * A specific tracer, represented by methods that operate on a trace array:
  */
 struct tracer {
@@ -361,21 +380,7 @@ struct tracer {
 	struct tracer		*next;
 	int			print_max;
 	struct tracer_flags 	*flags;
-
-	/*
-	 * If you change one of the following on tracing runtime, recall
-	 * init_tracer_stat()
-	 */
-
-	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
-	void			*(*stat_next)(void *prev, int idx);
-	/* Compare two entries for sorting (optional) for stats */
-	int			(*stat_cmp)(void *p1, void *p2);
-	/* Print a stat entry */
-	int			(*stat_show)(struct seq_file *s, void *p);
-	/* Print the headers of your stat entries */
-	int			(*stat_headers)(struct seq_file *s);
+	struct tracer_stat	*stats;
 };
 
 struct trace_seq {
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4785a3b..da5cf3e 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -306,19 +306,6 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
 }
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
-enum {
-	TRACE_BRANCH_OPT_ALL = 0x1
-};
-
-static struct tracer_opt branch_opts[] = {
-	{ TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) },
-	{ }
-};
-
-static struct tracer_flags branch_flags = {
-	.val = 0,
-	.opts = branch_opts
-};
 
 extern unsigned long __start_branch_profile[];
 extern unsigned long __stop_branch_profile[];
@@ -352,28 +339,36 @@ all_branch_stat_next(void *v, int idx)
 	return p;
 }
 
-static int branch_set_flag(u32 old_flags, u32 bit, int set)
-{
-	if (bit == TRACE_BRANCH_OPT_ALL) {
-		if (set) {
-			branch_trace.stat_headers = all_branch_stat_headers;
-			branch_trace.stat_start = all_branch_stat_start;
-			branch_trace.stat_next = all_branch_stat_next;
-			branch_trace.stat_cmp = NULL;
-		} else {
-			branch_trace.stat_headers =
-				annotated_branch_stat_headers;
-			branch_trace.stat_start = annotated_branch_stat_start;
-			branch_trace.stat_next = annotated_branch_stat_next;
-			branch_trace.stat_cmp = annotated_branch_stat_cmp;
-		}
-		init_tracer_stat(&branch_trace);
-	}
-	return 0;
-}
+static struct tracer_stat branch_stats[] = {
+	{.name = "annotated",
+	.stat_start = annotated_branch_stat_start,
+	.stat_next = annotated_branch_stat_next,
+	.stat_cmp = annotated_branch_stat_cmp,
+	.stat_headers = annotated_branch_stat_headers,
+	.stat_show = branch_stat_show},
 
+	{.name = "all",
+	.stat_start = all_branch_stat_start,
+	.stat_next = all_branch_stat_next,
+	.stat_headers = all_branch_stat_headers,
+	.stat_show = branch_stat_show},
+
+	{ }
+};
+#else
+static struct tracer_stat branch_stats[] = {
+	{.name = "annotated",
+	.stat_start = annotated_branch_stat_start,
+	.stat_next = annotated_branch_stat_next,
+	.stat_cmp = annotated_branch_stat_cmp,
+	.stat_headers = annotated_branch_stat_headers,
+	.stat_show = branch_stat_show},
+
+	{ }
+};
 #endif /* CONFIG_PROFILE_ALL_BRANCHES */
 
+
 static struct tracer branch_trace __read_mostly =
 {
 	.name		= "branch",
@@ -383,16 +378,8 @@ static struct tracer branch_trace __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_branch,
 #endif /* CONFIG_FTRACE_SELFTEST */
-#endif /* CONFIG_BRANCH_TRACER */
-	.stat_start	=	annotated_branch_stat_start,
-	.stat_next	= annotated_branch_stat_next,
-	.stat_show	= branch_stat_show,
-	.stat_headers	= annotated_branch_stat_headers,
-	.stat_cmp	= annotated_branch_stat_cmp,
-#ifdef CONFIG_PROFILE_ALL_BRANCHES
-	.flags	= &branch_flags,
-	.set_flag	= branch_set_flag,
 #endif
+	.stats		= branch_stats
 };
 
 __init static int init_branch_trace(void)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f110ce9..1515f9e 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -21,37 +21,87 @@ struct trace_stat_list {
 	void *stat;
 };
 
-static LIST_HEAD(stat_list);
-
-/*
- * This is a copy of the current tracer to avoid racy
- * and dangerous output while the current tracer is
- * switched.
- */
-static struct tracer current_tracer;
+/* A stat session is the stats output in one file */
+struct tracer_stat_session {
+	struct tracer_stat *ts;
+	struct list_head stat_list;
+	struct mutex stat_mutex;
+};
 
-/*
- * Protect both the current tracer and the global
- * stat list.
- */
-static DEFINE_MUTEX(stat_list_mutex);
+/* All of the sessions currently in use. Each stat file embeed one session */
+static struct tracer_stat_session **all_stat_sessions;
+static int nb_sessions;
+static struct dentry *stat_dir, **stat_files;
 
 
-static void reset_stat_list(void)
+static void reset_stat_session(struct tracer_stat_session *session)
 {
 	struct trace_stat_list *node, *next;
 
-	list_for_each_entry_safe(node, next, &stat_list, list)
+	list_for_each_entry_safe(node, next, &session->stat_list, list)
 		kfree(node);
 
-	INIT_LIST_HEAD(&stat_list);
+	INIT_LIST_HEAD(&session->stat_list);
 }
 
-void init_tracer_stat(struct tracer *trace)
+/* Called when a tracer is initialized */
+static int init_all_sessions(int nb, struct tracer_stat *ts)
 {
-	mutex_lock(&stat_list_mutex);
-	current_tracer = *trace;
-	mutex_unlock(&stat_list_mutex);
+	int i, j;
+	struct tracer_stat_session *session;
+
+	nb_sessions = 0;
+
+	if (all_stat_sessions) {
+		for (i = 0; i < nb_sessions; i++) {
+			session = all_stat_sessions[i];
+			reset_stat_session(session);
+			mutex_destroy(&session->stat_mutex);
+			kfree(session);
+		}
+	}
+	all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb,
+				    GFP_KERNEL);
+	if (!all_stat_sessions)
+		return -ENOMEM;
+
+	for (i = 0; i < nb; i++) {
+		session = kmalloc(sizeof(struct tracer_stat_session) * nb,
+				  GFP_KERNEL);
+		if (!session)
+			goto free_sessions;
+
+		INIT_LIST_HEAD(&session->stat_list);
+		mutex_init(&session->stat_mutex);
+		session->ts = &ts[i];
+		all_stat_sessions[i] = session;
+	}
+	nb_sessions = nb;
+	return 0;
+
+free_sessions:
+
+	for (j = 0; j < i; j++)
+		kfree(all_stat_sessions[i]);
+
+	kfree(all_stat_sessions);
+	all_stat_sessions = NULL;
+
+	return -ENOMEM;
+}
+
+static int basic_tracer_stat_checks(struct tracer_stat *ts)
+{
+	int i;
+
+	if (!ts)
+		return 0;
+
+	for (i = 0; ts[i].name; i++) {
+		if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show)
+			return -EBUSY;
+	}
+	return i;
 }
 
 /*
@@ -69,22 +119,19 @@ static int dummy_cmp(void *p1, void *p2)
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
-static int stat_seq_init(void)
+static int stat_seq_init(struct tracer_stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
+	struct tracer_stat *ts = session->ts;
 	void *prev_stat;
 	int ret = 0;
 	int i;
 
-	mutex_lock(&stat_list_mutex);
-	reset_stat_list();
-
-	if (!current_tracer.stat_start || !current_tracer.stat_next ||
-					!current_tracer.stat_show)
-		goto exit;
+	mutex_lock(&session->stat_mutex);
+	reset_stat_session(session);
 
-	if (!current_tracer.stat_cmp)
-		current_tracer.stat_cmp = dummy_cmp;
+	if (!ts->stat_cmp)
+		ts->stat_cmp = dummy_cmp;
 
 	/*
 	 * The first entry. Actually this is the second, but the first
@@ -97,9 +144,10 @@ static int stat_seq_init(void)
 	}
 
 	INIT_LIST_HEAD(&new_entry->list);
-	list_add(&new_entry->list, &stat_list);
-	new_entry->stat = current_tracer.stat_start();
 
+	list_add(&new_entry->list, &session->stat_list);
+
+	new_entry->stat = ts->stat_start();
 	prev_stat = new_entry->stat;
 
 	/*
@@ -114,15 +162,16 @@ static int stat_seq_init(void)
 		}
 
 		INIT_LIST_HEAD(&new_entry->list);
-		new_entry->stat = current_tracer.stat_next(prev_stat, i);
+		new_entry->stat = ts->stat_next(prev_stat, i);
 
 		/* End of insertion */
 		if (!new_entry->stat)
 			break;
 
-		list_for_each_entry(iter_entry, &stat_list, list) {
+		list_for_each_entry(iter_entry, &session->stat_list, list) {
+
 			/* Insertion with a descendent sorting */
-			if (current_tracer.stat_cmp(new_entry->stat,
+			if (ts->stat_cmp(new_entry->stat,
 						iter_entry->stat) > 0) {
 
 				list_add_tail(&new_entry->list,
@@ -131,7 +180,7 @@ static int stat_seq_init(void)
 
 			/* The current smaller value */
 			} else if (list_is_last(&iter_entry->list,
-						&stat_list)) {
+						&session->stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
@@ -140,49 +189,49 @@ static int stat_seq_init(void)
 		prev_stat = new_entry->stat;
 	}
 exit:
-	mutex_unlock(&stat_list_mutex);
+	mutex_unlock(&session->stat_mutex);
 	return ret;
 
 exit_free_list:
-	reset_stat_list();
-	mutex_unlock(&stat_list_mutex);
+	reset_stat_session(session);
+	mutex_unlock(&session->stat_mutex);
 	return ret;
 }
 
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct list_head *l = (struct list_head *)s->private;
+	struct tracer_stat_session *session = s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
-	mutex_lock(&stat_list_mutex);
+	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
-	if (!*pos && current_tracer.stat_headers)
-		current_tracer.stat_headers(s);
+	if (!*pos && session->ts->stat_headers)
+		session->ts->stat_headers(s);
 
-	return seq_list_start(l, *pos);
+	return seq_list_start(&session->stat_list, *pos);
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct list_head *l = (struct list_head *)s->private;
+	struct tracer_stat_session *session = s->private;
 
-	return seq_list_next(p, l, pos);
+	return seq_list_next(p, &session->stat_list, pos);
 }
 
-static void stat_seq_stop(struct seq_file *m, void *p)
+static void stat_seq_stop(struct seq_file *s, void *p)
 {
-	mutex_unlock(&stat_list_mutex);
+	struct tracer_stat_session *session = s->private;
+	mutex_unlock(&session->stat_mutex);
 }
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct trace_stat_list *entry;
-
-	entry =	list_entry(v, struct trace_stat_list, list);
+	struct tracer_stat_session *session = s->private;
+	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
-	return current_tracer.stat_show(s, entry->stat);
+	return session->ts->stat_show(s, l->stat);
 }
 
 static const struct seq_operations trace_stat_seq_ops = {
@@ -192,15 +241,18 @@ static const struct seq_operations trace_stat_seq_ops = {
 	.show = stat_seq_show
 };
 
+/* The session stat is refilled and resorted at each stat file opening */
 static int tracing_stat_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
+	struct tracer_stat_session *session = inode->i_private;
+
 	ret = seq_open(file, &trace_stat_seq_ops);
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		m->private = &stat_list;
-		ret = stat_seq_init();
+		m->private = session;
+		ret = stat_seq_init(session);
 	}
 
 	return ret;
@@ -212,9 +264,12 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-	mutex_lock(&stat_list_mutex);
-	reset_stat_list();
-	mutex_unlock(&stat_list_mutex);
+	struct tracer_stat_session *session = i->i_private;
+
+	mutex_lock(&session->stat_mutex);
+	reset_stat_session(session);
+	mutex_unlock(&session->stat_mutex);
+
 	return 0;
 }
 
@@ -225,17 +280,70 @@ static const struct file_operations tracing_stat_fops = {
 	.release	= tracing_stat_release
 };
 
+
+static void destroy_trace_stat_files(void)
+{
+	int i;
+
+	if (stat_files) {
+		for (i = 0; i < nb_sessions; i++)
+			debugfs_remove(stat_files[i]);
+		kfree(stat_files);
+		stat_files = NULL;
+	}
+}
+
+static void init_trace_stat_files(void)
+{
+	int i;
+
+	if (!stat_dir || !nb_sessions)
+		return;
+
+	stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL);
+
+	if (!stat_files) {
+		pr_warning("trace stat: not enough memory\n");
+		return;
+	}
+
+	for (i = 0; i < nb_sessions; i++) {
+		struct tracer_stat_session *session = all_stat_sessions[i];
+		stat_files[i] = debugfs_create_file(session->ts->name, 0644,
+						stat_dir,
+						session, &tracing_stat_fops);
+		if (!stat_files[i])
+			pr_warning("cannot create %s entry\n",
+				   session->ts->name);
+	}
+}
+
+void init_tracer_stat(struct tracer *trace)
+{
+	int nb = basic_tracer_stat_checks(trace->stats);
+
+	destroy_trace_stat_files();
+
+	if (nb < 0) {
+		pr_warning("stat tracing: missing stat callback on %s\n",
+			   trace->name);
+		return;
+	}
+	if (!nb)
+		return;
+
+	init_all_sessions(nb, trace->stats);
+	init_trace_stat_files();
+}
+
 static int __init tracing_stat_init(void)
 {
 	struct dentry *d_tracing;
-	struct dentry *entry;
 
 	d_tracing = tracing_init_dentry();
 
-	entry = debugfs_create_file("trace_stat", 0444, d_tracing,
-					NULL,
-				    &tracing_stat_fops);
-	if (!entry)
+	stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+	if (!stat_dir)
 		pr_warning("Could not create debugfs "
 			   "'trace_stat' entry\n");
 	return 0;
-- 
cgit v1.1


From fe6f90e57fd31af8daca534ea01db2e5666c15da Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sat, 3 Jan 2009 21:23:51 +0200
Subject: trace: mmiotrace to the tracer menu in Kconfig

Impact: cosmetic change in Kconfig menu layout

This patch was originally suggested by Peter Zijlstra, but seems it
was forgotten.

CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable
directly under the Kernel hacking / debugging menu in the kernel
configuration system. They were present only for x86 and x86_64.

Other tracers that use the ftrace tracing framework are in their own
sub-menu. This patch moves the mmiotrace configuration options there.
Since the Kconfig file, where the tracer menu is, is not architecture
specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by
x86/x86_64. CONFIG_MMIOTRACE now depends on it.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1c0b750..9442392 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -323,4 +323,27 @@ config FTRACE_STARTUP_TEST
 	  functioning properly. It will do tests on all the configured
 	  tracers of ftrace.
 
+config MMIOTRACE
+	bool "Memory mapped IO tracing"
+	depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
+	select TRACING
+	help
+	  Mmiotrace traces Memory Mapped I/O access and is meant for
+	  debugging and reverse engineering. It is called from the ioremap
+	  implementation and works via page faults. Tracing is disabled by
+	  default and can be enabled at run-time.
+
+	  See Documentation/tracers/mmiotrace.txt.
+	  If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+	tristate "Test module for mmiotrace"
+	depends on MMIOTRACE && m
+	help
+	  This is a dumb module for testing mmiotrace. It is very dangerous
+	  as it will write garbage to IO memory starting at a given address.
+	  However, it should be safe to use on e.g. unused portion of VRAM.
+
+	  Say N, unless you absolutely know what you are doing.
+
 endmenu
-- 
cgit v1.1


From 173ed24ee2d64f5de28654eb456ec1ee18a142e5 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 6 Jan 2009 13:57:11 +0200
Subject: mmiotrace: count events lost due to not recording

Impact: enhances lost events counting in mmiotrace

The tracing framework, or the ring buffer facility it uses, has a switch
to stop recording data. When recording is off, the trace events will be
lost. The framework does not count these, so mmiotrace has to count them
itself.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fcec59f..621c8c3 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <asm/atomic.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -20,6 +21,7 @@ struct header_iter {
 static struct trace_array *mmio_trace_array;
 static bool overrun_detected;
 static unsigned long prev_overruns;
+static atomic_t dropped_count;
 
 static void mmio_reset_data(struct trace_array *tr)
 {
@@ -122,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter)
 
 static unsigned long count_overruns(struct trace_iterator *iter)
 {
-	unsigned long cnt = 0;
+	unsigned long cnt = atomic_xchg(&dropped_count, 0);
 	unsigned long over = ring_buffer_overruns(iter->tr->buffer);
 
 	if (over > prev_overruns)
-		cnt = over - prev_overruns;
+		cnt += over - prev_overruns;
 	prev_overruns = over;
 	return cnt;
 }
@@ -308,8 +310,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 
 	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					   &irq_flags);
-	if (!event)
+	if (!event) {
+		atomic_inc(&dropped_count);
 		return;
+	}
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_RW;
@@ -336,8 +340,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 
 	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					   &irq_flags);
-	if (!event)
+	if (!event) {
+		atomic_inc(&dropped_count);
 		return;
+	}
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_MAP;
-- 
cgit v1.1


From 25aac9dc7c8c73798c1be8aa36141f980d32579e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 9 Jan 2009 11:29:40 +0800
Subject: ftrace, ia64: explictly ignore a file in recordmcount.pl

In IA64, a function pointer isn't a 'unsigned long' but a
'struct {unsigned long ip, unsigned long gp}'. MCOUNT_ADDR is determined
at link time not compile time, so explictly ignore kernel/trace/ftrace.o
in recordmcount.pl.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e54a6c..76bb884 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -263,14 +263,6 @@ static void ftrace_update_pid_func(void)
 # error Dynamic ftrace depends on MCOUNT_RECORD
 #endif
 
-/*
- * Since MCOUNT_ADDR may point to mcount itself, we do not want
- * to get it confused by reading a reference in the code as we
- * are parsing on objcopy output of text. Use a variable for
- * it instead.
- */
-static unsigned long mcount_addr = MCOUNT_ADDR;
-
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -575,7 +567,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 
 	ip = rec->ip;
 
-	ret = ftrace_make_nop(mod, rec, mcount_addr);
+	ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
 	if (ret) {
 		ftrace_bug(ret, ip);
 		rec->flags |= FTRACE_FL_FAILED;
-- 
cgit v1.1


From f00012074b1a1a67d9c8603617bbbab267347ca6 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 9 Jan 2009 11:29:42 +0800
Subject: ftrace, ia64: Add macro for ftrace_caller

Define FTRACE_ADDR. In IA64, a function pointer isn't a 'unsigned long' but a
'struct {unsigned long ip, unsigned long gp}'.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 76bb884..9f53610 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -455,7 +455,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	unsigned long ip, fl;
 	unsigned long ftrace_addr;
 
-	ftrace_addr = (unsigned long)ftrace_caller;
+	ftrace_addr = (unsigned long)FTRACE_ADDR;
 
 	ip = rec->ip;
 
-- 
cgit v1.1


From 002bb86d8d42f18937aef396c3ecd65c7e02e21a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 10 Jan 2009 11:34:13 -0800
Subject: tracing/ftrace: separate events tracing and stats tracing engine

Impact: tracing's Api change

Currently, the stat tracing depends on the events tracing.
When you switch to a new tracer, the stats files of the previous tracer
will disappear. But it's more scalable to separate those two engines.
This way, we can keep the stat files of one or several tracers when we
want, without bothering of multiple tracer stat files or tracer switching.

To build/destroys its stats files, a tracer just have to call
register_stat_tracer/unregister_stat_tracer everytimes it wants to.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        |   2 -
 kernel/trace/trace.h        |  20 -----
 kernel/trace/trace_branch.c | 108 ++++++++++++++-----------
 kernel/trace/trace_stat.c   | 191 +++++++++++++++++++-------------------------
 kernel/trace/trace_stat.h   |  31 +++++++
 5 files changed, 172 insertions(+), 180 deletions(-)
 create mode 100644 kernel/trace/trace_stat.h

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0418fc3..40217fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2353,7 +2353,6 @@ static int tracing_set_tracer(char *buf)
 		if (ret)
 			goto out;
 	}
-	init_tracer_stat(t);
 
 	trace_branch_enable(tr);
  out:
@@ -3218,7 +3217,6 @@ __init static int tracer_alloc_buffers(void)
 #else
 	current_trace = &nop_trace;
 #endif
-	init_tracer_stat(current_trace);
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b3f9ad1..79c8721 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,24 +334,6 @@ struct tracer_flags {
 /* Makes more easy to define a tracer opt */
 #define TRACER_OPT(s, b)	.name = #s, .bit = b
 
-/*
- * If you want to provide a stat file (one-shot statistics), fill
- * an iterator with stat_start/stat_next and a stat_show callbacks.
- * The others callbacks are optional.
- */
-struct tracer_stat {
-	/* The name of your stat file */
-	const char		*name;
-	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
-	void			*(*stat_next)(void *prev, int idx);
-	/* Compare two entries for sorting (optional) for stats */
-	int			(*stat_cmp)(void *p1, void *p2);
-	/* Print a stat entry */
-	int			(*stat_show)(struct seq_file *s, void *p);
-	/* Print the headers of your stat entries */
-	int			(*stat_headers)(struct seq_file *s);
-};
 
 /*
  * A specific tracer, represented by methods that operate on a trace array:
@@ -466,8 +448,6 @@ void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
-void init_tracer_stat(struct tracer *trace);
-
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_max_latency;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index da5cf3e..ca017e0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -16,12 +16,12 @@
 #include <asm/local.h>
 
 #include "trace.h"
+#include "trace_stat.h"
 #include "trace_output.h"
 
-static struct tracer branch_trace;
-
 #ifdef CONFIG_BRANCH_TRACER
 
+static struct tracer branch_trace;
 static int branch_tracing_enabled __read_mostly;
 static DEFINE_MUTEX(branch_tracing_mutex);
 
@@ -191,6 +191,30 @@ static struct trace_event trace_branch_event = {
 	.binary		= trace_nop_print,
 };
 
+static struct tracer branch_trace __read_mostly =
+{
+	.name		= "branch",
+	.init		= branch_trace_init,
+	.reset		= branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_branch,
+#endif /* CONFIG_FTRACE_SELFTEST */
+};
+
+__init static int init_branch_tracer(void)
+{
+	int ret;
+
+	ret = register_ftrace_event(&trace_branch_event);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "branch events\n");
+		return 1;
+	}
+	return register_tracer(&branch_trace);
+}
+device_initcall(init_branch_tracer);
+
 #else
 static inline
 void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -305,6 +329,29 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
 		return 0;
 }
 
+static struct tracer_stat annotated_branch_stats = {
+	.name = "branch_annotated",
+	.stat_start = annotated_branch_stat_start,
+	.stat_next = annotated_branch_stat_next,
+	.stat_cmp = annotated_branch_stat_cmp,
+	.stat_headers = annotated_branch_stat_headers,
+	.stat_show = branch_stat_show
+};
+
+__init static int init_annotated_branch_stats(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&annotated_branch_stats);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "annotated branches stats\n");
+		return 1;
+	}
+	return 0;
+}
+fs_initcall(init_annotated_branch_stats);
+
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
 
 extern unsigned long __start_branch_profile[];
@@ -339,60 +386,25 @@ all_branch_stat_next(void *v, int idx)
 	return p;
 }
 
-static struct tracer_stat branch_stats[] = {
-	{.name = "annotated",
-	.stat_start = annotated_branch_stat_start,
-	.stat_next = annotated_branch_stat_next,
-	.stat_cmp = annotated_branch_stat_cmp,
-	.stat_headers = annotated_branch_stat_headers,
-	.stat_show = branch_stat_show},
-
-	{.name = "all",
+static struct tracer_stat all_branch_stats = {
+	.name = "branch_all",
 	.stat_start = all_branch_stat_start,
 	.stat_next = all_branch_stat_next,
 	.stat_headers = all_branch_stat_headers,
-	.stat_show = branch_stat_show},
-
-	{ }
-};
-#else
-static struct tracer_stat branch_stats[] = {
-	{.name = "annotated",
-	.stat_start = annotated_branch_stat_start,
-	.stat_next = annotated_branch_stat_next,
-	.stat_cmp = annotated_branch_stat_cmp,
-	.stat_headers = annotated_branch_stat_headers,
-	.stat_show = branch_stat_show},
-
-	{ }
+	.stat_show = branch_stat_show
 };
-#endif /* CONFIG_PROFILE_ALL_BRANCHES */
 
-
-static struct tracer branch_trace __read_mostly =
+__init static int all_annotated_branch_stats(void)
 {
-	.name		= "branch",
-#ifdef CONFIG_BRANCH_TRACER
-	.init		= branch_trace_init,
-	.reset		= branch_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_branch,
-#endif /* CONFIG_FTRACE_SELFTEST */
-#endif
-	.stats		= branch_stats
-};
-
-__init static int init_branch_trace(void)
-{
-#ifdef CONFIG_BRANCH_TRACER
 	int ret;
-	ret = register_ftrace_event(&trace_branch_event);
+
+	ret = register_stat_tracer(&all_branch_stats);
 	if (!ret) {
-		printk(KERN_WARNING "Warning: could not register branch events\n");
+		printk(KERN_WARNING "Warning: could not register "
+				    "all branches stats\n");
 		return 1;
 	}
-#endif
-
-	return register_tracer(&branch_trace);
+	return 0;
 }
-device_initcall(init_branch_trace);
+fs_initcall(all_annotated_branch_stats);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 1515f9e..cb29282 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,28 +10,32 @@
 
 
 #include <linux/list.h>
-#include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include "trace_stat.h"
 #include "trace.h"
 
 
 /* List of stat entries from a tracer */
 struct trace_stat_list {
-	struct list_head list;
-	void *stat;
+	struct list_head 	list;
+	void 			*stat;
 };
 
 /* A stat session is the stats output in one file */
 struct tracer_stat_session {
-	struct tracer_stat *ts;
-	struct list_head stat_list;
-	struct mutex stat_mutex;
+	struct list_head	session_list;
+	struct tracer_stat 	*ts;
+	struct list_head 	stat_list;
+	struct mutex 		stat_mutex;
+	struct dentry		*file;
 };
 
 /* All of the sessions currently in use. Each stat file embeed one session */
-static struct tracer_stat_session **all_stat_sessions;
-static int nb_sessions;
-static struct dentry *stat_dir, **stat_files;
+static LIST_HEAD(all_stat_sessions);
+static DEFINE_MUTEX(all_stat_sessions_mutex);
+
+/* The root directory for all stat files */
+static struct dentry *stat_dir;
 
 
 static void reset_stat_session(struct tracer_stat_session *session)
@@ -44,66 +48,77 @@ static void reset_stat_session(struct tracer_stat_session *session)
 	INIT_LIST_HEAD(&session->stat_list);
 }
 
-/* Called when a tracer is initialized */
-static int init_all_sessions(int nb, struct tracer_stat *ts)
+static void destroy_session(struct tracer_stat_session *session)
 {
-	int i, j;
-	struct tracer_stat_session *session;
+	debugfs_remove(session->file);
+	reset_stat_session(session);
+	mutex_destroy(&session->stat_mutex);
+	kfree(session);
+}
 
-	nb_sessions = 0;
 
-	if (all_stat_sessions) {
-		for (i = 0; i < nb_sessions; i++) {
-			session = all_stat_sessions[i];
-			reset_stat_session(session);
-			mutex_destroy(&session->stat_mutex);
-			kfree(session);
-		}
-	}
-	all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb,
-				    GFP_KERNEL);
-	if (!all_stat_sessions)
-		return -ENOMEM;
+static int init_stat_file(struct tracer_stat_session *session);
 
-	for (i = 0; i < nb; i++) {
-		session = kmalloc(sizeof(struct tracer_stat_session) * nb,
-				  GFP_KERNEL);
-		if (!session)
-			goto free_sessions;
+int register_stat_tracer(struct tracer_stat *trace)
+{
+	struct tracer_stat_session *session, *node, *tmp;
+	int ret;
+
+	if (!trace)
+		return -EINVAL;
+
+	if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+		return -EINVAL;
 
-		INIT_LIST_HEAD(&session->stat_list);
-		mutex_init(&session->stat_mutex);
-		session->ts = &ts[i];
-		all_stat_sessions[i] = session;
+	/* Already registered? */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace)
+			return -EINVAL;
 	}
-	nb_sessions = nb;
-	return 0;
+	mutex_unlock(&all_stat_sessions_mutex);
+
+	/* Init the session */
+	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	if (!session)
+		return -ENOMEM;
 
-free_sessions:
+	session->ts = trace;
+	INIT_LIST_HEAD(&session->session_list);
+	INIT_LIST_HEAD(&session->stat_list);
+	mutex_init(&session->stat_mutex);
+	session->file = NULL;
 
-	for (j = 0; j < i; j++)
-		kfree(all_stat_sessions[i]);
+	ret = init_stat_file(session);
+	if (ret) {
+		destroy_session(session);
+		return ret;
+	}
 
-	kfree(all_stat_sessions);
-	all_stat_sessions = NULL;
+	/* Register */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_add_tail(&session->session_list, &all_stat_sessions);
+	mutex_unlock(&all_stat_sessions_mutex);
 
-	return -ENOMEM;
+	return 0;
 }
 
-static int basic_tracer_stat_checks(struct tracer_stat *ts)
+void unregister_stat_tracer(struct tracer_stat *trace)
 {
-	int i;
+	struct tracer_stat_session *node, *tmp;
 
-	if (!ts)
-		return 0;
-
-	for (i = 0; ts[i].name; i++) {
-		if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show)
-			return -EBUSY;
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace) {
+			list_del(&node->session_list);
+			destroy_session(node);
+			break;
+		}
 	}
-	return i;
+	mutex_unlock(&all_stat_sessions_mutex);
 }
 
+
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -280,63 +295,7 @@ static const struct file_operations tracing_stat_fops = {
 	.release	= tracing_stat_release
 };
 
-
-static void destroy_trace_stat_files(void)
-{
-	int i;
-
-	if (stat_files) {
-		for (i = 0; i < nb_sessions; i++)
-			debugfs_remove(stat_files[i]);
-		kfree(stat_files);
-		stat_files = NULL;
-	}
-}
-
-static void init_trace_stat_files(void)
-{
-	int i;
-
-	if (!stat_dir || !nb_sessions)
-		return;
-
-	stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL);
-
-	if (!stat_files) {
-		pr_warning("trace stat: not enough memory\n");
-		return;
-	}
-
-	for (i = 0; i < nb_sessions; i++) {
-		struct tracer_stat_session *session = all_stat_sessions[i];
-		stat_files[i] = debugfs_create_file(session->ts->name, 0644,
-						stat_dir,
-						session, &tracing_stat_fops);
-		if (!stat_files[i])
-			pr_warning("cannot create %s entry\n",
-				   session->ts->name);
-	}
-}
-
-void init_tracer_stat(struct tracer *trace)
-{
-	int nb = basic_tracer_stat_checks(trace->stats);
-
-	destroy_trace_stat_files();
-
-	if (nb < 0) {
-		pr_warning("stat tracing: missing stat callback on %s\n",
-			   trace->name);
-		return;
-	}
-	if (!nb)
-		return;
-
-	init_all_sessions(nb, trace->stats);
-	init_trace_stat_files();
-}
-
-static int __init tracing_stat_init(void)
+static int tracing_stat_init(void)
 {
 	struct dentry *d_tracing;
 
@@ -348,4 +307,16 @@ static int __init tracing_stat_init(void)
 			   "'trace_stat' entry\n");
 	return 0;
 }
-fs_initcall(tracing_stat_init);
+
+static int init_stat_file(struct tracer_stat_session *session)
+{
+	if (!stat_dir && tracing_stat_init())
+		return -ENODEV;
+
+	session->file = debugfs_create_file(session->ts->name, 0644,
+					    stat_dir,
+					    session, &tracing_stat_fops);
+	if (!session->file)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 0000000..202274c
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,31 @@
+#ifndef __TRACE_STAT_H
+#define __TRACE_STAT_H
+
+#include <linux/seq_file.h>
+
+/*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+	/* The name of your stat file */
+	const char		*name;
+	/* Iteration over statistic entries */
+	void			*(*stat_start)(void);
+	void			*(*stat_next)(void *prev, int idx);
+	/* Compare two entries for stats sorting */
+	int			(*stat_cmp)(void *p1, void *p2);
+	/* Print a stat entry */
+	int			(*stat_show)(struct seq_file *s, void *p);
+	/* Print the headers of your stat entries */
+	int			(*stat_headers)(struct seq_file *s);
+};
+
+/*
+ * Destroy or create a stat file
+ */
+extern int register_stat_tracer(struct tracer_stat *trace);
+extern void unregister_stat_tracer(struct tracer_stat *trace);
+
+#endif /* __TRACE_STAT_H */
-- 
cgit v1.1


From e1d8aa9f1dd655a3534b22fcfbecb70cdb125766 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 12 Jan 2009 23:15:46 +0100
Subject: tracing: add a new workqueue tracer

Impact: new tracer

The workqueue tracer provides some statistical informations
about each cpu workqueue thread such as the number of the
works inserted and executed since their creation. It can help
to evaluate the amount of work each of them have to perform.
For example it can help a developer to decide whether he should
choose a per cpu workqueue instead of a singlethreaded one.

It only traces statistical informations for now but it will probably later
provide event tracing too.

Such a tracer could help too, and be improved, to help rt priority sorted
workqueue development.

To have a snapshot of the workqueues state at any time, just do

cat /debugfs/tracing/trace_stat/workqueues

Ie:

  1    125        125       reiserfs/1
  1      0          0       scsi_tgtd/1
  1      0          0       aio/1
  1      0          0       ata/1
  1    114        114       kblockd/1
  1      0          0       kintegrityd/1
  1   2147       2147       events/1

  0      0          0       kpsmoused
  0    105        105       reiserfs/0
  0      0          0       scsi_tgtd/0
  0      0          0       aio/0
  0      0          0       ata_aux
  0      0          0       ata/0
  0      0          0       cqueue
  0      0          0       kacpi_notify
  0      0          0       kacpid
  0    149        149       kblockd/0
  0      0          0       kintegrityd/0
  0   1000       1000       khelper
  0   2270       2270       events/0

Changes in V2:

_ Drop the static array based on NR_CPU and dynamically allocate the stat array
  with num_possible_cpus() and other cpu mask facilities....
_ Trace workqueue insertion at a bit lower level (insert_work instead of queue_work) to handle
  even the workqueue barriers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig           |  11 ++
 kernel/trace/Makefile          |   1 +
 kernel/trace/trace_workqueue.c | 287 +++++++++++++++++++++++++++++++++++++++++
 kernel/workqueue.c             |  16 ++-
 4 files changed, 314 insertions(+), 1 deletion(-)
 create mode 100644 kernel/trace/trace_workqueue.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9442392..dde1d46 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -284,6 +284,17 @@ config KMEMTRACE
 
 	  If unsure, say N.
 
+config WORKQUEUE_TRACER
+	bool "Trace workqueues"
+	select TRACING
+	help
+	  The workqueue tracer provides some statistical informations
+          about each cpu workqueue thread such as the number of the
+          works inserted and executed since their creation. It can help
+          to evaluate the amount of work each of them have to perform.
+          For example it can help a developer to decide whether he should
+          choose a per cpu workqueue instead of a singlethreaded one.
+
 
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 05c9182..f76d48f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
+obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 0000000..f8118d3
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,287 @@
+/*
+ * Workqueue statistical tracer.
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+
+#include <trace/workqueue.h>
+#include <linux/list.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/* A cpu workqueue thread */
+struct cpu_workqueue_stats {
+	struct list_head            list;
+/* Useful to know if we print the cpu headers */
+	bool		            first_entry;
+	int		            cpu;
+	pid_t 			    pid;
+/* Can be inserted from interrupt or user context, need to be atomic */
+	atomic_t 	            inserted;
+/*
+ *  Don't need to be atomic, works are serialized in a single workqueue thread
+ *  on a single CPU.
+ */
+	unsigned int 	 	    executed;
+};
+
+/* List of workqueue threads on one cpu */
+struct workqueue_global_stats {
+	struct list_head	list;
+	spinlock_t		lock;
+};
+
+/* Don't need a global lock because allocated before the workqueues, and
+ * never freed.
+ */
+static struct workqueue_global_stats *all_workqueue_stat;
+
+/* Insertion of a work */
+static void
+probe_workqueue_insertion(struct task_struct *wq_thread,
+			  struct work_struct *work)
+{
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			atomic_inc(&node->inserted);
+			goto found;
+		}
+	}
+	pr_debug("trace_workqueue: entry not found\n");
+found:
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+}
+
+/* Execution of a work */
+static void
+probe_workqueue_execution(struct task_struct *wq_thread,
+			  struct work_struct *work)
+{
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			node->executed++;
+			goto found;
+		}
+	}
+	pr_debug("trace_workqueue: entry not found\n");
+found:
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+}
+
+/* Creation of a cpu workqueue thread */
+static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+{
+	struct cpu_workqueue_stats *cws;
+	unsigned long flags;
+
+	WARN_ON(cpu < 0 || cpu >= num_possible_cpus());
+
+	/* Workqueues are sometimes created in atomic context */
+	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
+	if (!cws) {
+		pr_warning("trace_workqueue: not enough memory\n");
+		return;
+	}
+	tracing_record_cmdline(wq_thread);
+
+	INIT_LIST_HEAD(&cws->list);
+	cws->cpu = cpu;
+
+	cws->pid = wq_thread->pid;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	if (list_empty(&all_workqueue_stat[cpu].list))
+		cws->first_entry = true;
+	list_add_tail(&cws->list, &all_workqueue_stat[cpu].list);
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+}
+
+/* Destruction of a cpu workqueue thread */
+static void probe_workqueue_destruction(struct task_struct *wq_thread)
+{
+	/* Workqueue only execute on one cpu */
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			list_del(&node->list);
+			kfree(node);
+			goto found;
+		}
+	}
+
+	pr_debug("trace_workqueue: don't find workqueue to destroy\n");
+found:
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+}
+
+static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
+{
+	unsigned long flags;
+	struct cpu_workqueue_stats *ret = NULL;
+
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+
+	if (!list_empty(&all_workqueue_stat[cpu].list))
+		ret = list_entry(all_workqueue_stat[cpu].list.next,
+				 struct cpu_workqueue_stats, list);
+
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+	return ret;
+}
+
+static void *workqueue_stat_start(void)
+{
+	int cpu;
+	void *ret = NULL;
+
+	for_each_possible_cpu(cpu) {
+		ret = workqueue_stat_start_cpu(cpu);
+		if (ret)
+			return ret;
+	}
+	return NULL;
+}
+
+static void *workqueue_stat_next(void *prev, int idx)
+{
+	struct cpu_workqueue_stats *prev_cws = prev;
+	int cpu = prev_cws->cpu;
+	unsigned long flags;
+	void *ret = NULL;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) {
+		spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+		for (++cpu ; cpu < num_possible_cpus(); cpu++) {
+			ret = workqueue_stat_start_cpu(cpu);
+			if (ret)
+				return ret;
+		}
+		return NULL;
+	}
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+	return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
+			  list);
+}
+
+static int workqueue_stat_show(struct seq_file *s, void *p)
+{
+	struct cpu_workqueue_stats *cws = p;
+	unsigned long flags;
+	int cpu = cws->cpu;
+
+	seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
+		   atomic_read(&cws->inserted),
+		   cws->executed,
+		   trace_find_cmdline(cws->pid));
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	if (&cws->list == all_workqueue_stat[cpu].list.next)
+		seq_printf(s, "\n");
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+	return 0;
+}
+
+static int workqueue_stat_headers(struct seq_file *s)
+{
+	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
+	seq_printf(s, "# |      |         |          |\n\n");
+	return 0;
+}
+
+struct tracer_stat workqueue_stats __read_mostly = {
+	.name = "workqueues",
+	.stat_start = workqueue_stat_start,
+	.stat_next = workqueue_stat_next,
+	.stat_show = workqueue_stat_show,
+	.stat_headers = workqueue_stat_headers
+};
+
+
+int __init stat_workqueue_init(void)
+{
+	if (register_stat_tracer(&workqueue_stats)) {
+		pr_warning("Unable to register workqueue stat tracer\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(stat_workqueue_init);
+
+/*
+ * Workqueues are created very early, just after pre-smp initcalls.
+ * So we must register our tracepoints at this stage.
+ */
+int __init trace_workqueue_early_init(void)
+{
+	int ret, cpu;
+
+	ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+	if (ret)
+		goto out;
+
+	ret = register_trace_workqueue_execution(probe_workqueue_execution);
+	if (ret)
+		goto no_insertion;
+
+	ret = register_trace_workqueue_creation(probe_workqueue_creation);
+	if (ret)
+		goto no_execution;
+
+	ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+	if (ret)
+		goto no_creation;
+
+	all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats)
+				     * num_possible_cpus(), GFP_KERNEL);
+
+	if (!all_workqueue_stat) {
+		pr_warning("trace_workqueue: not enough memory\n");
+		goto no_creation;
+	}
+
+	for_each_possible_cpu(cpu) {
+		spin_lock_init(&all_workqueue_stat[cpu].lock);
+		INIT_LIST_HEAD(&all_workqueue_stat[cpu].list);
+	}
+
+	return 0;
+
+no_creation:
+	unregister_trace_workqueue_creation(probe_workqueue_creation);
+no_execution:
+	unregister_trace_workqueue_execution(probe_workqueue_execution);
+no_insertion:
+	unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+out:
+	pr_warning("trace_workqueue: unable to trace workqueues\n");
+
+	return 1;
+}
+early_initcall(trace_workqueue_early_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f44583..1fc2bc2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,7 @@
 #include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
+#include <trace/workqueue.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
+DEFINE_TRACE(workqueue_insertion);
+
 static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head)
 {
+	trace_workqueue_insertion(cwq->thread, work);
+
 	set_wq_data(work, cwq);
 	/*
 	 * Ensure that we get the right work->data if we see the
@@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+DEFINE_TRACE(workqueue_execution);
+
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
@@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		 */
 		struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
-
+		trace_workqueue_execution(cwq->thread, work);
 		cwq->current_work = work;
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irq(&cwq->lock);
@@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
 	return cwq;
 }
 
+DEFINE_TRACE(workqueue_creation);
+
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 	cwq->thread = p;
 
+	trace_workqueue_creation(cwq->thread, cpu);
+
 	return 0;
 }
 
@@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
+DEFINE_TRACE(workqueue_destruction);
+
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
@@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 	 * checks list_empty(), and a "normal" queue_work() can't use
 	 * a dead CPU.
 	 */
+	trace_workqueue_destruction(cwq->thread);
 	kthread_stop(cwq->thread);
 	cwq->thread = NULL;
 }
-- 
cgit v1.1


From 32632920a788fb13da35b131b77cc4324c38c1c5 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
Date: Mon, 12 Jan 2009 23:35:50 +0100
Subject: ftrace, trivial: fix typo "resgister" -> "register"

Signed-off-by: Uwe Kleine-Koenig <ukleinek@strlen.de>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9f53610..8c1c9c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1894,7 +1894,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 }
 
 /**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
  * @ops - ops structure that holds the function to unregister
  *
  * Unregister a function that was added to be called by ftrace profiling.
-- 
cgit v1.1


From 93d81d1aca26e64a75d06a85f7e128b5f49053e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 14 Jan 2009 15:32:51 +0100
Subject: mutex: small cleanup

Remove a local variable by combining an assingment and test in one.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b..357c6d2 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -129,7 +129,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 {
 	struct task_struct *task = current;
 	struct mutex_waiter waiter;
-	unsigned int old_val;
 	unsigned long flags;
 
 	spin_lock_mutex(&lock->wait_lock, flags);
@@ -142,8 +141,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	list_add_tail(&waiter.list, &lock->wait_list);
 	waiter.task = task;
 
-	old_val = atomic_xchg(&lock->count, -1);
-	if (old_val == 1)
+	if (atomic_xchg(&lock->count, -1) == 1)
 		goto done;
 
 	lock_contended(&lock->dep_map, ip);
@@ -158,8 +156,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * that when we release the lock, we properly wake up the
 		 * other waiters:
 		 */
-		old_val = atomic_xchg(&lock->count, -1);
-		if (old_val == 1)
+		if (atomic_xchg(&lock->count, -1) == 1)
 			break;
 
 		/*
-- 
cgit v1.1


From 41719b03091911028116155deddc5eedf8c45e37 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 14 Jan 2009 15:36:26 +0100
Subject: mutex: preemption fixes

The problem is that dropping the spinlock right before schedule is a voluntary
preemption point and can cause a schedule, right after which we schedule again.

Fix this inefficiency by keeping preemption disabled until we schedule, do this
by explicity disabling preemption and providing a schedule() variant that
assumes preemption is already disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c |  5 ++++-
 kernel/sched.c | 10 +++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 357c6d2..524ffc3 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,6 +131,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct mutex_waiter waiter;
 	unsigned long flags;
 
+	preempt_disable();
 	spin_lock_mutex(&lock->wait_lock, flags);
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -170,13 +171,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			spin_unlock_mutex(&lock->wait_lock, flags);
 
 			debug_mutex_free_waiter(&waiter);
+			preempt_enable();
 			return -EINTR;
 		}
 		__set_task_state(task, state);
 
 		/* didnt get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		schedule();
+		__schedule();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
@@ -193,6 +195,7 @@ done:
 	spin_unlock_mutex(&lock->wait_lock, flags);
 
 	debug_mutex_free_waiter(&waiter);
+	preempt_enable();
 
 	return 0;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13..b001c13 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4538,15 +4538,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
-need_resched:
-	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -4603,7 +4601,13 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
+}
 
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+	preempt_disable();
+	__schedule();
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
-- 
cgit v1.1


From 0d66bf6d3514b35eb6897629059443132992dbd7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 Jan 2009 14:01:47 +0100
Subject: mutex: implement adaptive spinning

Change mutex contention behaviour such that it will sometimes busy wait on
acquisition - moving its behaviour closer to that of spinlocks.

This concept got ported to mainline from the -rt tree, where it was originally
implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins.

Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50)
gave a 345% boost for VFS scalability on my testbox:

 # ./test-mutex-shm V 16 10 | grep "^avg ops"
 avg ops/sec:               296604

 # ./test-mutex-shm V 16 10 | grep "^avg ops"
 avg ops/sec:               85870

The key criteria for the busy wait is that the lock owner has to be running on
a (different) cpu. The idea is that as long as the owner is running, there is a
fair chance it'll release the lock soon, and thus we'll be better off spinning
instead of blocking/scheduling.

Since regular mutexes (as opposed to rtmutexes) do not atomically track the
owner, we add the owner in a non-atomic fashion and deal with the races in
the slowpath.

Furthermore, to ease the testing of the performance impact of this new code,
there is means to disable this behaviour runtime (without having to reboot
the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y),
by issuing the following command:

 # echo NO_OWNER_SPIN > /debug/sched_features

This command re-enables spinning again (this is also the default):

 # echo OWNER_SPIN > /debug/sched_features

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex-debug.c    |   9 +---
 kernel/mutex-debug.h    |  18 ++++----
 kernel/mutex.c          | 115 +++++++++++++++++++++++++++++++++++++++++++-----
 kernel/mutex.h          |  22 ++++++++-
 kernel/sched.c          |  61 +++++++++++++++++++++++++
 kernel/sched_features.h |   1 +
 6 files changed, 197 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160..50d022e 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
 /*
  * Must be called with lock->wait_lock held.
  */
-void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
-{
-	lock->owner = new_owner;
-}
-
 void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
 {
 	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 	/* Mark the current thread as blocked on the lock: */
 	ti->task->blocked_on = waiter;
-	waiter->lock = lock;
 }
 
 void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
-	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+	mutex_clear_owner(lock);
 }
 
 void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lockdep_init_map(&lock->dep_map, name, key, 0);
 #endif
-	lock->owner = NULL;
 	lock->magic = lock;
 }
 
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdf..6b2d735 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
 /*
  * This must be called with lock->wait_lock held.
  */
-extern void
-debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
-
-static inline void debug_mutex_clear_owner(struct mutex *lock)
-{
-	lock->owner = NULL;
-}
-
 extern void debug_mutex_lock_common(struct mutex *lock,
 				    struct mutex_waiter *waiter);
 extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
 extern void debug_mutex_init(struct mutex *lock, const char *name,
 			     struct lock_class_key *key);
 
+static inline void mutex_set_owner(struct mutex *lock)
+{
+	lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+	lock->owner = NULL;
+}
+
 #define spin_lock_mutex(lock, flags)			\
 	do {						\
 		struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 524ffc3..ff42e97 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
  * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
  * David Howells for suggestions and improvements.
  *
+ *  - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ *    from the -rt tree, where it was originally implemented for rtmutexes
+ *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ *    and Sven Dietrich.
+ *
  * Also see Documentation/mutex-design.txt.
  */
 #include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 	atomic_set(&lock->count, 1);
 	spin_lock_init(&lock->wait_lock);
 	INIT_LIST_HEAD(&lock->wait_list);
+	mutex_clear_owner(lock);
 
 	debug_mutex_init(lock, name, key);
 }
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
 	 * 'unlocked' into 'locked' state.
 	 */
 	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+	mutex_set_owner(lock);
 }
 
 EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
 	 * The unlocking fastpath is the 0->1 transition from 'locked'
 	 * into 'unlocked' state:
 	 */
+#ifndef CONFIG_DEBUG_MUTEXES
+	/*
+	 * When debugging is enabled we must not clear the owner before time,
+	 * the slow path will always be taken, and that clears the owner field
+	 * after verifying that it was indeed current.
+	 */
+	mutex_clear_owner(lock);
+#endif
 	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
 }
 
@@ -132,10 +147,71 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	unsigned long flags;
 
 	preempt_disable();
+	mutex_acquire(&lock->dep_map, subclass, 0, ip);
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+	/*
+	 * Optimistic spinning.
+	 *
+	 * We try to spin for acquisition when we find that there are no
+	 * pending waiters and the lock owner is currently running on a
+	 * (different) CPU.
+	 *
+	 * The rationale is that if the lock owner is running, it is likely to
+	 * release the lock soon.
+	 *
+	 * Since this needs the lock owner, and this mutex implementation
+	 * doesn't track the owner atomically in the lock field, we need to
+	 * track it non-atomically.
+	 *
+	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+	 * to serialize everything.
+	 */
+
+	for (;;) {
+		struct thread_info *owner;
+
+		/*
+		 * If there are pending waiters, join them.
+		 */
+		if (!list_empty(&lock->wait_list))
+			break;
+
+		/*
+		 * If there's an owner, wait for it to either
+		 * release the lock or go to sleep.
+		 */
+		owner = ACCESS_ONCE(lock->owner);
+		if (owner && !mutex_spin_on_owner(lock, owner))
+			break;
+
+		/*
+		 * When there's no owner, we might have preempted between the
+		 * owner acquiring the lock and setting the owner field. If
+		 * we're an RT task that will live-lock because we won't let
+		 * the owner complete.
+		 */
+		if (!owner && (need_resched() || rt_task(task)))
+			break;
+
+		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+			lock_acquired(&lock->dep_map, ip);
+			mutex_set_owner(lock);
+			preempt_enable();
+			return 0;
+		}
+
+		/*
+		 * The cpu_relax() call is a compiler barrier which forces
+		 * everything in this loop to be re-loaded. We don't need
+		 * memory barriers as we'll eventually observe the right
+		 * values at the cost of a few extra spins.
+		 */
+		cpu_relax();
+	}
+#endif
 	spin_lock_mutex(&lock->wait_lock, flags);
 
 	debug_mutex_lock_common(lock, &waiter);
-	mutex_acquire(&lock->dep_map, subclass, 0, ip);
 	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
 
 	/* add waiting tasks to the end of the waitqueue (FIFO): */
@@ -185,8 +261,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 done:
 	lock_acquired(&lock->dep_map, ip);
 	/* got the lock - rejoice! */
-	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
-	debug_mutex_set_owner(lock, task_thread_info(task));
+	mutex_remove_waiter(lock, &waiter, current_thread_info());
+	mutex_set_owner(lock);
 
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
@@ -222,7 +298,8 @@ int __sched
 mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
-	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
+				   subclass, _RET_IP_);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +337,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
 		wake_up_process(waiter->task);
 	}
 
-	debug_mutex_clear_owner(lock);
-
 	spin_unlock_mutex(&lock->wait_lock, flags);
 }
 
@@ -298,18 +373,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
  */
 int __sched mutex_lock_interruptible(struct mutex *lock)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_fastpath_lock_retval
+	ret =  __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_interruptible_slowpath);
+	if (!ret)
+		mutex_set_owner(lock);
+
+	return ret;
 }
 
 EXPORT_SYMBOL(mutex_lock_interruptible);
 
 int __sched mutex_lock_killable(struct mutex *lock)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_fastpath_lock_retval
+	ret = __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_killable_slowpath);
+	if (!ret)
+		mutex_set_owner(lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
@@ -352,9 +439,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
 
 	prev = atomic_xchg(&lock->count, -1);
 	if (likely(prev == 1)) {
-		debug_mutex_set_owner(lock, current_thread_info());
+		mutex_set_owner(lock);
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	}
+
 	/* Set it back to 0 if there are no waiters: */
 	if (likely(list_empty(&lock->wait_list)))
 		atomic_set(&lock->count, 0);
@@ -380,8 +468,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
  */
 int __sched mutex_trylock(struct mutex *lock)
 {
-	return __mutex_fastpath_trylock(&lock->count,
-					__mutex_trylock_slowpath);
+	int ret;
+
+	ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
+	if (ret)
+		mutex_set_owner(lock);
+
+	return ret;
 }
 
 EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075daf..67578ca 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
 #define mutex_remove_waiter(lock, waiter, ti) \
 		__list_del((waiter)->list.prev, (waiter)->list.next)
 
-#define debug_mutex_set_owner(lock, new_owner)		do { } while (0)
-#define debug_mutex_clear_owner(lock)			do { } while (0)
+#ifdef CONFIG_SMP
+static inline void mutex_set_owner(struct mutex *lock)
+{
+	lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+	lock->owner = NULL;
+}
+#else
+static inline void mutex_set_owner(struct mutex *lock)
+{
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+}
+#endif
+
 #define debug_mutex_wake_waiter(lock, waiter)		do { } while (0)
 #define debug_mutex_free_waiter(waiter)			do { } while (0)
 #define debug_mutex_add_waiter(lock, waiter, ti)	do { } while (0)
diff --git a/kernel/sched.c b/kernel/sched.c
index b001c13..589e730 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4614,6 +4614,67 @@ need_resched:
 }
 EXPORT_SYMBOL(schedule);
 
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+	unsigned int cpu;
+	struct rq *rq;
+
+	if (!sched_feat(OWNER_SPIN))
+		return 0;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * Need to access the cpu field knowing that
+	 * DEBUG_PAGEALLOC could have unmapped it if
+	 * the mutex owner just released it and exited.
+	 */
+	if (probe_kernel_address(&owner->cpu, cpu))
+		goto out;
+#else
+	cpu = owner->cpu;
+#endif
+
+	/*
+	 * Even if the access succeeded (likely case),
+	 * the cpu field may no longer be valid.
+	 */
+	if (cpu >= nr_cpumask_bits)
+		goto out;
+
+	/*
+	 * We need to validate that we can do a
+	 * get_cpu() and that we have the percpu area.
+	 */
+	if (!cpu_online(cpu))
+		goto out;
+
+	rq = cpu_rq(cpu);
+
+	for (;;) {
+		/*
+		 * Owner changed, break to re-assess state.
+		 */
+		if (lock->owner != owner)
+			break;
+
+		/*
+		 * Is that owner really running on that cpu?
+		 */
+		if (task_thread_info(rq->curr) != owner || need_resched())
+			return 0;
+
+		cpu_relax();
+	}
+out:
+	return 1;
+}
+#endif
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b..07bc02e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 SCHED_FEAT(LAST_BUDDY, 1)
+SCHED_FEAT(OWNER_SPIN, 1)
-- 
cgit v1.1


From ac6e60ee405aa3bf718f7fe4cb01b7ee0b8877ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Jan 2009 17:29:31 +0100
Subject: mutex: adaptive spinnning, performance tweaks

Spin more agressively. This is less fair but also markedly faster.

The numbers:

 * dbench 50 (higher is better):
  spin        1282MB/s
  v10         548MB/s
  v10 no wait 1868MB/s

 * 4k creates (numbers in files/second higher is better):
  spin        avg 200.60 median 193.20 std 19.71 high 305.93 low 186.82
  v10         avg 180.94 median 175.28 std 13.91 high 229.31 low 168.73
  v10 no wait avg 232.18 median 222.38 std 22.91 high 314.66 low 209.12

 * File stats (numbers in seconds, lower is better):
  spin        2.27s
  v10         5.1s
  v10 no wait 1.6s

( The source changes are smaller than they look, I just moved the
  need_resched checks in __mutex_lock_common after the cmpxchg. )

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index ff42e97..5d79781 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -171,12 +171,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		struct thread_info *owner;
 
 		/*
-		 * If there are pending waiters, join them.
-		 */
-		if (!list_empty(&lock->wait_list))
-			break;
-
-		/*
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
@@ -184,6 +178,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
+		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+			lock_acquired(&lock->dep_map, ip);
+			mutex_set_owner(lock);
+			preempt_enable();
+			return 0;
+		}
+
 		/*
 		 * When there's no owner, we might have preempted between the
 		 * owner acquiring the lock and setting the owner field. If
@@ -193,13 +194,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if (!owner && (need_resched() || rt_task(task)))
 			break;
 
-		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
-			lock_acquired(&lock->dep_map, ip);
-			mutex_set_owner(lock);
-			preempt_enable();
-			return 0;
-		}
-
 		/*
 		 * The cpu_relax() call is a compiler barrier which forces
 		 * everything in this loop to be re-loaded. We don't need
-- 
cgit v1.1


From 428aee1460a75197f0190534b4d610450ee59af1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 14 Jan 2009 12:24:42 -0500
Subject: trace: print ftrace_dump at KERN_EMERG log level

Impact: fix to print out ftrace_dump when expected

I was debugging a hard race condition to only find out that
after I hit the race, my log level was not at level to show
KERN_INFO. The time it took to trigger the race was wasted because
I did not capture the trace.

Since ftrace_dump is only called from kernel oops (and only when
it is set in the kernel command line to do so), or when a
developer adds it to their own local tree, the log level of
the print should be at KERN_EMERG to make sure the print appears.

ftrace_dump is not called by a normal user setup, and will not
add extra unwanted print out to the console. There is no reason
it should be at KERN_INFO.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 40217fb..408c03f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3076,7 +3076,7 @@ static struct notifier_block trace_die_notifier = {
  * it if we decide to change what log level the ftrace dump
  * should be at.
  */
-#define KERN_TRACE		KERN_INFO
+#define KERN_TRACE		KERN_EMERG
 
 static void
 trace_printk_seq(struct trace_seq *s)
-- 
cgit v1.1


From 6f3b34402e7282cde49dff395d7ea462bf33bf50 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 12 Jan 2009 11:06:18 +0800
Subject: ring_buffer: reset write when reserve buffer fail

Impact: reset struct buffer_page.write when interrupt storm

if struct buffer_page.write is not reset, any succedent committing
will corrupted ring_buffer:

static inline void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
	......
		cpu_buffer->commit_page->commit =
			cpu_buffer->commit_page->write;
	......
}

when "if (RB_WARN_ON(cpu_buffer, next_page == reader_page))", ring_buffer
is disabled, but some reserved buffers may haven't been committed.
we need reset struct buffer_page.write.

when "if (unlikely(next_page == cpu_buffer->commit_page))", ring_buffer
is still available, we should not corrupt it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4832ffa..0b9de5a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1017,12 +1017,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		}
 
 		if (next_page == head_page) {
-			if (!(buffer->flags & RB_FL_OVERWRITE)) {
-				/* reset write */
-				if (tail <= BUF_PAGE_SIZE)
-					local_set(&tail_page->write, tail);
+			if (!(buffer->flags & RB_FL_OVERWRITE))
 				goto out_unlock;
-			}
 
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
@@ -1097,6 +1093,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 
  out_unlock:
+	/* reset write */
+	if (tail <= BUF_PAGE_SIZE)
+		local_set(&tail_page->write, tail);
+
 	__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
-- 
cgit v1.1


From 0ee6b6cf5bdb793b4c68507dd65adf16341aa4ca Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 14 Jan 2009 14:50:19 -0500
Subject: trace: stop all recording to ring buffer on ftrace_dump

Impact: limit ftrace dump output

Currently ftrace_dump only calls ftrace_kill that is a fast way
to prevent the function tracer functions from being called (just sets
a flag and clears the function to call, nothing else). It is better
to also turn off any recording to the ring buffers as well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 408c03f..dcb757f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3110,6 +3110,7 @@ void ftrace_dump(void)
 	dump_ran = 1;
 
 	/* No turning back! */
+	tracing_off();
 	ftrace_kill();
 
 	for_each_tracing_cpu(cpu) {
-- 
cgit v1.1


From 4a2b8dda3f8705880ec7408135645602d5590f51 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Jan 2009 13:33:27 -0800
Subject: tracing/function-graph-tracer: fix a regression while suspend to disk

Impact: fix a crash while kernel image restore

When the function graph tracer is running and while suspend to disk, some racy
and dangerous things happen against this tracer.

The current task will save its registers including the stack pointer which
contains the return address hooked by the tracer. But the current task will
continue to enter other functions after that to save the memory, and then
it will store other return addresses, and finally loose the old depth which
matches the return address saved in the old stack (during the registers saving).

So on image restore, the code will return to wrong addresses.
And there are other things: on restore, the task will have it's "current"
pointer overwritten during registers restoring....switching from one task to
another... That would be insane to try to trace function graphs at these
stages.

This patch makes the function graph tracer listening on power events, making
it's tracing disabled for the current task (the one that performs the
hibernation work) while suspend/resume to disk, making the tracing safe
during hibernation.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c1c9c0..7e9a20b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
 #include <linux/clocksource.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
+#include <linux/suspend.h>
 #include <linux/debugfs.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
@@ -1957,6 +1958,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 static atomic_t ftrace_graph_active;
+static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
 {
@@ -2035,6 +2037,27 @@ static int start_graph_tracing(void)
 	return ret;
 }
 
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+							void *unused)
+{
+	switch (state) {
+	case PM_HIBERNATION_PREPARE:
+		pause_graph_tracing();
+		break;
+
+	case PM_POST_HIBERNATION:
+		unpause_graph_tracing();
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 			trace_func_graph_ent_t entryfunc)
 {
@@ -2042,6 +2065,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 
 	mutex_lock(&ftrace_sysctl_lock);
 
+	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
+	register_pm_notifier(&ftrace_suspend_notifier);
+
 	atomic_inc(&ftrace_graph_active);
 	ret = start_graph_tracing();
 	if (ret) {
@@ -2067,6 +2093,7 @@ void unregister_ftrace_graph(void)
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+	unregister_pm_notifier(&ftrace_suspend_notifier);
 
 	mutex_unlock(&ftrace_sysctl_lock);
 }
-- 
cgit v1.1


From 42fab4b2cdc02cf28e2474ccfd75bc9225076590 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Jan 2009 09:30:52 +0800
Subject: tracing/ftrace: add missing unlock in register_stat_tracer()

We should unlock all_stat_sessions_mutex before returning failure.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index cb29282..2110cea 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -73,8 +73,10 @@ int register_stat_tracer(struct tracer_stat *trace)
 	/* Already registered? */
 	mutex_lock(&all_stat_sessions_mutex);
 	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-		if (node->ts == trace)
+		if (node->ts == trace) {
+			mutex_unlock(&all_stat_sessions_mutex);
 			return -EINVAL;
+		}
 	}
 	mutex_unlock(&all_stat_sessions_mutex);
 
-- 
cgit v1.1


From 55922173f1f63903b6de03711ab8ff980cbe58d2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 15 Jan 2009 11:31:21 +0100
Subject: tracing: trace_stat.c cleanup

Impact: cleanup

- whitespace / code alignment cleanups
- avoid unnecessary forward prototype by reordering functions

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 147 ++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 2110cea..eae9cef 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -17,16 +17,16 @@
 
 /* List of stat entries from a tracer */
 struct trace_stat_list {
-	struct list_head 	list;
-	void 			*stat;
+	struct list_head	list;
+	void			*stat;
 };
 
 /* A stat session is the stats output in one file */
 struct tracer_stat_session {
 	struct list_head	session_list;
-	struct tracer_stat 	*ts;
-	struct list_head 	stat_list;
-	struct mutex 		stat_mutex;
+	struct tracer_stat	*ts;
+	struct list_head	stat_list;
+	struct mutex		stat_mutex;
 	struct dentry		*file;
 };
 
@@ -35,7 +35,7 @@ static LIST_HEAD(all_stat_sessions);
 static DEFINE_MUTEX(all_stat_sessions_mutex);
 
 /* The root directory for all stat files */
-static struct dentry *stat_dir;
+static struct dentry		*stat_dir;
 
 
 static void reset_stat_session(struct tracer_stat_session *session)
@@ -56,71 +56,6 @@ static void destroy_session(struct tracer_stat_session *session)
 	kfree(session);
 }
 
-
-static int init_stat_file(struct tracer_stat_session *session);
-
-int register_stat_tracer(struct tracer_stat *trace)
-{
-	struct tracer_stat_session *session, *node, *tmp;
-	int ret;
-
-	if (!trace)
-		return -EINVAL;
-
-	if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
-		return -EINVAL;
-
-	/* Already registered? */
-	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-		if (node->ts == trace) {
-			mutex_unlock(&all_stat_sessions_mutex);
-			return -EINVAL;
-		}
-	}
-	mutex_unlock(&all_stat_sessions_mutex);
-
-	/* Init the session */
-	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
-	if (!session)
-		return -ENOMEM;
-
-	session->ts = trace;
-	INIT_LIST_HEAD(&session->session_list);
-	INIT_LIST_HEAD(&session->stat_list);
-	mutex_init(&session->stat_mutex);
-	session->file = NULL;
-
-	ret = init_stat_file(session);
-	if (ret) {
-		destroy_session(session);
-		return ret;
-	}
-
-	/* Register */
-	mutex_lock(&all_stat_sessions_mutex);
-	list_add_tail(&session->session_list, &all_stat_sessions);
-	mutex_unlock(&all_stat_sessions_mutex);
-
-	return 0;
-}
-
-void unregister_stat_tracer(struct tracer_stat *trace)
-{
-	struct tracer_stat_session *node, *tmp;
-
-	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-		if (node->ts == trace) {
-			list_del(&node->session_list);
-			destroy_session(node);
-			break;
-		}
-	}
-	mutex_unlock(&all_stat_sessions_mutex);
-}
-
-
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -252,10 +187,10 @@ static int stat_seq_show(struct seq_file *s, void *v)
 }
 
 static const struct seq_operations trace_stat_seq_ops = {
-	.start = stat_seq_start,
-	.next = stat_seq_next,
-	.stop = stat_seq_stop,
-	.show = stat_seq_show
+	.start		= stat_seq_start,
+	.next		= stat_seq_next,
+	.stop		= stat_seq_stop,
+	.show		= stat_seq_show
 };
 
 /* The session stat is refilled and resorted at each stat file opening */
@@ -275,7 +210,6 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-
 /*
  * Avoid consuming memory with our now useless list.
  */
@@ -322,3 +256,64 @@ static int init_stat_file(struct tracer_stat_session *session)
 		return -ENOMEM;
 	return 0;
 }
+
+int register_stat_tracer(struct tracer_stat *trace)
+{
+	struct tracer_stat_session *session, *node, *tmp;
+	int ret;
+
+	if (!trace)
+		return -EINVAL;
+
+	if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+		return -EINVAL;
+
+	/* Already registered? */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace) {
+			mutex_unlock(&all_stat_sessions_mutex);
+			return -EINVAL;
+		}
+	}
+	mutex_unlock(&all_stat_sessions_mutex);
+
+	/* Init the session */
+	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	if (!session)
+		return -ENOMEM;
+
+	session->ts = trace;
+	INIT_LIST_HEAD(&session->session_list);
+	INIT_LIST_HEAD(&session->stat_list);
+	mutex_init(&session->stat_mutex);
+	session->file = NULL;
+
+	ret = init_stat_file(session);
+	if (ret) {
+		destroy_session(session);
+		return ret;
+	}
+
+	/* Register */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_add_tail(&session->session_list, &all_stat_sessions);
+	mutex_unlock(&all_stat_sessions_mutex);
+
+	return 0;
+}
+
+void unregister_stat_tracer(struct tracer_stat *trace)
+{
+	struct tracer_stat_session *node, *tmp;
+
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace) {
+			list_del(&node->session_list);
+			destroy_session(node);
+			break;
+		}
+	}
+	mutex_unlock(&all_stat_sessions_mutex);
+}
-- 
cgit v1.1


From 6c1a99afbda99cd8d8c69d756387041567a13d87 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 15 Jan 2009 18:05:40 +0800
Subject: ftrace: fix trace_output

Impact: fix bug for handling partial line

trace_seq_printf(), seq_print_userip_objs(), ... return
0          -- partial line was written
other(>0)  -- success

duplicate output is also removed in trace_print_raw().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 65 +++++++++++++++++++++------------------------
 kernel/trace/trace_output.h |  4 +--
 2 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df0c25c..4e3ad36 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -440,9 +440,9 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (trace_seq_printf(s, "%x %x\n",
-			     field->ip,
-			     field->parent_ip))
+	if (!trace_seq_printf(s, "%x %x\n",
+			      field->ip,
+			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -497,14 +497,14 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
 	T = task_state_char(field->next_state);
 	S = task_state_char(field->prev_state);
 	comm = trace_find_cmdline(field->next_pid);
-	if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
-			     field->prev_pid,
-			     field->prev_prio,
-			     S, delim,
-			     field->next_cpu,
-			     field->next_pid,
-			     field->next_prio,
-			     T, comm))
+	if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+			      field->prev_pid,
+			      field->prev_prio,
+			      S, delim,
+			      field->next_cpu,
+			      field->next_pid,
+			      field->next_prio,
+			      T, comm))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -534,14 +534,14 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
 	if (!S)
 		task_state_char(field->prev_state);
 	T = task_state_char(field->next_state);
-	if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
-			     field->prev_pid,
-			     field->prev_prio,
-			     S,
-			     field->next_cpu,
-			     field->next_pid,
-			     field->next_prio,
-			     T))
+	if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+			      field->prev_pid,
+			      field->prev_prio,
+			      S,
+			      field->next_cpu,
+			      field->next_pid,
+			      field->next_prio,
+			      T))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -639,10 +639,10 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (trace_seq_printf(s, "# %ld %ld %ld\n",
-			     field->arg1,
-			     field->arg2,
-			     field->arg3))
+	if (!trace_seq_printf(s, "# %ld %ld %ld\n",
+			      field->arg1,
+			      field->arg2,
+			      field->arg3))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -697,13 +697,13 @@ trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (i) {
-			if (trace_seq_puts(s, " <= "))
+			if (!trace_seq_puts(s, " <= "))
 				goto partial;
 
-			if (seq_print_ip_sym(s, field->caller[i], flags))
+			if (!seq_print_ip_sym(s, field->caller[i], flags))
 				goto partial;
 		}
-		if (trace_seq_puts(s, "\n"))
+		if (!trace_seq_puts(s, "\n"))
 			goto partial;
 	}
 
@@ -731,10 +731,10 @@ trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
 
 	trace_assign_type(field, entry);
 
-	if (seq_print_userip_objs(field, s, flags))
+	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
 
-	if (trace_seq_putc(s, '\n'))
+	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
 	return 0;
@@ -760,10 +760,10 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (seq_print_ip_sym(s, field->ip, flags))
+	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
 
-	if (trace_seq_printf(s, ": %s", field->buf))
+	if (!trace_seq_printf(s, ": %s", field->buf))
 		goto partial;
 
 	return 0;
@@ -779,10 +779,7 @@ trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-
-	if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+	if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
 		goto partial;
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ecab4ea..b2c1461 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -45,14 +45,14 @@ trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
 #define SEQ_PUT_FIELD_RET(s, x)				\
 do {							\
 	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
-		return 0;				\
+		return TRACE_TYPE_PARTIAL_LINE;		\
 } while (0)
 
 #define SEQ_PUT_HEX_FIELD_RET(s, x)			\
 do {							\
 	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\
 	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
-		return 0;				\
+		return TRACE_TYPE_PARTIAL_LINE;		\
 } while (0)
 
 #endif
-- 
cgit v1.1


From 5361499101306cfb776c3cfa0f69d0479bc63868 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 19:12:40 -0500
Subject: ftrace: add stack trace to function tracer

Impact: new feature to stack trace any function

Chris Mason asked about being able to pick and choose a function
and get a stack trace from it. This feature enables his request.

 # echo io_schedule > /debug/tracing/set_ftrace_filter
 # echo function > /debug/tracing/current_tracer
 # echo func_stack_trace > /debug/tracing/trace_options

Produces the following in /debug/tracing/trace:

       kjournald-702   [001]   135.673060: io_schedule <-sync_buffer
       kjournald-702   [002]   135.673671:
 <= sync_buffer
 <= __wait_on_bit
 <= out_of_line_wait_on_bit
 <= __wait_on_buffer
 <= sync_dirty_buffer
 <= journal_commit_transaction
 <= kjournald

Note, be careful about turning this on without filtering the functions.
You may find that you have a 10 second lag between typing and seeing
what you typed. This is why the stack trace for the function tracer
does not use the same stack_trace flag as the other tracers use.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c           | 26 ++++++++-----
 kernel/trace/trace.h           |  7 ++++
 kernel/trace/trace_functions.c | 84 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dcb757f..3c54cb1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -835,10 +835,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags, pc);
 }
 
-static void ftrace_trace_stack(struct trace_array *tr,
-			       struct trace_array_cpu *data,
-			       unsigned long flags,
-			       int skip, int pc)
+static void __ftrace_trace_stack(struct trace_array *tr,
+				 struct trace_array_cpu *data,
+				 unsigned long flags,
+				 int skip, int pc)
 {
 #ifdef CONFIG_STACKTRACE
 	struct ring_buffer_event *event;
@@ -846,9 +846,6 @@ static void ftrace_trace_stack(struct trace_array *tr,
 	struct stack_trace trace;
 	unsigned long irq_flags;
 
-	if (!(trace_flags & TRACE_ITER_STACKTRACE))
-		return;
-
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					 &irq_flags);
 	if (!event)
@@ -869,12 +866,23 @@ static void ftrace_trace_stack(struct trace_array *tr,
 #endif
 }
 
+static void ftrace_trace_stack(struct trace_array *tr,
+			       struct trace_array_cpu *data,
+			       unsigned long flags,
+			       int skip, int pc)
+{
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	__ftrace_trace_stack(tr, data, flags, skip, pc);
+}
+
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
-		   int skip)
+		   int skip, int pc)
 {
-	ftrace_trace_stack(tr, data, flags, skip, preempt_count());
+	__ftrace_trace_stack(tr, data, flags, skip, pc);
 }
 
 static void ftrace_trace_userstack(struct trace_array *tr,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79c8721..bf39a36 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -457,6 +457,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 
+void __trace_stack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags,
+		   int skip, int pc);
+
 extern cycle_t ftrace_now(int cpu);
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -467,6 +472,8 @@ void tracing_stop_function_trace(void);
 # define tracing_stop_function_trace()		do { } while (0)
 #endif
 
+extern int ftrace_function_enabled;
+
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e..3a5fa08 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,6 +16,8 @@
 
 #include "trace.h"
 
+static struct trace_array	*func_trace;
+
 static void start_function_trace(struct trace_array *tr)
 {
 	tr->cpu = get_cpu();
@@ -34,6 +36,7 @@ static void stop_function_trace(struct trace_array *tr)
 
 static int function_trace_init(struct trace_array *tr)
 {
+	func_trace = tr;
 	start_function_trace(tr);
 	return 0;
 }
@@ -48,12 +51,93 @@ static void function_trace_start(struct trace_array *tr)
 	tracing_reset_online_cpus(tr);
 }
 
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = func_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	/*
+	 * Need to use raw, since this must be called before the
+	 * recursive protection is performed.
+	 */
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		/*
+		 * skip over 5 funcs:
+		 *    __ftrace_trace_stack,
+		 *    __trace_stack,
+		 *    function_stack_trace_call
+		 *    ftrace_list_func
+		 *    ftrace_call
+		 */
+		__trace_stack(tr, data, flags, 5, pc);
+	}
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops trace_stack_ops __read_mostly =
+{
+	.func = function_stack_trace_call,
+};
+
+/* Our two options */
+enum {
+	TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static struct tracer_opt func_opts[] = {
+#ifdef CONFIG_STACKTRACE
+	{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
+#endif
+	{ } /* Always set a last empty entry */
+};
+
+static struct tracer_flags func_flags = {
+	.val = 0, /* By default: all flags disabled */
+	.opts = func_opts
+};
+
+static int func_set_flag(u32 old_flags, u32 bit, int set)
+{
+	if (bit == TRACE_FUNC_OPT_STACK) {
+		/* do nothing if already set */
+		if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
+			return 0;
+
+		if (set)
+			register_ftrace_function(&trace_stack_ops);
+		else
+			unregister_ftrace_function(&trace_stack_ops);
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static struct tracer function_trace __read_mostly =
 {
 	.name	     = "function",
 	.init	     = function_trace_init,
 	.reset	     = function_trace_reset,
 	.start	     = function_trace_start,
+	.flags		= &func_flags,
+	.set_flag	= func_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_function,
 #endif
-- 
cgit v1.1


From bb3c3c95f330f7bf16e33b002e48882616089db1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 20:40:23 -0500
Subject: ftrace: move function tracer functions out of trace.c

Impact: clean up of trace.c

The function tracer functions were put in trace.c because it needed
to share static variables that were in trace.c.  Since then, those
variables have become global for various reasons. This patch moves
the function tracer functions into trace_function.c where they belong.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c           | 84 ------------------------------------------
 kernel/trace/trace_functions.c | 84 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 85 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c54cb1..2585ffb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1046,65 +1046,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_FUNCTION_TRACER
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	long disabled;
-	int cpu, resched;
-	int pc;
-
-	if (unlikely(!ftrace_function_enabled))
-		return;
-
-	pc = preempt_count();
-	resched = ftrace_preempt_disable();
-	local_save_flags(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1))
-		trace_function(tr, data, ip, parent_ip, flags, pc);
-
-	atomic_dec(&data->disabled);
-	ftrace_preempt_enable(resched);
-}
-
-static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	long disabled;
-	int cpu;
-	int pc;
-
-	if (unlikely(!ftrace_function_enabled))
-		return;
-
-	/*
-	 * Need to use raw, since this must be called before the
-	 * recursive protection is performed.
-	 */
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		trace_function(tr, data, ip, parent_ip, flags, pc);
-	}
-
-	atomic_dec(&data->disabled);
-	local_irq_restore(flags);
-}
-
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
@@ -1162,31 +1103,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-static struct ftrace_ops trace_ops __read_mostly =
-{
-	.func = function_trace_call,
-};
-
-void tracing_start_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-
-	if (trace_flags & TRACE_ITER_PREEMPTONLY)
-		trace_ops.func = function_trace_call_preempt_only;
-	else
-		trace_ops.func = function_trace_call;
-
-	register_ftrace_function(&trace_ops);
-	ftrace_function_enabled = 1;
-}
-
-void tracing_stop_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
-}
-#endif
-
 enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 	TRACE_FILE_ANNOTATE	= 2,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 3a5fa08..2dce3c7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -20,6 +20,7 @@ static struct trace_array	*func_trace;
 
 static void start_function_trace(struct trace_array *tr)
 {
+	func_trace = tr;
 	tr->cpu = get_cpu();
 	tracing_reset_online_cpus(tr);
 	put_cpu();
@@ -36,7 +37,6 @@ static void stop_function_trace(struct trace_array *tr)
 
 static int function_trace_init(struct trace_array *tr)
 {
-	func_trace = tr;
 	start_function_trace(tr);
 	return 0;
 }
@@ -52,6 +52,64 @@ static void function_trace_start(struct trace_array *tr)
 }
 
 static void
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = func_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu, resched;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	local_save_flags(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags, pc);
+
+	atomic_dec(&data->disabled);
+	ftrace_preempt_enable(resched);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = func_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	/*
+	 * Need to use raw, since this must be called before the
+	 * recursive protection is performed.
+	 */
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		trace_function(tr, data, ip, parent_ip, flags, pc);
+	}
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static void
 function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct trace_array *tr = func_trace;
@@ -90,6 +148,30 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+
+void tracing_start_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+
+	if (trace_flags & TRACE_ITER_PREEMPTONLY)
+		trace_ops.func = function_trace_call_preempt_only;
+	else
+		trace_ops.func = function_trace_call;
+
+	register_ftrace_function(&trace_ops);
+	ftrace_function_enabled = 1;
+}
+
+void tracing_stop_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+}
 static struct ftrace_ops trace_stack_ops __read_mostly =
 {
 	.func = function_stack_trace_call,
-- 
cgit v1.1


From c37abc5515b5ed5b1d2134d2deaead492d9f92a2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 20:50:54 -0500
Subject: trace: add gcc printf check to trace_seq_printf

Andrew Morton suggested adding a printf checker to trace_seq_printf
since there are a number of users that have improper format arguments.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index b2c1461..1cbab5e 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,7 +16,8 @@ struct trace_event {
 	trace_print_func	binary;
 };
 
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
-- 
cgit v1.1


From 5e4abc9839191e213965e0f1dbf36e2e44356c3a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 21:00:50 -0500
Subject: trace: clean up format errors in calls to trace_seq_printf

After adding the printf format checking for trace_seq_printf, several
warnings now show up. This patch cleans them up.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c       |  4 ++--
 kernel/trace/trace_mmiotrace.c | 13 +++++++------
 kernel/trace/trace_output.c    |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index faaa5ae..7ebc58c 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Requested */
-	ret = trace_seq_printf(s, "%4d   ", entry->bytes_req);
+	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_req);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Allocated */
-	ret = trace_seq_printf(s, "%4d   ", entry->bytes_alloc);
+	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_alloc);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 621c8c3..ec78e24 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -184,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
 	switch (rw->opcode) {
 	case MMIO_READ:
 		ret = trace_seq_printf(s,
-			"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
 			rw->width, secs, usec_rem, rw->map_id,
 			(unsigned long long)rw->phys,
 			rw->value, rw->pc, 0);
 		break;
 	case MMIO_WRITE:
 		ret = trace_seq_printf(s,
-			"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
 			rw->width, secs, usec_rem, rw->map_id,
 			(unsigned long long)rw->phys,
 			rw->value, rw->pc, 0);
 		break;
 	case MMIO_UNKNOWN_OP:
 		ret = trace_seq_printf(s,
-			"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+			"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
+			"%02lx 0x%lx %d\n",
 			secs, usec_rem, rw->map_id,
 			(unsigned long long)rw->phys,
 			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -230,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
 	switch (m->opcode) {
 	case MMIO_PROBE:
 		ret = trace_seq_printf(s,
-			"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+			"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
 			secs, usec_rem, m->map_id,
 			(unsigned long long)m->phys, m->virt, m->len,
 			0UL, 0);
 		break;
 	case MMIO_UNPROBE:
 		ret = trace_seq_printf(s,
-			"UNMAP %lu.%06lu %d 0x%lx %d\n",
+			"UNMAP %u.%06lu %d 0x%lx %d\n",
 			secs, usec_rem, m->map_id, 0UL, 0);
 		break;
 	default:
@@ -261,7 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 	int ret;
 
 	/* The trailing newline must be in the message. */
-	ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
+	ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 4e3ad36..1a4e144 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -440,7 +440,7 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (!trace_seq_printf(s, "%x %x\n",
+	if (!trace_seq_printf(s, "%lx %lx\n",
 			      field->ip,
 			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.1


From 3eb36aa05329a47cbe201c151fd0024a4a3649cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 22:21:43 -0500
Subject: ftrace: combine stack trace in function call

Impact: less likely to interleave function and stack traces

This patch does replaces the separate stack trace on function with
a record function and stack trace together. This will switch between
the function only recording to a function and stack recording.

Also some whitespace fix ups as well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions.c | 61 +++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 2dce3c7..61d0b73 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -133,6 +133,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
+		trace_function(tr, data, ip, parent_ip, flags, pc);
 		/*
 		 * skip over 5 funcs:
 		 *    __ftrace_trace_stack,
@@ -154,24 +155,6 @@ static struct ftrace_ops trace_ops __read_mostly =
 	.func = function_trace_call,
 };
 
-void tracing_start_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-
-	if (trace_flags & TRACE_ITER_PREEMPTONLY)
-		trace_ops.func = function_trace_call_preempt_only;
-	else
-		trace_ops.func = function_trace_call;
-
-	register_ftrace_function(&trace_ops);
-	ftrace_function_enabled = 1;
-}
-
-void tracing_stop_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
-}
 static struct ftrace_ops trace_stack_ops __read_mostly =
 {
 	.func = function_stack_trace_call,
@@ -194,6 +177,31 @@ static struct tracer_flags func_flags = {
 	.opts = func_opts
 };
 
+void tracing_start_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+
+	if (trace_flags & TRACE_ITER_PREEMPTONLY)
+		trace_ops.func = function_trace_call_preempt_only;
+	else
+		trace_ops.func = function_trace_call;
+
+	if (func_flags.val & TRACE_FUNC_OPT_STACK)
+		register_ftrace_function(&trace_stack_ops);
+	else
+		register_ftrace_function(&trace_ops);
+
+	ftrace_function_enabled = 1;
+}
+
+void tracing_stop_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+	/* OK if they are not registered */
+	unregister_ftrace_function(&trace_stack_ops);
+	unregister_ftrace_function(&trace_ops);
+}
+
 static int func_set_flag(u32 old_flags, u32 bit, int set)
 {
 	if (bit == TRACE_FUNC_OPT_STACK) {
@@ -201,10 +209,13 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
 		if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
 			return 0;
 
-		if (set)
+		if (set) {
+			unregister_ftrace_function(&trace_ops);
 			register_ftrace_function(&trace_stack_ops);
-		else
+		} else {
 			unregister_ftrace_function(&trace_stack_ops);
+			register_ftrace_function(&trace_ops);
+		}
 
 		return 0;
 	}
@@ -214,14 +225,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
 
 static struct tracer function_trace __read_mostly =
 {
-	.name	     = "function",
-	.init	     = function_trace_init,
-	.reset	     = function_trace_reset,
-	.start	     = function_trace_start,
+	.name		= "function",
+	.init		= function_trace_init,
+	.reset		= function_trace_reset,
+	.start		= function_trace_start,
 	.flags		= &func_flags,
 	.set_flag	= func_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
-	.selftest    = trace_selftest_startup_function,
+	.selftest	= trace_selftest_startup_function,
 #endif
 };
 
-- 
cgit v1.1


From a225cdd263f340c864febb1992802fb5b08bc328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 23:06:03 -0500
Subject: ftrace: remove static from function tracer functions

Impact: clean up

After reorganizing the functions in trace.c and trace_function.c,
they no longer need to be in global context. This patch makes the
functions and one variable into static.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c           |  3 ---
 kernel/trace/trace.h           | 10 ----------
 kernel/trace/trace_functions.c | 10 ++++++++--
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2585ffb..7de6a94 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -187,9 +187,6 @@ int tracing_is_enabled(void)
 	return tracer_enabled;
 }
 
-/* function tracing enabled */
-int				ftrace_function_enabled;
-
 /*
  * trace_buf_size is the size in bytes that is allocated
  * for a buffer. Note, the number of bytes is always rounded
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bf39a36..54b7278 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -464,16 +464,6 @@ void __trace_stack(struct trace_array *tr,
 
 extern cycle_t ftrace_now(int cpu);
 
-#ifdef CONFIG_FUNCTION_TRACER
-void tracing_start_function_trace(void);
-void tracing_stop_function_trace(void);
-#else
-# define tracing_start_function_trace()		do { } while (0)
-# define tracing_stop_function_trace()		do { } while (0)
-#endif
-
-extern int ftrace_function_enabled;
-
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 61d0b73..b3a320f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,8 +16,14 @@
 
 #include "trace.h"
 
+/* function tracing enabled */
+static int			ftrace_function_enabled;
+
 static struct trace_array	*func_trace;
 
+static void tracing_start_function_trace(void);
+static void tracing_stop_function_trace(void);
+
 static void start_function_trace(struct trace_array *tr)
 {
 	func_trace = tr;
@@ -177,7 +183,7 @@ static struct tracer_flags func_flags = {
 	.opts = func_opts
 };
 
-void tracing_start_function_trace(void)
+static void tracing_start_function_trace(void)
 {
 	ftrace_function_enabled = 0;
 
@@ -194,7 +200,7 @@ void tracing_start_function_trace(void)
 	ftrace_function_enabled = 1;
 }
 
-void tracing_stop_function_trace(void)
+static void tracing_stop_function_trace(void)
 {
 	ftrace_function_enabled = 0;
 	/* OK if they are not registered */
-- 
cgit v1.1


From 745b1626dd71ce9661a05ea4db57859ed5c773d2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 23:40:11 -0500
Subject: trace: set max latency variable to zero on default

Impact: trace max latencies on start of latency tracing

This patch sets the max latency to zero whenever one of the
irq variant tracers or the wakeup tracer is set to current tracer.

Most developers expect to see output when starting up a latency
tracer. But since the max_latency is already set to max, and
it takes a latency greater than max_latency to be recorded, there
is no trace. This is not the expected behavior and has even confused
myself.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 2 +-
 kernel/trace/trace_irqsoff.c      | 1 +
 kernel/trace/trace_sched_wakeup.c | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7de6a94..220c264 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,7 +41,7 @@
 
 #define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE)
 
-unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly	tracing_max_latency;
 unsigned long __read_mostly	tracing_thresh;
 
 /*
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326..62a78d9 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
+	tracing_max_latency = 0;
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visible */
 	smp_wmb();
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b6..42ae1e7 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 
 static int wakeup_tracer_init(struct trace_array *tr)
 {
+	tracing_max_latency = 0;
 	wakeup_trace = tr;
 	start_wakeup_tracer(tr);
 	return 0;
-- 
cgit v1.1


From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Thu, 15 Jan 2009 11:08:40 -0800
Subject: softlockup: decouple hung tasks check from softlockup detection

Decoupling allows:

* hung tasks check to happen at very low priority

* hung tasks check and softlockup to be enabled/disabled independently
  at compile and/or run-time

* individual panic settings to be enabled disabled independently
  at compile and/or run-time

* softlockup threshold to be reduced without increasing hung tasks
  poll frequency (hung task check is expensive relative to softlock watchdog)

* hung task check to be zero over-head when disabled at run-time

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile     |   1 +
 kernel/hung_task.c  | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/softlockup.c | 100 --------------------------
 kernel/sysctl.c     |  15 +++-
 4 files changed, 213 insertions(+), 101 deletions(-)
 create mode 100644 kernel/hung_task.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 2aebc4c..979745f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 0000000..ba5a77c
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,198 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+static unsigned long __read_mostly hung_task_poll_jiffies;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+	sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = hung_task_panic,
+};
+
+/*
+ * Returns seconds, approximately.  We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+	int this_cpu = raw_smp_processor_id();
+
+	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+}
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	if (t->flags & PF_FROZEN)
+		return;
+
+	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+		t->last_switch_count = switch_count;
+		t->last_switch_timestamp = now;
+		return;
+	}
+	if ((long)(now - t->last_switch_timestamp) <
+					sysctl_hung_task_timeout_secs)
+		return;
+	if (!sysctl_hung_task_warnings)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than 2 minutes,
+	 * complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%ld seconds.\n", t->comm, t->pid,
+			sysctl_hung_task_timeout_secs);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	__debug_show_held_locks(t);
+
+	t->last_switch_timestamp = now;
+	touch_nmi_watchdog();
+
+	if (sysctl_hung_task_panic)
+		panic("hung_task: blocked tasks");
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(void)
+{
+	int max_count = sysctl_hung_task_check_count;
+	unsigned long now = get_timestamp();
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if (test_taint(TAINT_DIE) || did_panic)
+		return;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, t) {
+		if (!--max_count)
+			goto unlock;
+		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+		if (t->state == TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, now);
+	} while_each_thread(g, t);
+ unlock:
+	read_unlock(&tasklist_lock);
+}
+
+static void update_poll_jiffies(void)
+{
+	/* timeout of 0 will disable the watchdog */
+	if (sysctl_hung_task_timeout_secs == 0)
+		hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
+	else
+		hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+				  struct file *filp, void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto out;
+
+	update_poll_jiffies();
+
+	wake_up_process(watchdog_task);
+
+ out:
+	return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+	set_user_nice(current, 0);
+	update_poll_jiffies();
+
+	for ( ; ; ) {
+		while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+		check_hung_uninterruptible_tasks();
+	}
+
+	return 0;
+}
+
+static int __init hung_task_init(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+	return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a24..88796c3 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -166,97 +166,11 @@ void softlockup_tick(void)
 }
 
 /*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
-	unsigned long switch_count = t->nvcsw + t->nivcsw;
-
-	if (t->flags & PF_FROZEN)
-		return;
-
-	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
-		t->last_switch_count = switch_count;
-		t->last_switch_timestamp = now;
-		return;
-	}
-	if ((long)(now - t->last_switch_timestamp) <
-					sysctl_hung_task_timeout_secs)
-		return;
-	if (!sysctl_hung_task_warnings)
-		return;
-	sysctl_hung_task_warnings--;
-
-	/*
-	 * Ok, the task did not get scheduled for more than 2 minutes,
-	 * complain:
-	 */
-	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
-			"%ld seconds.\n", t->comm, t->pid,
-			sysctl_hung_task_timeout_secs);
-	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
-			" disables this message.\n");
-	sched_show_task(t);
-	__debug_show_held_locks(t);
-
-	t->last_switch_timestamp = now;
-	touch_nmi_watchdog();
-
-	if (softlockup_panic)
-		panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
-	int max_count = sysctl_hung_task_check_count;
-	unsigned long now = get_timestamp(this_cpu);
-	struct task_struct *g, *t;
-
-	/*
-	 * If the system crashed already then all bets are off,
-	 * do not report extra hung tasks:
-	 */
-	if (test_taint(TAINT_DIE) || did_panic)
-		return;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, t) {
-		if (!--max_count)
-			goto unlock;
-		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
-		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, now);
-	} while_each_thread(g, t);
- unlock:
-	read_unlock(&tasklist_lock);
-}
-
-/*
  * The watchdog thread - runs every second and touches the timestamp.
  */
 static int watchdog(void *__bind_cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-	int this_cpu = (long)__bind_cpu;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
 		if (kthread_should_stop())
 			break;
 
-		if (this_cpu == check_cpu) {
-			if (sysctl_hung_task_timeout_secs)
-				check_hung_uninterruptible_tasks(this_cpu);
-		}
-
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		check_cpu = cpumask_any(cpu_online_mask);
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		if (hotcpu == check_cpu) {
-			/* Pick any other online cpu. */
-			check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
-		}
-		break;
-
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
 		if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 596dc31..2481ed3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -805,6 +805,19 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &neg_one,
 		.extra2		= &sixty,
 	},
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_panic",
+		.data		= &sysctl_hung_task_panic,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_check_count",
@@ -820,7 +833,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_hung_task_timeout_secs,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_dohung_task_timeout_secs,
 		.strategy	= &sysctl_intvec,
 	},
 	{
-- 
cgit v1.1


From af432eb1cc3178ec7109aca2283aafb1c12ccac1 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Fri, 16 Jan 2009 09:09:38 -0800
Subject: softlockup: fix to allow compiling with !DETECT_HUNG_TASK

Fixes the following compile error:

 kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count'
 kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp'

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f12..fb94442 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1041,7 +1041,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
 	p->last_switch_count = 0;
 	p->last_switch_timestamp = 0;
 #endif
-- 
cgit v1.1


From 603a148f434742fff08273207ffa44176cad13a1 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Sat, 17 Jan 2009 10:31:48 -0800
Subject: softlockup: fix potential race in hung_task when resetting timeout

Impact: fix potential false panic

A potential race exists if sysctl_hung_task_timeout_secs is reset to 0
while inside check_hung_uniterruptible_tasks(). If check_task() is
entered, a comparison with 0 will result in a false hung_task being
detected.

If sysctl_hung_task_panic is set, the system will panic.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba5a77c..ba8ccd4 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -72,7 +72,8 @@ static unsigned long get_timestamp(void)
 	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
-static void check_hung_task(struct task_struct *t, unsigned long now)
+static void check_hung_task(struct task_struct *t, unsigned long now,
+			    unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
 
@@ -84,8 +85,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 		t->last_switch_timestamp = now;
 		return;
 	}
-	if ((long)(now - t->last_switch_timestamp) <
-					sysctl_hung_task_timeout_secs)
+	if ((long)(now - t->last_switch_timestamp) < timeout)
 		return;
 	if (!sysctl_hung_task_warnings)
 		return;
@@ -96,8 +96,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 	 * complain:
 	 */
 	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
-			"%ld seconds.\n", t->comm, t->pid,
-			sysctl_hung_task_timeout_secs);
+			"%ld seconds.\n", t->comm, t->pid, timeout);
 	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
 			" disables this message.\n");
 	sched_show_task(t);
@@ -115,7 +114,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
  * a really long time (120 seconds). If that happens, print out
  * a warning.
  */
-static void check_hung_uninterruptible_tasks(void)
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
 	unsigned long now = get_timestamp();
@@ -134,7 +133,7 @@ static void check_hung_uninterruptible_tasks(void)
 			goto unlock;
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, now);
+			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
 	read_unlock(&tasklist_lock);
@@ -180,8 +179,17 @@ static int watchdog(void *dummy)
 	update_poll_jiffies();
 
 	for ( ; ; ) {
+		unsigned long timeout;
+
 		while (schedule_timeout_interruptible(hung_task_poll_jiffies));
-		check_hung_uninterruptible_tasks();
+
+		/*
+		 * Need to cache timeout here to avoid timeout being set
+		 * to 0 via sysctl while inside check_hung_*_tasks().
+		 */
+		timeout = sysctl_hung_task_timeout_secs;
+		if (timeout)
+			check_hung_uninterruptible_tasks(timeout);
 	}
 
 	return 0;
-- 
cgit v1.1


From 5c5317de147e9b38ea9c4cbdc2d15bed7648d036 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:26:53 +0100
Subject: x86, ftrace, hw-branch-tracer: support hotplug cpus

Support hotplug cpus.

Reported-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 123 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 107 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index df21c1e..3981953 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,7 +1,8 @@
 /*
  * h/w branch tracer for x86 based on bts
  *
- * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ * Copyright (C) 2008-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  *
  */
 
@@ -10,6 +11,9 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
 
 #include <asm/ds.h>
 
@@ -19,13 +23,31 @@
 
 #define SIZEOF_BTS (1 << 13)
 
+/* The tracer mutex protects the below per-cpu tracer array.
+   It needs to be held to:
+   - start tracing on all cpus
+   - stop tracing on all cpus
+   - start tracing on a single hotplug cpu
+   - stop tracing on a single hotplug cpu
+   - read the trace from all cpus
+   - read the trace from a single cpu
+*/
+static DEFINE_MUTEX(bts_tracer_mutex);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
+static int __read_mostly trace_hw_branches_enabled;
 
+
+/*
+ * Start tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_mutex must be locked.
+ */
 static void bts_trace_start_cpu(void *arg)
 {
 	if (this_tracer)
@@ -43,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	int cpu;
+	mutex_lock(&bts_tracer_mutex);
 
-	tracing_reset_online_cpus(tr);
+	on_each_cpu(bts_trace_start_cpu, NULL, 1);
+	trace_hw_branches_enabled = 1;
 
-	for_each_cpu(cpu, cpu_possible_mask)
-		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+	mutex_unlock(&bts_tracer_mutex);
 }
 
+/*
+ * Start tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_mutex must be locked.
+ */
 static void bts_trace_stop_cpu(void *arg)
 {
 	if (this_tracer) {
@@ -61,20 +89,58 @@ static void bts_trace_stop_cpu(void *arg)
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	int cpu;
+	mutex_lock(&bts_tracer_mutex);
+
+	trace_hw_branches_enabled = 0;
+	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
 
-	for_each_cpu(cpu, cpu_possible_mask)
+	mutex_unlock(&bts_tracer_mutex);
+}
+
+static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
+				     unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	mutex_lock(&bts_tracer_mutex);
+
+	if (!trace_hw_branches_enabled)
+		goto out;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+		break;
+	case CPU_DOWN_PREPARE:
 		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+		break;
+	}
+
+ out:
+	mutex_unlock(&bts_tracer_mutex);
+	return NOTIFY_DONE;
 }
 
+static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
+	.notifier_call = bts_hotcpu_handler
+};
+
 static int bts_trace_init(struct trace_array *tr)
 {
+	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
 
 	return 0;
 }
 
+static void bts_trace_reset(struct trace_array *tr)
+{
+	bts_trace_stop(tr);
+	unregister_hotcpu_notifier(&bts_hotcpu_notifier);
+}
+
 static void bts_trace_print_header(struct seq_file *m)
 {
 	seq_puts(m,
@@ -108,18 +174,34 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
 {
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
-	unsigned long irq;
+	unsigned long irq1, irq2;
+	int cpu;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
-	if (!event)
+	if (unlikely(!tr))
+		return;
+
+	if (unlikely(!trace_hw_branches_enabled))
 		return;
+
+	local_irq_save(irq1);
+	cpu = raw_smp_processor_id();
+	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+		goto out;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2);
+	if (!event)
+		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, from);
 	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->ent.cpu = smp_processor_id();
+	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event, irq);
+	ring_buffer_unlock_commit(tr->buffer, event, irq2);
+
+ out:
+	atomic_dec(&tr->data[cpu]->disabled);
+	local_irq_restore(irq1);
 }
 
 static void trace_bts_at(struct trace_array *tr,
@@ -143,6 +225,11 @@ static void trace_bts_at(struct trace_array *tr,
 	}
 }
 
+/*
+ * Collect the trace on the current cpu and write it into the ftrace buffer.
+ *
+ * pre: bts_tracer_mutex must be locked
+ */
 static void trace_bts_cpu(void *arg)
 {
 	struct trace_array *tr = (struct trace_array *) arg;
@@ -152,6 +239,9 @@ static void trace_bts_cpu(void *arg)
 	if (!this_tracer)
 		return;
 
+	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
+		return;
+
 	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
@@ -171,17 +261,18 @@ out:
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	int cpu;
+	mutex_lock(&bts_tracer_mutex);
+
+	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	for_each_cpu(cpu, cpu_possible_mask)
-		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+	mutex_unlock(&bts_tracer_mutex);
 }
 
 struct tracer bts_tracer __read_mostly =
 {
 	.name		= "hw-branch-tracer",
 	.init		= bts_trace_init,
-	.reset		= bts_trace_stop,
+	.reset		= bts_trace_reset,
 	.print_header	= bts_trace_print_header,
 	.print_line	= bts_trace_print_line,
 	.start		= bts_trace_start,
-- 
cgit v1.1


From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:31:01 +0100
Subject: x86, ftrace, hw-branch-tracer: dump trace on oops

Dump the branch trace on an oops (based on ftrace_dump_on_oops).

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h             |  1 -
 kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 54b7278..b96037d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr,
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
 int trace_graph_entry(struct ftrace_graph_ent *trace);
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 3981953..e56df2c 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
 static int __read_mostly trace_hw_branches_enabled;
+static struct trace_array *hw_branch_trace __read_mostly;
 
 
 /*
@@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 
 static int bts_trace_init(struct trace_array *tr)
 {
+	hw_branch_trace = tr;
+
 	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
@@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	return TRACE_TYPE_UNHANDLED;
 }
 
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
+void trace_hw_branch(u64 from, u64 to)
 {
+	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
 	unsigned long irq1, irq2;
@@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
 	local_irq_restore(irq1);
 }
 
-static void trace_bts_at(struct trace_array *tr,
-			 const struct bts_trace *trace, void *at)
+static void trace_bts_at(const struct bts_trace *trace, void *at)
 {
 	struct bts_struct bts;
 	int err = 0;
@@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr,
 
 	switch (bts.qualifier) {
 	case BTS_BRANCH:
-		trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
+		trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
 		break;
 	}
 }
@@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg)
 	const struct bts_trace *trace;
 	unsigned char *at;
 
-	if (!this_tracer)
+	if (unlikely(!tr))
 		return;
 
 	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
 		return;
 
+	if (unlikely(!this_tracer))
+		return;
+
 	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
@@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg)
 
 	for (at = trace->ds.top; (void *)at < trace->ds.end;
 	     at += trace->ds.size)
-		trace_bts_at(tr, trace, at);
+		trace_bts_at(trace, at);
 
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     at += trace->ds.size)
-		trace_bts_at(tr, trace, at);
+		trace_bts_at(trace, at);
 
 out:
 	ds_resume_bts(this_tracer);
@@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter)
 	mutex_unlock(&bts_tracer_mutex);
 }
 
+void trace_hw_branch_oops(void)
+{
+	mutex_lock(&bts_tracer_mutex);
+
+	trace_bts_cpu(hw_branch_trace);
+
+	mutex_unlock(&bts_tracer_mutex);
+}
+
 struct tracer bts_tracer __read_mostly =
 {
 	.name		= "hw-branch-tracer",
-- 
cgit v1.1


From e23b8ad83430a6fdfbdbfac365f5b0312dd57f10 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:33:31 +0100
Subject: x86, ftrace, hw-branch-tracer: reset trace buffer on close

Reset the ftrace buffer on close. Since we use cyclic buffers, the
trace is not contiguous, anyway.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e56df2c..372b47a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -274,6 +274,11 @@ static void trace_bts_prepare(struct trace_iterator *iter)
 	mutex_unlock(&bts_tracer_mutex);
 }
 
+static void trace_bts_close(struct trace_iterator *iter)
+{
+	tracing_reset_online_cpus(iter->tr);
+}
+
 void trace_hw_branch_oops(void)
 {
 	mutex_lock(&bts_tracer_mutex);
@@ -292,7 +297,8 @@ struct tracer bts_tracer __read_mostly =
 	.print_line	= bts_trace_print_line,
 	.start		= bts_trace_start,
 	.stop		= bts_trace_stop,
-	.open		= trace_bts_prepare
+	.open		= trace_bts_prepare,
+	.close		= trace_bts_close
 };
 
 __init static int init_bts_trace(void)
-- 
cgit v1.1


From 11edda06289d412d13ff7c672bd72e043f637e74 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:29:16 +0100
Subject: x86, ftrace, hw-branch-tracer: change trace format

Change the hw-branch-tracer format to be more readable.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 372b47a..fff3545 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -146,10 +146,7 @@ static void bts_trace_reset(struct trace_array *tr)
 
 static void bts_trace_print_header(struct seq_file *m)
 {
-	seq_puts(m,
-		 "# CPU#        FROM                   TO         FUNCTION\n");
-	seq_puts(m,
-		 "#  |           |                     |             |\n");
+	seq_puts(m, "# CPU#        TO  <-  FROM\n");
 }
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -157,15 +154,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *seq = &iter->seq;
 	struct hw_branch_entry *it;
+	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 
 	trace_assign_type(it, entry);
 
 	if (entry->type == TRACE_HW_BRANCHES) {
 		if (trace_seq_printf(seq, "%4d  ", entry->cpu) &&
-		    trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
-				     it->from, it->to) &&
-		    (!it->from ||
-		     seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
+		    seq_print_ip_sym(seq, it->to, symflags) &&
+		    trace_seq_printf(seq, "\t  <-  ") &&
+		    seq_print_ip_sym(seq, it->from, symflags) &&
 		    trace_seq_printf(seq, "\n"))
 			return TRACE_TYPE_HANDLED;
 		return TRACE_TYPE_PARTIAL_LINE;;
-- 
cgit v1.1


From 3690b5e6fd9daa030039ae9bda69044228bd476d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 16 Jan 2009 16:32:25 +0800
Subject: trace_workqueue: use percpu data for workqueue stat

Impact: use percpu data instead of a global structure

Use:

   static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);

instead of allocating a global structure.

percpu data also works well on NUMA.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 64 +++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index f8118d3..4664990 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -8,6 +8,7 @@
 
 #include <trace/workqueue.h>
 #include <linux/list.h>
+#include <linux/percpu.h>
 #include "trace_stat.h"
 #include "trace.h"
 
@@ -37,7 +38,8 @@ struct workqueue_global_stats {
 /* Don't need a global lock because allocated before the workqueues, and
  * never freed.
  */
-static struct workqueue_global_stats *all_workqueue_stat;
+static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
+#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
 
 /* Insertion of a work */
 static void
@@ -48,8 +50,8 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
 							list) {
 		if (node->pid == wq_thread->pid) {
 			atomic_inc(&node->inserted);
@@ -58,7 +60,7 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 	}
 	pr_debug("trace_workqueue: entry not found\n");
 found:
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
 
 /* Execution of a work */
@@ -70,8 +72,8 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
 							list) {
 		if (node->pid == wq_thread->pid) {
 			node->executed++;
@@ -80,7 +82,7 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 	}
 	pr_debug("trace_workqueue: entry not found\n");
 found:
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
 
 /* Creation of a cpu workqueue thread */
@@ -104,11 +106,11 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 
 	cws->pid = wq_thread->pid;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	if (list_empty(&all_workqueue_stat[cpu].list))
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (list_empty(&workqueue_cpu_stat(cpu)->list))
 		cws->first_entry = true;
-	list_add_tail(&cws->list, &all_workqueue_stat[cpu].list);
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
 
 /* Destruction of a cpu workqueue thread */
@@ -119,8 +121,8 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
 							list) {
 		if (node->pid == wq_thread->pid) {
 			list_del(&node->list);
@@ -131,7 +133,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
 
 	pr_debug("trace_workqueue: don't find workqueue to destroy\n");
 found:
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 }
 
@@ -141,13 +143,13 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 	struct cpu_workqueue_stats *ret = NULL;
 
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 
-	if (!list_empty(&all_workqueue_stat[cpu].list))
-		ret = list_entry(all_workqueue_stat[cpu].list.next,
+	if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+		ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
 				 struct cpu_workqueue_stats, list);
 
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 	return ret;
 }
@@ -172,9 +174,9 @@ static void *workqueue_stat_next(void *prev, int idx)
 	unsigned long flags;
 	void *ret = NULL;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) {
-		spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
+		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 		for (++cpu ; cpu < num_possible_cpus(); cpu++) {
 			ret = workqueue_stat_start_cpu(cpu);
 			if (ret)
@@ -182,7 +184,7 @@ static void *workqueue_stat_next(void *prev, int idx)
 		}
 		return NULL;
 	}
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 	return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
 			  list);
@@ -199,10 +201,10 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 		   cws->executed,
 		   trace_find_cmdline(cws->pid));
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	if (&cws->list == all_workqueue_stat[cpu].list.next)
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
 		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 	return 0;
 }
@@ -258,17 +260,9 @@ int __init trace_workqueue_early_init(void)
 	if (ret)
 		goto no_creation;
 
-	all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats)
-				     * num_possible_cpus(), GFP_KERNEL);
-
-	if (!all_workqueue_stat) {
-		pr_warning("trace_workqueue: not enough memory\n");
-		goto no_creation;
-	}
-
 	for_each_possible_cpu(cpu) {
-		spin_lock_init(&all_workqueue_stat[cpu].lock);
-		INIT_LIST_HEAD(&all_workqueue_stat[cpu].list);
+		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
 	}
 
 	return 0;
-- 
cgit v1.1


From 5bc4564b224c3d9fe6dddafa25f56059bd978231 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 14:36:52 -0500
Subject: trace: do not disable wake up tracer on output of trace

Impact: fix to erased trace output

To try not to have the outputing of a trace interfere with the wakeup
tracer, it would disable tracing while the output was printing. But
if a trace had started when it was disabled, it can show a partial
trace. To try to solve this, on closing of the tracer, it would
clear the trace buffer.

The latency tracers (wakeup and irqsoff) have two buffers. One for
recording and one for holding the max trace that is printed. The
clearing of the trace above should only affect the recording buffer.
But for some reason it would move the erased trace to the print
buffer. Probably due to a race with the closing of the trace and
the saving ofhe max race.

The above is all pretty useless, and if the user does not want the
printing of the trace to be traced itself, then the user can manual
disable tracing. This patch removes all the code that tries to keep
the output of the tracer from modifying the trace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 42ae1e7..e27adef 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -262,12 +262,6 @@ out:
 	atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
 
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
 static void start_wakeup_tracer(struct trace_array *tr)
 {
 	int ret;
@@ -306,13 +300,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
 
 	register_ftrace_function(&trace_ops);
 
-	if (tracing_is_enabled()) {
+	if (tracing_is_enabled())
 		tracer_enabled = 1;
-		save_tracer_enabled = 1;
-	} else {
+	else
 		tracer_enabled = 0;
-		save_tracer_enabled = 0;
-	}
 
 	return;
 fail_deprobe_wake_new:
@@ -324,7 +315,6 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 	unregister_trace_sched_switch(probe_wakeup_sched_switch);
 	unregister_trace_sched_wakeup_new(probe_wakeup);
@@ -350,28 +340,11 @@ static void wakeup_tracer_start(struct trace_array *tr)
 {
 	wakeup_reset(tr);
 	tracer_enabled = 1;
-	save_tracer_enabled = 1;
 }
 
 static void wakeup_tracer_stop(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
-}
-
-static void wakeup_tracer_open(struct trace_iterator *iter)
-{
-	/* stop the trace while dumping */
-	tracer_enabled = 0;
-}
-
-static void wakeup_tracer_close(struct trace_iterator *iter)
-{
-	/* forget about any processes we were recording */
-	if (save_tracer_enabled) {
-		wakeup_reset(iter->tr);
-		tracer_enabled = 1;
-	}
 }
 
 static struct tracer wakeup_tracer __read_mostly =
@@ -381,8 +354,6 @@ static struct tracer wakeup_tracer __read_mostly =
 	.reset		= wakeup_tracer_reset,
 	.start		= wakeup_tracer_start,
 	.stop		= wakeup_tracer_stop,
-	.open		= wakeup_tracer_open,
-	.close		= wakeup_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
-- 
cgit v1.1


From 97b17efe4537e11bf6669106cfe4ee2c5331b267 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 15:24:56 -0500
Subject: ring-buffer: do not swap if recording is disabled

If the ring buffer recording has been disabled. Do not let
swapping of ring buffers occur. Simply return -EAGAIN.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 15 +++++++++++++++
 kernel/trace/trace.c       |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0b9de5a..890020e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2266,9 +2266,24 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	if (buffer_a->pages != buffer_b->pages)
 		return -EINVAL;
 
+	if (ring_buffer_flags != RB_BUFFERS_ON)
+		return -EAGAIN;
+
+	if (atomic_read(&buffer_a->record_disabled))
+		return -EAGAIN;
+
+	if (atomic_read(&buffer_b->record_disabled))
+		return -EAGAIN;
+
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
+	if (atomic_read(&cpu_buffer_a->record_disabled))
+		return -EAGAIN;
+
+	if (atomic_read(&cpu_buffer_b->record_disabled))
+		return -EAGAIN;
+
 	/*
 	 * We can't do a synchronize_sched here because this
 	 * function can be called in atomic context.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 220c264..757ae6f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -415,7 +415,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	ftrace_enable_cpu();
 
-	WARN_ON_ONCE(ret);
+	WARN_ON_ONCE(ret && ret != -EAGAIN);
 
 	__update_max_tr(tr, tsk, cpu);
 	__raw_spin_unlock(&ftrace_max_lock);
-- 
cgit v1.1


From 3244351c31211a8b1ba8b4b34c3de04d5dfa03e4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 16:24:46 -0500
Subject: trace: separate out rt tasks from wakeup tracer

Impact: add option to trace all tasks or just RT tasks

The current wakeup tracer only traces RT task wakeups. This is
fine for those interested in wake up timings of RT tasks, but
it is useless for those that are interested in the causes
of long wakeups for non RT tasks.

This patch creates a "wakeup_rt" to implement the tracing of just
RT tasks (as the current "wakeup" does). And makes "wakeup" now
trace all tasks as an average developer would expect.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e27adef..f489578 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -25,6 +25,7 @@ static int __read_mostly	tracer_enabled;
 static struct task_struct	*wakeup_task;
 static int			wakeup_cpu;
 static unsigned			wakeup_prio = -1;
+static int			wakeup_rt;
 
 static raw_spinlock_t wakeup_lock =
 	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -224,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	tracing_record_cmdline(p);
 	tracing_record_cmdline(current);
 
-	if (likely(!rt_task(p)) ||
+	if ((wakeup_rt && !rt_task(p)) ||
 			p->prio >= wakeup_prio ||
 			p->prio >= current->prio)
 		return;
@@ -321,7 +322,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 	unregister_trace_sched_wakeup(probe_wakeup);
 }
 
-static int wakeup_tracer_init(struct trace_array *tr)
+static int __wakeup_tracer_init(struct trace_array *tr)
 {
 	tracing_max_latency = 0;
 	wakeup_trace = tr;
@@ -329,6 +330,18 @@ static int wakeup_tracer_init(struct trace_array *tr)
 	return 0;
 }
 
+static int wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_rt = 0;
+	return __wakeup_tracer_init(tr);
+}
+
+static int wakeup_rt_tracer_init(struct trace_array *tr)
+{
+	wakeup_rt = 1;
+	return __wakeup_tracer_init(tr);
+}
+
 static void wakeup_tracer_reset(struct trace_array *tr)
 {
 	stop_wakeup_tracer(tr);
@@ -360,6 +373,19 @@ static struct tracer wakeup_tracer __read_mostly =
 #endif
 };
 
+static struct tracer wakeup_rt_tracer __read_mostly =
+{
+	.name		= "wakeup_rt",
+	.init		= wakeup_rt_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.start		= wakeup_tracer_start,
+	.stop		= wakeup_tracer_stop,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+
 __init static int init_wakeup_tracer(void)
 {
 	int ret;
@@ -368,6 +394,10 @@ __init static int init_wakeup_tracer(void)
 	if (ret)
 		return ret;
 
+	ret = register_tracer(&wakeup_rt_tracer);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 device_initcall(init_wakeup_tracer);
-- 
cgit v1.1


From f8ec1062f589cdb1cffcffab1376124a1bc08500 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 17:17:04 -0500
Subject: wakeup-tracer: show scheduling data in output

Impact: better data for wakeup tracer

This patch adds the wakeup and schedule calls that are used by
the scheduler tracer to make the wakeup tracer more readable.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f489578..93cecda 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -153,6 +153,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 		goto out_unlock;
 
 	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc);
 
 	/*
 	 * usecs conversion is slow so we try to delay the conversion
@@ -214,6 +215,7 @@ static void wakeup_reset(struct trace_array *tr)
 static void
 probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 {
+	struct trace_array_cpu *data;
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	long disabled;
@@ -253,9 +255,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 
 	local_save_flags(flags);
 
-	wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
-	trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
-		       CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	data = wakeup_trace->data[wakeup_cpu];
+	data->preempt_timestamp = ftrace_now(cpu);
+	tracing_sched_wakeup_trace(wakeup_trace, data, p, current,
+				   flags, pc);
+	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2,
+		       flags, pc);
 
 out_locked:
 	__raw_spin_unlock(&wakeup_lock);
-- 
cgit v1.1


From 69507c06539332e6e49f83aa478844130233bece Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 18:45:57 -0500
Subject: ring-buffer: reset timestamps when ring buffer is reset

Impact: fix bad times of recent resets

The ring buffer needs to reset its timestamps when reseting of the
buffer, otherwise the timestamps are stale and might be used to
calculate times in the buffer causing funny timestamps to appear.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 890020e..7839280 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2166,6 +2166,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;
+
+	cpu_buffer->write_stamp = 0;
+	cpu_buffer->read_stamp = 0;
 }
 
 /**
-- 
cgit v1.1


From 94523e818f283d3c69f621406f633afff46dbf82 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Jan 2009 11:18:06 -0500
Subject: trace: remove internal irqsoff disabling for trace output

Impact: cleanup of duplicate features

The trace output disables the ring buffer and prevents tracing to
occur. The code in irqsoff to do the same thing is no longer needed.
This patch removes it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_irqsoff.c | 34 ++--------------------------------
 1 file changed, 2 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 62a78d9..ed344b0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,28 +353,18 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
 	register_ftrace_function(&trace_ops);
-	if (tracing_is_enabled()) {
+	if (tracing_is_enabled())
 		tracer_enabled = 1;
-		save_tracer_enabled = 1;
-	} else {
+	else
 		tracer_enabled = 0;
-		save_tracer_enabled = 0;
-	}
 }
 
 static void stop_irqsoff_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 }
 
@@ -395,25 +385,11 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
 static void irqsoff_tracer_start(struct trace_array *tr)
 {
 	tracer_enabled = 1;
-	save_tracer_enabled = 1;
 }
 
 static void irqsoff_tracer_stop(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
-}
-
-static void irqsoff_tracer_open(struct trace_iterator *iter)
-{
-	/* stop the trace while dumping */
-	tracer_enabled = 0;
-}
-
-static void irqsoff_tracer_close(struct trace_iterator *iter)
-{
-	/* restart tracing */
-	tracer_enabled = save_tracer_enabled;
 }
 
 #ifdef CONFIG_IRQSOFF_TRACER
@@ -431,8 +407,6 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.reset		= irqsoff_tracer_reset,
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
-	.open		= irqsoff_tracer_open,
-	.close		= irqsoff_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_irqsoff,
@@ -459,8 +433,6 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.reset		= irqsoff_tracer_reset,
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
-	.open		= irqsoff_tracer_open,
-	.close		= irqsoff_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptoff,
@@ -489,8 +461,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.reset		= irqsoff_tracer_reset,
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
-	.open		= irqsoff_tracer_open,
-	.close		= irqsoff_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptirqsoff,
-- 
cgit v1.1


From b06a830183b610c0a88c29a92feb7991a867ab46 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Jan 2009 14:26:15 -0500
Subject: trace: fix logic to start/stop counting

The logic in the tracing_start/stop code prevents the WARN_ON
from ever detecting if a start/stop pair was mismatched.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 757ae6f..2129ab9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -610,13 +610,12 @@ void tracing_start(void)
 		return;
 
 	spin_lock_irqsave(&tracing_start_lock, flags);
-	if (--trace_stop_count)
-		goto out;
-
-	if (trace_stop_count < 0) {
-		/* Someone screwed up their debugging */
-		WARN_ON_ONCE(1);
-		trace_stop_count = 0;
+	if (--trace_stop_count) {
+		if (trace_stop_count < 0) {
+			/* Someone screwed up their debugging */
+			WARN_ON_ONCE(1);
+			trace_stop_count = 0;
+		}
 		goto out;
 	}
 
-- 
cgit v1.1


From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Jan 2009 19:01:40 -0500
Subject: trace, lockdep: manual preempt count adding for local_bh_disable

Impact: fix to preempt trace triggering lockdep check_flag failure

In local_bh_disable, the use of add_preempt_count causes the
preempt tracer to start recording the time preemption is off.
But because it already modified the preempt_count to show
softirqs disabled, and before it called the lockdep code to
handle this, it causes a state that lockdep can not handle.

The preempt tracer will reset the ring buffer on start of a trace,
and the ring buffer reset code does a spin_lock_irqsave. This
calls into lockdep and lockdep will fail when it detects the
invalid state of having softirqs disabled but the internal
current->softirqs_enabled is still set.

The fix is to manually add the SOFTIRQ_OFFSET to preempt count
and call the preempt tracer code outside the lockdep critical
area.

Thanks to Peter Zijlstra for suggesting this solution.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c   |  8 ++++----
 kernel/softirq.c | 13 ++++++++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c..c154825 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4399,10 +4399,7 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-				defined(CONFIG_PREEMPT_TRACER))
-
-static inline unsigned long get_parent_ip(unsigned long addr)
+unsigned long get_parent_ip(unsigned long addr)
 {
 	if (in_lock_functions(addr)) {
 		addr = CALLER_ADDR2;
@@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
 	return addr;
 }
 
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
 void __kprobes add_preempt_count(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de..6edfc2c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,6 +21,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
 
@@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip)
 	WARN_ON_ONCE(in_irq());
 
 	raw_local_irq_save(flags);
-	add_preempt_count(SOFTIRQ_OFFSET);
+	/*
+	 * The preempt tracer hooks into add_preempt_count and will break
+	 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
+	 * is set and before current->softirq_enabled is cleared.
+	 * We must manually increment preempt_count here and manually
+	 * call the trace_preempt_off later.
+	 */
+	preempt_count() += SOFTIRQ_OFFSET;
 	/*
 	 * Were softirqs turned off above:
 	 */
 	if (softirq_count() == SOFTIRQ_OFFSET)
 		trace_softirqs_off(ip);
 	raw_local_irq_restore(flags);
+
+	if (preempt_count() == SOFTIRQ_OFFSET)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 #else /* !CONFIG_TRACE_IRQFLAGS */
 static inline void __local_bh_disable(unsigned long ip)
-- 
cgit v1.1


From 9005f3ebebfcfe9ccd731d16c468907a35ac1f9a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 22 Jan 2009 17:04:53 -0800
Subject: tracing/function-graph-tracer: various fixes and features

This patch brings various bugfixes:

- Drop the first irrelevant task switch on the very beginning of a trace.

- Drop the OVERHEAD word from the headers, the DURATION word is sufficient
  and will not overlap other columns.

- Make the headers fit well their respective columns whatever the
  selected options.

Ie, default options:

 # tracer: function_graph
 #
 # CPU  DURATION                  FUNCTION CALLS
 # |     |   |                     |   |   |   |

  1)   0.646 us    |                    }
  1)               |                    mem_cgroup_del_lru_list() {
  1)   0.624 us    |                      lookup_page_cgroup();
  1)   1.970 us    |                    }

 echo funcgraph-proc > trace_options

 # tracer: function_graph
 #
 # CPU  TASK/PID        DURATION                  FUNCTION CALLS
 # |    |    |           |   |                     |   |   |   |

  0)   bash-2937    |   0.895 us    |                }
  0)   bash-2937    |   0.888 us    |                __rcu_read_unlock();
  0)   bash-2937    |   0.864 us    |                conv_uni_to_pc();
  0)   bash-2937    |   1.015 us    |                __rcu_read_lock();

 echo nofuncgraph-cpu > trace_options
 echo nofuncgraph-proc > trace_options

 # tracer: function_graph
 #
 #   DURATION                  FUNCTION CALLS
 #    |   |                     |   |   |   |

   3.752 us    |                  native_pud_val();
   0.616 us    |                  native_pud_val();
   0.624 us    |                  native_pmd_val();

About features, one can now disable the duration (this will hide the
overhead too for convenient reasons and because on  doesn't need
overhead if it hasn't the duration):

 echo nofuncgraph-duration > trace_options

 # tracer: function_graph
 #
 #                FUNCTION CALLS
 #                |   |   |   |

           cap_vm_enough_memory() {
             __vm_enough_memory() {
               vm_acct_memory();
             }
           }
         }

And at last, an option to print the absolute time:

 //Restart from default options
 echo funcgraph-abstime > trace_options

 # tracer: function_graph
 #
 #      TIME       CPU  DURATION                  FUNCTION CALLS
 #       |         |     |   |                     |   |   |   |

   261.339774 |   1) + 42.823 us   |    }
   261.339775 |   1)   1.045 us    |    _spin_lock_irq();
   261.339777 |   1)   0.940 us    |    _spin_lock_irqsave();
   261.339778 |   1)   0.752 us    |    _spin_unlock_irqrestore();
   261.339780 |   1)   0.857 us    |    _spin_unlock_irq();
   261.339782 |   1)               |    flush_to_ldisc() {
   261.339783 |   1)               |      tty_ldisc_ref() {
   261.339783 |   1)               |        tty_ldisc_try() {
   261.339784 |   1)   1.075 us    |          _spin_lock_irqsave();
   261.339786 |   1)   0.842 us    |          _spin_unlock_irqrestore();
   261.339788 |   1)   4.211 us    |        }
   261.339788 |   1)   5.662 us    |      }

The format is seconds.usecs.

I guess no one needs the nanosec precision here, the main goal is to have
an overview about the general timings of events, and to see the place when
the trace switches from one cpu to another.

ie:

   274.874760 |   1)   0.676 us    |      _spin_unlock();
   274.874762 |   1)   0.609 us    |      native_load_sp0();
   274.874763 |   1)   0.602 us    |      native_load_tls();
   274.878739 |   0)   0.722 us    |                  }
   274.878740 |   0)   0.714 us    |                  native_pmd_val();
   274.878741 |   0)   0.730 us    |                  native_pmd_val();

Here there is a 4000 usecs difference when we switch the cpu.

Changes in V2:

- Completely fix the first pointless task switch.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 298 +++++++++++++++++++++++------------
 1 file changed, 195 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3c54598..66fc7b8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,7 +1,7 @@
 /*
  *
  * Function graph tracer.
- * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
  * Mostly borrowed from function tracer which
  * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
  *
@@ -21,9 +21,11 @@
 #define TRACE_GRAPH_PRINT_CPU		0x2
 #define TRACE_GRAPH_PRINT_OVERHEAD	0x4
 #define TRACE_GRAPH_PRINT_PROC		0x8
+#define TRACE_GRAPH_PRINT_DURATION	0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME	0X20
 
 static struct tracer_opt trace_opts[] = {
-	/* Display overruns ? */
+	/* Display overruns? (for self-debug purpose) */
 	{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
 	/* Display CPU ? */
 	{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -31,17 +33,22 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
 	/* Display proc name/pid */
 	{ TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
+	/* Display duration of execution */
+	{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
+	/* Display absolute time of an entry */
+	{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
 	{ } /* Empty entry */
 };
 
 static struct tracer_flags tracer_flags = {
 	/* Don't display overruns and proc by default */
-	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
+	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
+	       TRACE_GRAPH_PRINT_DURATION,
 	.opts = trace_opts
 };
 
 /* pid on the last trace processed */
-static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+
 
 static int graph_trace_init(struct trace_array *tr)
 {
@@ -154,17 +161,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
 
 /* If the pid changed since the last trace, output this event */
 static enum print_line_t
-verif_pid(struct trace_seq *s, pid_t pid, int cpu)
+verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
 {
 	pid_t prev_pid;
+	pid_t *last_pid;
 	int ret;
 
-	if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
+	if (!last_pids_cpu)
+		return TRACE_TYPE_HANDLED;
+
+	last_pid = per_cpu_ptr(last_pids_cpu, cpu);
+
+	if (*last_pid == pid)
 		return TRACE_TYPE_HANDLED;
 
-	prev_pid = last_pid[cpu];
-	last_pid[cpu] = pid;
+	prev_pid = *last_pid;
+	*last_pid = pid;
 
+	if (prev_pid == -1)
+		return TRACE_TYPE_HANDLED;
 /*
  * Context-switch trace line:
 
@@ -232,9 +247,34 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 	return true;
 }
 
+/* Signal a overhead of time execution to the output */
+static int
+print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+{
+	/* If duration disappear, we don't need anything */
+	if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+		return 1;
+
+	/* Non nested entry or return */
+	if (duration == -1)
+		return trace_seq_printf(s, "  ");
+
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		/* Duration exceeded 100 msecs */
+		if (duration > 100000ULL)
+			return trace_seq_printf(s, "! ");
+
+		/* Duration exceeded 10 msecs */
+		if (duration > 10000ULL)
+			return trace_seq_printf(s, "+ ");
+	}
+
+	return trace_seq_printf(s, "  ");
+}
+
 static enum print_line_t
 print_graph_irq(struct trace_seq *s, unsigned long addr,
-				enum trace_type type, int cpu, pid_t pid)
+		enum trace_type type, int cpu, pid_t pid)
 {
 	int ret;
 
@@ -242,35 +282,40 @@ print_graph_irq(struct trace_seq *s, unsigned long addr,
 		addr >= (unsigned long)__irqentry_text_end)
 		return TRACE_TYPE_UNHANDLED;
 
-	if (type == TRACE_GRAPH_ENT) {
-		ret = trace_seq_printf(s, "==========> |  ");
-	} else {
-		/* Cpu */
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-			ret = print_graph_cpu(s, cpu);
-			if (ret == TRACE_TYPE_PARTIAL_LINE)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-		/* Proc */
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-			ret = print_graph_proc(s, pid);
-			if (ret == TRACE_TYPE_PARTIAL_LINE)
-				return TRACE_TYPE_PARTIAL_LINE;
+	/* Cpu */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+		ret = print_graph_cpu(s, cpu);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+	/* Proc */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+		ret = print_graph_proc(s, pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+		ret = trace_seq_printf(s, " | ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
-			ret = trace_seq_printf(s, " | ");
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
+	/* No overhead */
+	ret = print_graph_overhead(-1, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		/* No overhead */
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-			ret = trace_seq_printf(s, "  ");
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
+	if (type == TRACE_GRAPH_ENT)
+		ret = trace_seq_printf(s, "==========>");
+	else
+		ret = trace_seq_printf(s, "<==========");
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Don't close the duration column if haven't one */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+		trace_seq_printf(s, " |");
+	ret = trace_seq_printf(s, "\n");
 
-		ret = trace_seq_printf(s, "<========== |\n");
-	}
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 	return TRACE_TYPE_HANDLED;
@@ -289,7 +334,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 	sprintf(msecs_str, "%lu", (unsigned long) duration);
 
 	/* Print msecs */
-	ret = trace_seq_printf(s, msecs_str);
+	ret = trace_seq_printf(s, "%s", msecs_str);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -322,19 +367,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 }
 
-/* Signal a overhead of time execution to the output */
-static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+static int print_graph_abs_time(u64 t, struct trace_seq *s)
 {
-	/* Duration exceeded 100 msecs */
-	if (duration > 100000ULL)
-		return trace_seq_printf(s, "! ");
+	unsigned long usecs_rem;
 
-	/* Duration exceeded 10 msecs */
-	if (duration > 10000ULL)
-		return trace_seq_printf(s, "+ ");
+	usecs_rem = do_div(t, 1000000000);
+	usecs_rem /= 1000;
 
-	return trace_seq_printf(s, "  ");
+	return trace_seq_printf(s, "%5lu.%06lu |  ",
+			(unsigned long)t, usecs_rem);
 }
 
 /* Case of a leaf function on its call entry */
@@ -357,16 +398,16 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	duration = graph_ret->rettime - graph_ret->calltime;
 
 	/* Overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = print_graph_overhead(duration, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
+	ret = print_graph_overhead(duration, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Duration */
-	ret = print_graph_duration(duration, s);
-	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+		ret = print_graph_duration(duration, s);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -395,25 +436,17 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	struct ftrace_graph_ent *call = &entry->graph_ent;
 
 	/* No overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = trace_seq_printf(s, "  ");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
+	ret = print_graph_overhead(-1, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Interrupt */
-	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
-	if (ret == TRACE_TYPE_UNHANDLED) {
-		/* No time */
+	/* No time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
 		ret = trace_seq_printf(s, "            |  ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-	} else {
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
 		ret = trace_seq_printf(s, " ");
@@ -434,15 +467,30 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
-			struct trace_iterator *iter, int cpu)
+			struct trace_iterator *iter)
 {
 	int ret;
+	int cpu = iter->cpu;
+	pid_t *last_entry = iter->private;
 	struct trace_entry *ent = iter->ent;
+	struct ftrace_graph_ent *call = &field->graph_ent;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Interrupt */
+	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
@@ -470,16 +518,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
-		   struct trace_entry *ent, int cpu)
+		   struct trace_entry *ent, struct trace_iterator *iter)
 {
 	int i;
 	int ret;
+	int cpu = iter->cpu;
+	pid_t *last_pid = iter->private;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
@@ -499,16 +556,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	}
 
 	/* Overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = print_graph_overhead(duration, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
+	ret = print_graph_overhead(duration, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Duration */
-	ret = print_graph_duration(duration, s);
-	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+		ret = print_graph_duration(duration, s);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Closing brace */
 	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -542,14 +599,23 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 {
 	int i;
 	int ret;
+	int cpu = iter->cpu;
+	pid_t *last_pid = iter->private;
+
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-		ret = print_graph_cpu(s, iter->cpu);
+		ret = print_graph_cpu(s, cpu);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
@@ -566,17 +632,17 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	}
 
 	/* No overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = trace_seq_printf(s, "  ");
+	ret = print_graph_overhead(-1, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* No time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+		ret = trace_seq_printf(s, "            |  ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	/* No time */
-	ret = trace_seq_printf(s, "            |  ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	/* Indentation */
 	if (trace->depth > 0)
 		for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
@@ -614,13 +680,12 @@ print_graph_function(struct trace_iterator *iter)
 	case TRACE_GRAPH_ENT: {
 		struct ftrace_graph_ent_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_entry(field, s, iter,
-					 iter->cpu);
+		return print_graph_entry(field, s, iter);
 	}
 	case TRACE_GRAPH_RET: {
 		struct ftrace_graph_ret_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_return(&field->ret, s, entry, iter->cpu);
+		return print_graph_return(&field->ret, s, entry, iter);
 	}
 	case TRACE_PRINT: {
 		struct print_entry *field;
@@ -636,28 +701,55 @@ static void print_graph_headers(struct seq_file *s)
 {
 	/* 1st line */
 	seq_printf(s, "# ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+		seq_printf(s, "     TIME       ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
-		seq_printf(s, "CPU ");
+		seq_printf(s, "CPU");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
-		seq_printf(s, "TASK/PID     ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
-		seq_printf(s, "OVERHEAD/");
-	seq_printf(s, "DURATION            FUNCTION CALLS\n");
+		seq_printf(s, "  TASK/PID      ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+		seq_printf(s, "  DURATION   ");
+	seq_printf(s, "               FUNCTION CALLS\n");
 
 	/* 2nd line */
 	seq_printf(s, "# ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+		seq_printf(s, "      |         ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
-		seq_printf(s, "|   ");
+		seq_printf(s, "|  ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
-		seq_printf(s, "|      |     ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		seq_printf(s, "|        ");
-		seq_printf(s, "|                   |   |   |   |\n");
-	} else
-		seq_printf(s, "    |               |   |   |   |\n");
+		seq_printf(s, "  |    |        ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+		seq_printf(s, "   |   |      ");
+	seq_printf(s, "               |   |   |   |\n");
 }
+
+static void graph_trace_open(struct trace_iterator *iter)
+{
+	/* pid on the last trace processed */
+	pid_t *last_pid = alloc_percpu(pid_t);
+	int cpu;
+
+	if (!last_pid)
+		pr_warning("function graph tracer: not enough memory\n");
+	else
+		for_each_possible_cpu(cpu) {
+			pid_t *pid = per_cpu_ptr(last_pid, cpu);
+			*pid = -1;
+		}
+
+	iter->private = last_pid;
+}
+
+static void graph_trace_close(struct trace_iterator *iter)
+{
+	percpu_free(iter->private);
+}
+
 static struct tracer graph_trace __read_mostly = {
 	.name	     	= "function_graph",
+	.open		= graph_trace_open,
+	.close		= graph_trace_close,
 	.init	     	= graph_trace_init,
 	.reset	     	= graph_trace_reset,
 	.print_line	= print_graph_function,
-- 
cgit v1.1


From cc2f6d90e950b69ad31d483c19cc1d121bb25c16 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 23 Jan 2009 13:03:37 -0800
Subject: kmemtrace: fix printk format warnings

Fix kmemtrace printk warnings:

  kernel/trace/kmemtrace.c:142: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t'
  kernel/trace/kmemtrace.c:147: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 7ebc58c..72b326b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Requested */
-	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_req);
+	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_req);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Allocated */
-	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_alloc);
+	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_alloc);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.1


From 9011262a37cb438f0fa9394b5e83840db8f9680a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 23 Jan 2009 12:06:23 -0200
Subject: ftrace: add ftrace_vprintk

Impact: new helper function

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2129ab9..2f8ac1f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2951,6 +2951,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(__ftrace_printk);
 
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+{
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
-- 
cgit v1.1


From c71a896154119f4ca9e89d6078f5f63ad60ef199 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 23 Jan 2009 12:06:27 -0200
Subject: blktrace: add ftrace plugin

Impact: New way of using the blktrace infrastructure

This drops the requirement of userspace utilities to use the blktrace
facility.

Configuration is done thru sysfs, adding a "trace" directory to the
partition directory where blktrace can be enabled for the associated
request_queue.

The same filters present in the IOCTL interface are present as sysfs
device attributes.

The /sys/block/sdX/sdXN/trace/enable file allows tracing without any
filters.

The other files in this directory: pid, act_mask, start_lba and end_lba
can be used with the same meaning as with the IOCTL interface.

Using the sysfs interface will only setup the request_queue->blk_trace
fields, tracing will only take place when the "blk" tracer is selected
via the ftrace interface, as in the following example:

To see the trace, one can use the /d/tracing/trace file or the
/d/tracign/trace_pipe file, with semantics defined in the ftrace
documentation in Documentation/ftrace.txt.

[root@f10-1 ~]# cat /t/trace
       kjournald-305   [000]  3046.491224:   8,1    A WBS 6367 + 8 <- (8,1) 6304
       kjournald-305   [000]  3046.491227:   8,1    Q   R 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491236:   8,1    G  RB 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491239:   8,1    P  NS [kjournald]
       kjournald-305   [000]  3046.491242:   8,1    I RBS 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491251:   8,1    D  WB 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491610:   8,1    U  WS [kjournald] 1
          <idle>-0     [000]  3046.511914:   8,1    C  RS 6367 + 8 [6367]
[root@f10-1 ~]#

The default line context (prefix) format is the one described in the ftrace
documentation, with the blktrace specific bits using its existing format,
described in blkparse(8).

If one wants to have the classic blktrace formatting, this is possible by
using:

[root@f10-1 ~]# echo blk_classic > /t/trace_options
[root@f10-1 ~]# cat /t/trace
  8,1    0  3046.491224   305  A WBS 6367 + 8 <- (8,1) 6304
  8,1    0  3046.491227   305  Q   R 6367 + 8 [kjournald]
  8,1    0  3046.491236   305  G  RB 6367 + 8 [kjournald]
  8,1    0  3046.491239   305  P  NS [kjournald]
  8,1    0  3046.491242   305  I RBS 6367 + 8 [kjournald]
  8,1    0  3046.491251   305  D  WB 6367 + 8 [kjournald]
  8,1    0  3046.491610   305  U  WS [kjournald] 1
  8,1    0  3046.511914     0  C  RS 6367 + 8 [6367]
[root@f10-1 ~]#

Using the ftrace standard format allows more flexibility, such
as the ability of asking for backtraces via trace_options:

[root@f10-1 ~]# echo noblk_classic > /t/trace_options
[root@f10-1 ~]# echo stacktrace > /t/trace_options

[root@f10-1 ~]# cat /t/trace
       kjournald-305   [000]  3318.826779:   8,1    A WBS 6375 + 8 <- (8,1) 6312
       kjournald-305   [000]  3318.826782:
 <= submit_bio
 <= submit_bh
 <= sync_dirty_buffer
 <= journal_commit_transaction
 <= kjournald
 <= kthread
 <= child_rip
       kjournald-305   [000]  3318.826836:   8,1    Q   R 6375 + 8 [kjournald]
       kjournald-305   [000]  3318.826837:
 <= generic_make_request
 <= submit_bio
 <= submit_bh
 <= sync_dirty_buffer
 <= journal_commit_transaction
 <= kjournald
 <= kthread

Please read the ftrace documentation to use aditional, standardized
tracing filters such as /d/tracing/trace_cpumask, etc.

See also /d/tracing/trace_mark to add comments in the trace stream,
that is equivalent to the /d/block/sdaN/msg interface.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b96037d..e603a29 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -32,6 +32,7 @@ enum trace_type {
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
+	TRACE_BLK,
 
 	__TRACE_LAST_TYPE,
 };
-- 
cgit v1.1


From f04109bf1be7449e27d38ae1bb8465013374bd49 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 28 Jan 2009 13:02:12 -0200
Subject: trace: Use tracing_reset_online_cpus in more places
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c          | 6 +-----
 kernel/trace/trace_functions_graph.c | 8 ++------
 kernel/trace/trace_nop.c             | 6 +-----
 3 files changed, 4 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ca017e0..1284145 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -133,11 +133,7 @@ static void stop_branch_trace(struct trace_array *tr)
 
 static int branch_trace_init(struct trace_array *tr)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
+	tracing_reset_online_cpus(tr);
 	start_branch_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66fc7b8..c97594d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,15 +52,11 @@ static struct tracer_flags tracer_flags = {
 
 static int graph_trace_init(struct trace_array *tr)
 {
-	int cpu, ret;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
-	ret = register_ftrace_graph(&trace_graph_return,
+	int ret = register_ftrace_graph(&trace_graph_return,
 					&trace_graph_entry);
 	if (ret)
 		return ret;
+	tracing_reset_online_cpus(tr);
 	tracing_start_cmdline_record();
 
 	return 0;
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index b9767ac..087b6cb 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -47,12 +47,8 @@ static void stop_nop_trace(struct trace_array *tr)
 
 static int nop_trace_init(struct trace_array *tr)
 {
-	int cpu;
 	ctx_trace = tr;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
+	tracing_reset_online_cpus(tr);
 	start_nop_trace(tr);
 	return 0;
 }
-- 
cgit v1.1


From b3a8c34886d0e3dd3a24a5b614ee025181da2f41 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 28 Jan 2009 13:08:37 -0200
Subject: trace_sched_wakeup: Remove unused variable

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 93cecda..a48c9b4 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -184,13 +184,10 @@ out:
 
 static void __wakeup_reset(struct trace_array *tr)
 {
-	struct trace_array_cpu *data;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		data = tr->data[cpu];
+	for_each_possible_cpu(cpu)
 		tracing_reset(tr, cpu);
-	}
 
 	wakeup_cpu = -1;
 	wakeup_prio = -1;
-- 
cgit v1.1


From ecf441b593ac41cb8cd8cd3695110167c42e098c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 29 Jan 2009 13:49:45 -0800
Subject: kmemtrace: fix printk formats, fix

Geert Uytterhoeven wrote:

> %4zu?

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 72b326b..f04c062 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Requested */
-	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_req);
+	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_req);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Allocated */
-	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_alloc);
+	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_alloc);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.1


From b2821ae68b14480bfc85ea1629537163310bc5cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Feb 2009 21:38:32 -0500
Subject: trace: fix default boot up tracer

Peter Zijlstra started the functionality to start up a default
tracing at bootup. This patch finishes the work.

Now if you add 'ftrace=<tracer>' to the command line, when that tracer
is registered on bootup, that tracer is selected and starts tracing.

Note, all selftests for tracers that are registered after this tracer
is disabled. This prevents the selftests from disturbing the running
tracer, or the running tracer from disturbing the selftest.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2f8ac1f..2c720c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -53,6 +53,11 @@ unsigned long __read_mostly	tracing_thresh;
  */
 static bool __read_mostly tracing_selftest_running;
 
+/*
+ * If a tracer is running, we do not want to run SELFTEST.
+ */
+static bool __read_mostly tracing_selftest_disabled;
+
 /* For tracers that don't implement custom flags */
 static struct tracer_opt dummy_tracer_opt[] = {
 	{ }
@@ -110,14 +115,19 @@ static cpumask_var_t __read_mostly	tracing_buffer_mask;
  */
 int ftrace_dump_on_oops;
 
-static int tracing_set_tracer(char *buf);
+static int tracing_set_tracer(const char *buf);
+
+#define BOOTUP_TRACER_SIZE		100
+static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
+static char *default_bootup_tracer;
 
 static int __init set_ftrace(char *str)
 {
-	tracing_set_tracer(str);
+	strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
+	default_bootup_tracer = bootup_tracer_buf;
 	return 1;
 }
-__setup("ftrace", set_ftrace);
+__setup("ftrace=", set_ftrace);
 
 static int __init set_ftrace_dump_on_oops(char *str)
 {
@@ -468,7 +478,7 @@ int register_tracer(struct tracer *type)
 			type->flags->opts = dummy_tracer_opt;
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
-	if (type->selftest) {
+	if (type->selftest && !tracing_selftest_disabled) {
 		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
 		int i;
@@ -510,8 +520,25 @@ int register_tracer(struct tracer *type)
  out:
 	tracing_selftest_running = false;
 	mutex_unlock(&trace_types_lock);
-	lock_kernel();
 
+	if (!ret && default_bootup_tracer) {
+		if (!strncmp(default_bootup_tracer, type->name,
+			     BOOTUP_TRACER_SIZE)) {
+			printk(KERN_INFO "Starting tracer '%s'\n",
+			       type->name);
+			/* Do we want this tracer to start on bootup? */
+			tracing_set_tracer(type->name);
+			default_bootup_tracer = NULL;
+			/* disable other selftests, since this will break it. */
+			tracing_selftest_disabled = 1;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+			printk(KERN_INFO "Disabling FTRACE selftests due"
+			       " to running tracer '%s'\n", type->name);
+#endif
+		}
+	}
+
+	lock_kernel();
 	return ret;
 }
 
@@ -2245,7 +2272,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-static int tracing_set_tracer(char *buf)
+static int tracing_set_tracer(const char *buf)
 {
 	struct trace_array *tr = &global_trace;
 	struct tracer *t;
@@ -3163,5 +3190,26 @@ out_free_buffer_mask:
 out:
 	return ret;
 }
+
+__init static int clear_boot_tracer(void)
+{
+	/*
+	 * The default tracer at boot buffer is an init section.
+	 * This function is called in lateinit. If we did not
+	 * find the boot tracer, then clear it out, to prevent
+	 * later registration from accessing the buffer that is
+	 * about to be freed.
+	 */
+	if (!default_bootup_tracer)
+		return 0;
+
+	printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
+	       default_bootup_tracer);
+	default_bootup_tracer = NULL;
+
+	return 0;
+}
+
 early_initcall(tracer_alloc_buffers);
 fs_initcall(tracer_init_debugfs);
+late_initcall(clear_boot_tracer);
-- 
cgit v1.1


From 79fb0768fbd371f3b94d909f51f587b3a24ab272 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Feb 2009 21:38:33 -0500
Subject: trace: let boot trace be chosen by command line

Now that we have a working ftrace=<tracer> function, make the boot
tracer get activated by it. This way we can turn it on or off without
recompiling the kernel, as well as keeping the selftests on. The
selftests are disabled whenever a default tracer starts running.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig      |  7 +++----
 kernel/trace/trace.c      |  5 +----
 kernel/trace/trace_boot.c | 11 +++++++----
 3 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index dde1d46..28f2644 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,9 +164,8 @@ config BOOT_TRACER
 	  representation of the delays during initcalls - but the raw
 	  /debug/tracing/trace text output is readable too.
 
-	  ( Note that tracing self tests can't be enabled if this tracer is
-	    selected, because the self-tests are an initcall as well and that
-	    would invalidate the boot trace. )
+	  You must pass in ftrace=initcall to the kernel command line
+	  to enable this on bootup.
 
 config TRACE_BRANCH_PROFILING
 	bool "Trace likely/unlikely profiler"
@@ -326,7 +325,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
+	depends on TRACING && DEBUG_KERNEL
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c720c7..40edef4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3167,12 +3167,9 @@ __init static int tracer_alloc_buffers(void)
 	trace_init_cmdlines();
 
 	register_tracer(&nop_trace);
+	current_trace = &nop_trace;
 #ifdef CONFIG_BOOT_TRACER
 	register_tracer(&boot_tracer);
-	current_trace = &boot_tracer;
-	current_trace->init(&global_trace);
-#else
-	current_trace = &nop_trace;
 #endif
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 0e94b3d..1f07895 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -28,13 +28,13 @@ void start_boot_trace(void)
 
 void enable_boot_trace(void)
 {
-	if (pre_initcalls_finished)
+	if (boot_trace && pre_initcalls_finished)
 		tracing_start_sched_switch_record();
 }
 
 void disable_boot_trace(void)
 {
-	if (pre_initcalls_finished)
+	if (boot_trace && pre_initcalls_finished)
 		tracing_stop_sched_switch_record();
 }
 
@@ -43,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
 	int cpu;
 	boot_trace = tr;
 
+	if (!tr)
+		return 0;
+
 	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
 
@@ -132,7 +135,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!pre_initcalls_finished)
+	if (!tr || !pre_initcalls_finished)
 		return;
 
 	/* Get its name now since this function could
@@ -164,7 +167,7 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!pre_initcalls_finished)
+	if (!tr || !pre_initcalls_finished)
 		return;
 
 	sprint_symbol(bt->func, (unsigned long)fn);
-- 
cgit v1.1


From c4a8e8be2d43cc22b371e8e9c05c253409759d94 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 2 Feb 2009 20:29:21 -0200
Subject: trace: better manage the context info for events

Impact: make trace_event more convenient for tracers

All tracers (for the moment) that use the struct trace_event want to
have the context info printed before their own output: the pid/cmdline,
cpu, and timestamp.

But some other tracers that want to implement their trace_event
callbacks will not necessary need these information or they may want to
format them as they want.

This patch adds a new default-enabled trace option:
TRACE_ITER_CONTEXT_INFO When disabled through:

echo nocontext-info > /debugfs/tracing/trace_options

The pid, cpu and timestamps headers will not be printed.

IE with the sched_switch tracer with context-info (default):

     bash-2935 [001] 100.356561: 2935:120:S ==> [001]  0:140:R <idle>
   <idle>-0    [000] 100.412804:    0:140:R   + [000] 11:115:S events/0
   <idle>-0    [000] 100.412816:    0:140:R ==> [000] 11:115:R events/0
 events/0-11   [000] 100.412829:   11:115:S ==> [000]  0:140:R <idle>

Without context-info:

 2935:120:S ==> [001]  0:140:R <idle>
    0:140:R   + [000] 11:115:S events/0
    0:140:R ==> [000] 11:115:R events/0
   11:115:S ==> [000]  0:140:R <idle>

A tracer can disable it at runtime by clearing the bit
TRACE_ITER_CONTEXT_INFO in trace_flags.

The print routines were renamed to trace_print_context and
trace_print_lat_context, so that they can be used by tracers if they
want to use them for one of the trace_event callbacks.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 149 +++++++++++---------------------------------
 kernel/trace/trace.h        |   7 ++-
 kernel/trace/trace_output.c | 107 +++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h |   3 +
 4 files changed, 151 insertions(+), 115 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2f8ac1f..5ec49c3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -227,7 +227,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -285,6 +285,7 @@ static const char *trace_options[] = {
 	"userstacktrace",
 	"sym-userobj",
 	"printk-msg-only",
+	"context-info",
 	NULL
 };
 
@@ -1171,8 +1172,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 }
 
 /* Find the next real entry, without updating the iterator itself */
-static struct trace_entry *
-find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+					  int *ent_cpu, u64 *ent_ts)
 {
 	return __find_next_entry(iter, ent_cpu, ent_ts);
 }
@@ -1351,57 +1352,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	seq_puts(m, "\n");
 }
 
-static void
-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
-{
-	int hardirq, softirq;
-	char *comm;
-
-	comm = trace_find_cmdline(entry->pid);
-
-	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
-	trace_seq_printf(s, "%3d", cpu);
-	trace_seq_printf(s, "%c%c",
-			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
-			 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
-			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
-
-	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
-	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-	if (hardirq && softirq) {
-		trace_seq_putc(s, 'H');
-	} else {
-		if (hardirq) {
-			trace_seq_putc(s, 'h');
-		} else {
-			if (softirq)
-				trace_seq_putc(s, 's');
-			else
-				trace_seq_putc(s, '.');
-		}
-	}
-
-	if (entry->preempt_count)
-		trace_seq_printf(s, "%x", entry->preempt_count);
-	else
-		trace_seq_puts(s, ".");
-}
-
-unsigned long preempt_mark_thresh = 100;
-
-static void
-lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
-		    unsigned long rel_usecs)
-{
-	trace_seq_printf(s, " %4lldus", abs_usecs);
-	if (rel_usecs > preempt_mark_thresh)
-		trace_seq_puts(s, "!: ");
-	else if (rel_usecs > 1)
-		trace_seq_puts(s, "+: ");
-	else
-		trace_seq_puts(s, " : ");
-}
-
 static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1419,46 +1369,24 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
 }
 
-static enum print_line_t
-print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
-	struct trace_entry *next_entry;
 	struct trace_event *event;
-	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
 	struct trace_entry *entry = iter->ent;
-	unsigned long abs_usecs;
-	unsigned long rel_usecs;
-	u64 next_ts;
-	char *comm;
 	int ret;
 
 	test_cpu_buff_start(iter);
 
-	next_entry = find_next_entry(iter, NULL, &next_ts);
-	if (!next_entry)
-		next_ts = iter->ts;
-	rel_usecs = ns2usecs(next_ts - iter->ts);
-	abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
-
-	if (verbose) {
-		comm = trace_find_cmdline(entry->pid);
-		trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
-				 " %ld.%03ldms (+%ld.%03ldms): ",
-				 comm,
-				 entry->pid, cpu, entry->flags,
-				 entry->preempt_count, trace_idx,
-				 ns2usecs(iter->ts),
-				 abs_usecs/1000,
-				 abs_usecs % 1000, rel_usecs/1000,
-				 rel_usecs % 1000);
-	} else {
-		lat_print_generic(s, entry, cpu);
-		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	event = ftrace_find_event(entry->type);
+
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		ret = trace_print_lat_context(iter);
+		if (ret)
+			return ret;
 	}
 
-	event = ftrace_find_event(entry->type);
 	if (event && event->latency_trace) {
 		ret = event->latency_trace(s, entry, sym_flags);
 		if (ret)
@@ -1476,33 +1404,20 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
 	struct trace_event *event;
-	unsigned long usec_rem;
-	unsigned long long t;
-	unsigned long secs;
-	char *comm;
 	int ret;
 
 	entry = iter->ent;
 
 	test_cpu_buff_start(iter);
 
-	comm = trace_find_cmdline(iter->ent->pid);
-
-	t = ns2usecs(iter->ts);
-	usec_rem = do_div(t, 1000000ULL);
-	secs = (unsigned long)t;
+	event = ftrace_find_event(entry->type);
 
-	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		ret = trace_print_context(iter);
+		if (ret)
+			return ret;
+	}
 
-	event = ftrace_find_event(entry->type);
 	if (event && event->trace) {
 		ret = event->trace(s, entry, sym_flags);
 		if (ret)
@@ -1525,10 +1440,12 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	ret = trace_seq_printf(s, "%d %d %llu ",
-		entry->pid, iter->cpu, iter->ts);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		ret = trace_seq_printf(s, "%d %d %llu ",
+			entry->pid, iter->cpu, iter->ts);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->raw) {
@@ -1553,9 +1470,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
-	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
-	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+		SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+		SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+	}
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->hex)
@@ -1575,7 +1494,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, field->buf);
+	ret = trace_seq_printf(s, "%s", field->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -1590,9 +1509,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	SEQ_PUT_FIELD_RET(s, entry->pid);
-	SEQ_PUT_FIELD_RET(s, entry->cpu);
-	SEQ_PUT_FIELD_RET(s, iter->ts);
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		SEQ_PUT_FIELD_RET(s, entry->pid);
+		SEQ_PUT_FIELD_RET(s, entry->cpu);
+		SEQ_PUT_FIELD_RET(s, iter->ts);
+	}
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->binary)
@@ -1643,7 +1564,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 		return print_raw_fmt(iter);
 
 	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-		return print_lat_fmt(iter, iter->idx, iter->cpu);
+		return print_lat_fmt(iter);
 
 	return print_trace_fmt(iter);
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e603a29..f0c7a0f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,6 +405,10 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
+
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+					  int *ent_cpu, u64 *ent_ts);
+
 void tracing_generic_entry_update(struct trace_entry *entry,
 				  unsigned long flags,
 				  int pc);
@@ -591,7 +595,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_ANNOTATE		= 0x2000,
 	TRACE_ITER_USERSTACKTRACE       = 0x4000,
 	TRACE_ITER_SYM_USEROBJ          = 0x8000,
-	TRACE_ITER_PRINTK_MSGONLY	= 0x10000
+	TRACE_ITER_PRINTK_MSGONLY	= 0x10000,
+	TRACE_ITER_CONTEXT_INFO		= 0x20000 /* Print pid/cpu/time */
 };
 
 /*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1a4e144..a5752d4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -286,6 +286,113 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	int hardirq, softirq;
+	char *comm;
+
+	comm = trace_find_cmdline(entry->pid);
+
+	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "%3d", cpu);
+	trace_seq_printf(s, "%c%c",
+			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+			 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
+			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq) {
+		trace_seq_putc(s, 'H');
+	} else {
+		if (hardirq) {
+			trace_seq_putc(s, 'h');
+		} else {
+			if (softirq)
+				trace_seq_putc(s, 's');
+			else
+				trace_seq_putc(s, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		trace_seq_printf(s, "%x", entry->preempt_count);
+	else
+		trace_seq_puts(s, ".");
+}
+
+static unsigned long preempt_mark_thresh = 100;
+
+static void
+lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
+		    unsigned long rel_usecs)
+{
+	trace_seq_printf(s, " %4lldus", abs_usecs);
+	if (rel_usecs > preempt_mark_thresh)
+		trace_seq_puts(s, "!: ");
+	else if (rel_usecs > 1)
+		trace_seq_puts(s, "+: ");
+	else
+		trace_seq_puts(s, " : ");
+}
+
+int trace_print_context(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	char *comm = trace_find_cmdline(entry->pid);
+	unsigned long long t = ns2usecs(iter->ts);
+	unsigned long usec_rem = do_div(t, USEC_PER_SEC);
+	unsigned long secs = (unsigned long)t;
+
+	if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid))
+		goto partial;
+	if (!trace_seq_printf(s, "[%03d] ", entry->cpu))
+		goto partial;
+	if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem))
+		goto partial;
+
+	return 0;
+
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+int trace_print_lat_context(struct trace_iterator *iter)
+{
+	u64 next_ts;
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent,
+			   *next_entry = trace_find_next_entry(iter, NULL,
+							       &next_ts);
+	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+	unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
+	unsigned long rel_usecs;
+
+	if (!next_entry)
+		next_ts = iter->ts;
+	rel_usecs = ns2usecs(next_ts - iter->ts);
+
+	if (verbose) {
+		char *comm = trace_find_cmdline(entry->pid);
+		trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+				 " %ld.%03ldms (+%ld.%03ldms): ",
+				 comm,
+				 entry->pid, entry->cpu, entry->flags,
+				 entry->preempt_count, iter->idx,
+				 ns2usecs(iter->ts),
+				 abs_usecs/1000,
+				 abs_usecs % 1000, rel_usecs/1000,
+				 rel_usecs % 1000);
+	} else {
+		lat_print_generic(s, entry, entry->cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	}
+
+	return 0;
+}
+
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
 static int task_state_char(unsigned long state)
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 1cbab5e..ec2ed90 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -33,6 +33,9 @@ int seq_print_userip_objs(const struct userstack_entry *entry,
 int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 		      unsigned long ip, unsigned long sym_flags);
 
+int trace_print_context(struct trace_iterator *iter);
+int trace_print_lat_context(struct trace_iterator *iter);
+
 struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
-- 
cgit v1.1


From 2c9b238eb325895d3312dad64e2685783575e474 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 2 Feb 2009 20:30:12 -0200
Subject: trace: Change struct trace_event callbacks parameter list

Impact: API change

The trace_seq and trace_entry are in trace_iterator, where there are
more fields that may be needed by tracers, so just pass the
tracer_iterator as is already the case for struct tracer->print_line.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        |  10 ++--
 kernel/trace/trace_branch.c |   7 +--
 kernel/trace/trace_output.c | 139 ++++++++++++++++++++------------------------
 kernel/trace/trace_output.h |   6 +-
 4 files changed, 72 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5ec49c3..152d096 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1388,7 +1388,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 	}
 
 	if (event && event->latency_trace) {
-		ret = event->latency_trace(s, entry, sym_flags);
+		ret = event->latency_trace(iter, sym_flags);
 		if (ret)
 			return ret;
 		return TRACE_TYPE_HANDLED;
@@ -1419,7 +1419,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	}
 
 	if (event && event->trace) {
-		ret = event->trace(s, entry, sym_flags);
+		ret = event->trace(iter, sym_flags);
 		if (ret)
 			return ret;
 		return TRACE_TYPE_HANDLED;
@@ -1449,7 +1449,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->raw) {
-		ret = event->raw(s, entry, 0);
+		ret = event->raw(iter, 0);
 		if (ret)
 			return ret;
 		return TRACE_TYPE_HANDLED;
@@ -1478,7 +1478,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->hex)
-		event->hex(s, entry, 0);
+		event->hex(iter, 0);
 
 	SEQ_PUT_FIELD_RET(s, newline);
 
@@ -1517,7 +1517,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->binary)
-		event->binary(s, entry, 0);
+		event->binary(iter, 0);
 
 	return TRACE_TYPE_HANDLED;
 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 1284145..ea62f10 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -160,14 +160,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_branch_print(struct trace_iterator *iter, int flags)
 {
 	struct trace_branch *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (trace_seq_printf(s, "[%s] %s:%s:%d\n",
+	if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
 			     field->correct ? "  ok  " : " MISS ",
 			     field->func,
 			     field->file,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a5752d4..c24503b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -484,19 +484,18 @@ int unregister_ftrace_event(struct trace_event *event)
  * Standard events
  */
 
-int
-trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+int trace_nop_print(struct trace_iterator *iter, int flags)
 {
 	return 0;
 }
 
 /* TRACE_FN */
-static int
-trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_latency(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
@@ -513,12 +512,12 @@ trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_trace(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
@@ -540,14 +539,13 @@ trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_raw(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_printf(s, "%lx %lx\n",
+	if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
 			      field->ip,
 			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -555,12 +553,12 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_hex(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_HEX_FIELD_RET(s, field->ip);
 	SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
@@ -568,12 +566,12 @@ trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_bin(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_FIELD_RET(s, field->ip);
 	SEQ_PUT_FIELD_RET(s, field->parent_ip);
@@ -591,20 +589,19 @@ static struct trace_event trace_fn_event = {
 };
 
 /* TRACE_CTX an TRACE_WAKE */
-static int
-trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
-		    char *delim)
+static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
 {
 	struct ctx_switch_entry *field;
 	char *comm;
 	int S, T;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	T = task_state_char(field->next_state);
 	S = task_state_char(field->prev_state);
 	comm = trace_find_cmdline(field->next_pid);
-	if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+	if (!trace_seq_printf(&iter->seq,
+			      " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
 			      field->prev_pid,
 			      field->prev_prio,
 			      S, delim,
@@ -617,31 +614,27 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
 	return 0;
 }
 
-static int
-trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctx_print(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_print(s, entry, flags, "==>");
+	return trace_ctxwake_print(iter, "==>");
 }
 
-static int
-trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_wake_print(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_print(s, entry, flags, "  +");
+	return trace_ctxwake_print(iter, "  +");
 }
 
-static int
-trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
-		  char S)
+static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 {
 	struct ctx_switch_entry *field;
 	int T;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!S)
 		task_state_char(field->prev_state);
 	T = task_state_char(field->next_state);
-	if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+	if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
 			      field->prev_pid,
 			      field->prev_prio,
 			      S,
@@ -654,27 +647,24 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
 	return 0;
 }
 
-static int
-trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctx_raw(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_raw(s, entry, flags, 0);
+	return trace_ctxwake_raw(iter, 0);
 }
 
-static int
-trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_wake_raw(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_raw(s, entry, flags, '+');
+	return trace_ctxwake_raw(iter, '+');
 }
 
 
-static int
-trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
-		  char S)
+static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 {
 	struct ctx_switch_entry *field;
+	struct trace_seq *s = &iter->seq;
 	int T;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!S)
 		task_state_char(field->prev_state);
@@ -691,24 +681,22 @@ trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
 	return 0;
 }
 
-static int
-trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctx_hex(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_hex(s, entry, flags, 0);
+	return trace_ctxwake_hex(iter, 0);
 }
 
-static int
-trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_wake_hex(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_hex(s, entry, flags, '+');
+	return trace_ctxwake_hex(iter, '+');
 }
 
-static int
-trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctxwake_bin(struct trace_iterator *iter, int flags)
 {
 	struct ctx_switch_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_FIELD_RET(s, field->prev_pid);
 	SEQ_PUT_FIELD_RET(s, field->prev_prio);
@@ -739,14 +727,13 @@ static struct trace_event trace_wake_event = {
 };
 
 /* TRACE_SPECIAL */
-static int
-trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_special_print(struct trace_iterator *iter, int flags)
 {
 	struct special_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_printf(s, "# %ld %ld %ld\n",
+	if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
 			      field->arg1,
 			      field->arg2,
 			      field->arg3))
@@ -755,12 +742,12 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_special_hex(struct trace_iterator *iter, int flags)
 {
 	struct special_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
@@ -769,12 +756,12 @@ trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_special_bin(struct trace_iterator *iter, int flags)
 {
 	struct special_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_FIELD_RET(s, field->arg1);
 	SEQ_PUT_FIELD_RET(s, field->arg2);
@@ -794,13 +781,13 @@ static struct trace_event trace_special_event = {
 
 /* TRACE_STACK */
 
-static int
-trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_stack_print(struct trace_iterator *iter, int flags)
 {
 	struct stack_entry *field;
+	struct trace_seq *s = &iter->seq;
 	int i;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (i) {
@@ -830,13 +817,12 @@ static struct trace_event trace_stack_event = {
 };
 
 /* TRACE_USER_STACK */
-static int
-trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
-		       int flags)
+static int trace_user_stack_print(struct trace_iterator *iter, int flags)
 {
 	struct userstack_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
@@ -860,12 +846,12 @@ static struct trace_event trace_user_stack_event = {
 };
 
 /* TRACE_PRINT */
-static int
-trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_print_print(struct trace_iterator *iter, int flags)
 {
 	struct print_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
@@ -879,14 +865,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_print_raw(struct trace_iterator *iter, int flags)
 {
 	struct print_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
 		goto partial;
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ec2ed90..3aeb31f 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -3,8 +3,7 @@
 
 #include "trace.h"
 
-typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry,
-				int flags);
+typedef int (*trace_print_func)(struct trace_iterator *iter, int flags);
 
 struct trace_event {
 	struct hlist_node	node;
@@ -40,8 +39,7 @@ struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
 
-int
-trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
+int trace_nop_print(struct trace_iterator *iter, int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.1


From d9793bd8018f835c64b10f44e278c86cecb8e932 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 3 Feb 2009 20:20:41 -0200
Subject: trace: judicious error checking of trace_seq results

Impact: bugfix and cleanup

Some callsites were returning either TRACE_ITER_PARTIAL_LINE if the
trace_seq routines (trace_seq_printf, etc) returned 0 meaning its buffer
was full, or zero otherwise.

But...

/* Return values for print_line callback */
enum print_line_t {
        TRACE_TYPE_PARTIAL_LINE = 0,    /* Retry after flushing the seq */
        TRACE_TYPE_HANDLED      = 1,
        TRACE_TYPE_UNHANDLED    = 2     /* Relay to other output functions */
};

In other cases the return value was not being relayed at all.

Most of the time it didn't hurt because the page wasn't get filled, but
for correctness sake, handle the return values everywhere.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        |  75 ++++++++++++---------------
 kernel/trace/trace_branch.c |   2 +-
 kernel/trace/trace_output.c | 123 ++++++++++++++++++--------------------------
 3 files changed, 86 insertions(+), 114 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bbdfaa2..5822ff4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1402,27 +1402,25 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_event *event;
 	struct trace_entry *entry = iter->ent;
-	int ret;
 
 	test_cpu_buff_start(iter);
 
 	event = ftrace_find_event(entry->type);
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		ret = trace_print_lat_context(iter);
-		if (ret)
-			return ret;
+		if (!trace_print_lat_context(iter))
+			goto partial;
 	}
 
-	if (event && event->latency_trace) {
-		ret = event->latency_trace(iter, sym_flags);
-		if (ret)
-			return ret;
-		return TRACE_TYPE_HANDLED;
-	}
+	if (event && event->latency_trace)
+		return event->latency_trace(iter, sym_flags);
+
+	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
+		goto partial;
 
-	trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
 }
 
 static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1431,7 +1429,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
 	struct trace_event *event;
-	int ret;
 
 	entry = iter->ent;
 
@@ -1440,22 +1437,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	event = ftrace_find_event(entry->type);
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		ret = trace_print_context(iter);
-		if (ret)
-			return ret;
+		if (!trace_print_context(iter))
+			goto partial;
 	}
 
-	if (event && event->trace) {
-		ret = event->trace(iter, sym_flags);
-		if (ret)
-			return ret;
-		return TRACE_TYPE_HANDLED;
-	}
-	ret = trace_seq_printf(s, "Unknown type %d\n", entry->type);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (event && event->trace)
+		return event->trace(iter, sym_flags);
+
+	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
+		goto partial;
 
 	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
 }
 
 static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -1463,29 +1457,25 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
 	struct trace_event *event;
-	int ret;
 
 	entry = iter->ent;
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		ret = trace_seq_printf(s, "%d %d %llu ",
-			entry->pid, iter->cpu, iter->ts);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+		if (!trace_seq_printf(s, "%d %d %llu ",
+				      entry->pid, iter->cpu, iter->ts))
+			goto partial;
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->raw) {
-		ret = event->raw(iter, 0);
-		if (ret)
-			return ret;
-		return TRACE_TYPE_HANDLED;
-	}
-	ret = trace_seq_printf(s, "%d ?\n", entry->type);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (event && event->raw)
+		return event->raw(iter, 0);
+
+	if (!trace_seq_printf(s, "%d ?\n", entry->type))
+		goto partial;
 
 	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
 }
 
 static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -1504,8 +1494,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->hex)
-		event->hex(iter, 0);
+	if (event && event->hex) {
+		int ret = event->hex(iter, 0);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+	}
 
 	SEQ_PUT_FIELD_RET(s, newline);
 
@@ -1544,7 +1537,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->binary)
-		event->binary(iter, 0);
+		return event->binary(iter, 0);
 
 	return TRACE_TYPE_HANDLED;
 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ea62f10..f6b35e1 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -173,7 +173,7 @@ static int trace_branch_print(struct trace_iterator *iter, int flags)
 			     field->line))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c24503b..5b3c914 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -286,55 +286,41 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
-static void
+static int
 lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
 	char *comm;
 
 	comm = trace_find_cmdline(entry->pid);
-
-	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
-	trace_seq_printf(s, "%3d", cpu);
-	trace_seq_printf(s, "%c%c",
-			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
-			 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
-			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
-
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-	if (hardirq && softirq) {
-		trace_seq_putc(s, 'H');
-	} else {
-		if (hardirq) {
-			trace_seq_putc(s, 'h');
-		} else {
-			if (softirq)
-				trace_seq_putc(s, 's');
-			else
-				trace_seq_putc(s, '.');
-		}
-	}
+
+	if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
+			      comm, entry->pid, cpu,
+			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
+				  'X' : '.',
+			      (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
+				'N' : '.',
+			      (hardirq && softirq) ? 'H' :
+				hardirq ? 'h' : softirq ? 's' : '.'))
+		return 0;
 
 	if (entry->preempt_count)
-		trace_seq_printf(s, "%x", entry->preempt_count);
-	else
-		trace_seq_puts(s, ".");
+		return trace_seq_printf(s, "%x", entry->preempt_count);
+	return trace_seq_puts(s, ".");
 }
 
 static unsigned long preempt_mark_thresh = 100;
 
-static void
+static int
 lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
 		    unsigned long rel_usecs)
 {
-	trace_seq_printf(s, " %4lldus", abs_usecs);
-	if (rel_usecs > preempt_mark_thresh)
-		trace_seq_puts(s, "!: ");
-	else if (rel_usecs > 1)
-		trace_seq_puts(s, "+: ");
-	else
-		trace_seq_puts(s, " : ");
+	return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
+				rel_usecs > preempt_mark_thresh ? '!' :
+				  rel_usecs > 1 ? '+' : ' ');
 }
 
 int trace_print_context(struct trace_iterator *iter)
@@ -346,22 +332,14 @@ int trace_print_context(struct trace_iterator *iter)
 	unsigned long usec_rem = do_div(t, USEC_PER_SEC);
 	unsigned long secs = (unsigned long)t;
 
-	if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid))
-		goto partial;
-	if (!trace_seq_printf(s, "[%03d] ", entry->cpu))
-		goto partial;
-	if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem))
-		goto partial;
-
-	return 0;
-
-partial:
-	return TRACE_TYPE_PARTIAL_LINE;
+	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
+				comm, entry->pid, entry->cpu, secs, usec_rem);
 }
 
 int trace_print_lat_context(struct trace_iterator *iter)
 {
 	u64 next_ts;
+	int ret;
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent,
 			   *next_entry = trace_find_next_entry(iter, NULL,
@@ -376,21 +354,22 @@ int trace_print_lat_context(struct trace_iterator *iter)
 
 	if (verbose) {
 		char *comm = trace_find_cmdline(entry->pid);
-		trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
-				 " %ld.%03ldms (+%ld.%03ldms): ",
-				 comm,
-				 entry->pid, entry->cpu, entry->flags,
-				 entry->preempt_count, iter->idx,
-				 ns2usecs(iter->ts),
-				 abs_usecs/1000,
-				 abs_usecs % 1000, rel_usecs/1000,
-				 rel_usecs % 1000);
+		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
+				       entry->pid, entry->cpu, entry->flags,
+				       entry->preempt_count, iter->idx,
+				       ns2usecs(iter->ts),
+				       abs_usecs / USEC_PER_MSEC,
+				       abs_usecs % USEC_PER_MSEC,
+				       rel_usecs / USEC_PER_MSEC,
+				       rel_usecs % USEC_PER_MSEC);
 	} else {
-		lat_print_generic(s, entry, entry->cpu);
-		lat_print_timestamp(s, abs_usecs, rel_usecs);
+		ret = lat_print_generic(s, entry, entry->cpu);
+		if (ret)
+			ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
 
-	return 0;
+	return ret;
 }
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -486,7 +465,7 @@ int unregister_ftrace_event(struct trace_event *event)
 
 int trace_nop_print(struct trace_iterator *iter, int flags)
 {
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 /* TRACE_FN */
@@ -506,7 +485,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags)
 	if (!trace_seq_puts(s, ")\n"))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -533,7 +512,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags)
 	if (!trace_seq_printf(s, "\n"))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -550,7 +529,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags)
 			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_fn_hex(struct trace_iterator *iter, int flags)
@@ -563,7 +542,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags)
 	SEQ_PUT_HEX_FIELD_RET(s, field->ip);
 	SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_fn_bin(struct trace_iterator *iter, int flags)
@@ -576,7 +555,7 @@ static int trace_fn_bin(struct trace_iterator *iter, int flags)
 	SEQ_PUT_FIELD_RET(s, field->ip);
 	SEQ_PUT_FIELD_RET(s, field->parent_ip);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct trace_event trace_fn_event = {
@@ -611,7 +590,7 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
 			      T, comm))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_ctx_print(struct trace_iterator *iter, int flags)
@@ -644,7 +623,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 			      T))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_ctx_raw(struct trace_iterator *iter, int flags)
@@ -678,7 +657,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
 	SEQ_PUT_HEX_FIELD_RET(s, T);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_ctx_hex(struct trace_iterator *iter, int flags)
@@ -705,7 +684,7 @@ static int trace_ctxwake_bin(struct trace_iterator *iter, int flags)
 	SEQ_PUT_FIELD_RET(s, field->next_prio);
 	SEQ_PUT_FIELD_RET(s, field->next_state);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct trace_event trace_ctx_event = {
@@ -739,7 +718,7 @@ static int trace_special_print(struct trace_iterator *iter, int flags)
 			      field->arg3))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_special_hex(struct trace_iterator *iter, int flags)
@@ -753,7 +732,7 @@ static int trace_special_hex(struct trace_iterator *iter, int flags)
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_special_bin(struct trace_iterator *iter, int flags)
@@ -767,7 +746,7 @@ static int trace_special_bin(struct trace_iterator *iter, int flags)
 	SEQ_PUT_FIELD_RET(s, field->arg2);
 	SEQ_PUT_FIELD_RET(s, field->arg3);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct trace_event trace_special_event = {
@@ -801,7 +780,7 @@ static int trace_stack_print(struct trace_iterator *iter, int flags)
 			goto partial;
 	}
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -830,7 +809,7 @@ static int trace_user_stack_print(struct trace_iterator *iter, int flags)
 	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -859,7 +838,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags)
 	if (!trace_seq_printf(s, ": %s", field->buf))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -874,7 +853,7 @@ static int trace_print_raw(struct trace_iterator *iter, int flags)
 	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.1


From ae7462b4f1fe1f36b5d562dbd5202a2eba01f072 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 3 Feb 2009 22:05:50 -0200
Subject: trace: make the trace_event callbacks return enum print_line_t

As they actually all return these enumerators.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace_branch.c |  3 ++-
 kernel/trace/trace_output.c | 52 +++++++++++++++++++++++++++------------------
 kernel/trace/trace_output.h |  5 +++--
 4 files changed, 37 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5822ff4..fd51cf0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1495,7 +1495,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->hex) {
-		int ret = event->hex(iter, 0);
+		enum print_line_t ret = event->hex(iter, 0);
 		if (ret != TRACE_TYPE_HANDLED)
 			return ret;
 	}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index f6b35e1..7ac72a4 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -160,7 +160,8 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_branch_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_branch_print(struct trace_iterator *iter,
+					    int flags)
 {
 	struct trace_branch *field;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5b3c914..b7380ee 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -463,13 +463,14 @@ int unregister_ftrace_event(struct trace_event *event)
  * Standard events
  */
 
-int trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
 {
 	return TRACE_TYPE_HANDLED;
 }
 
 /* TRACE_FN */
-static int trace_fn_latency(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_latency(struct trace_iterator *iter,
+					  int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -491,7 +492,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -518,7 +519,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 
@@ -532,7 +533,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -545,7 +546,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -568,7 +569,8 @@ static struct trace_event trace_fn_event = {
 };
 
 /* TRACE_CTX an TRACE_WAKE */
-static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
+static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
+					     char *delim)
 {
 	struct ctx_switch_entry *field;
 	char *comm;
@@ -593,12 +595,13 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_print(iter, "==>");
 }
 
-static int trace_wake_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_print(struct trace_iterator *iter,
+					  int flags)
 {
 	return trace_ctxwake_print(iter, "  +");
 }
@@ -626,12 +629,12 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_raw(iter, 0);
 }
 
-static int trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_raw(iter, '+');
 }
@@ -660,17 +663,18 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_hex(iter, 0);
 }
 
-static int trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_hex(iter, '+');
 }
 
-static int trace_ctxwake_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
+					   int flags)
 {
 	struct ctx_switch_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -706,7 +710,8 @@ static struct trace_event trace_wake_event = {
 };
 
 /* TRACE_SPECIAL */
-static int trace_special_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_special_print(struct trace_iterator *iter,
+					     int flags)
 {
 	struct special_entry *field;
 
@@ -721,7 +726,8 @@ static int trace_special_print(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_special_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_special_hex(struct trace_iterator *iter,
+					   int flags)
 {
 	struct special_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -735,7 +741,8 @@ static int trace_special_hex(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_special_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_special_bin(struct trace_iterator *iter,
+					   int flags)
 {
 	struct special_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -760,7 +767,8 @@ static struct trace_event trace_special_event = {
 
 /* TRACE_STACK */
 
-static int trace_stack_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_stack_print(struct trace_iterator *iter,
+					   int flags)
 {
 	struct stack_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -796,7 +804,8 @@ static struct trace_event trace_stack_event = {
 };
 
 /* TRACE_USER_STACK */
-static int trace_user_stack_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
+						int flags)
 {
 	struct userstack_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -825,7 +834,8 @@ static struct trace_event trace_user_stack_event = {
 };
 
 /* TRACE_PRINT */
-static int trace_print_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_print(struct trace_iterator *iter,
+					   int flags)
 {
 	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -844,7 +854,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 {
 	struct print_entry *field;
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 3aeb31f..551a25a 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -3,7 +3,8 @@
 
 #include "trace.h"
 
-typedef int (*trace_print_func)(struct trace_iterator *iter, int flags);
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+					      int flags);
 
 struct trace_event {
 	struct hlist_node	node;
@@ -39,7 +40,7 @@ struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
 
-int trace_nop_print(struct trace_iterator *iter, int flags);
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.1


From 268ccda0cb4d1292029d07ee3dbd07117baf6ecb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 4 Feb 2009 20:16:39 -0200
Subject: trace: assign defaults at register_ftrace_event

Impact: simplification of tracers

As all tracers are doing this we might as well do it in
register_ftrace_event and save one branch each time we call these
callbacks.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 13 +++++--------
 kernel/trace/trace_branch.c |  3 ---
 kernel/trace/trace_output.c | 13 +++++++++++--
 3 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd51cf0..a5e4c0a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1412,7 +1412,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 			goto partial;
 	}
 
-	if (event && event->latency_trace)
+	if (event)
 		return event->latency_trace(iter, sym_flags);
 
 	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
@@ -1441,7 +1441,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 			goto partial;
 	}
 
-	if (event && event->trace)
+	if (event)
 		return event->trace(iter, sym_flags);
 
 	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
@@ -1467,7 +1467,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->raw)
+	if (event)
 		return event->raw(iter, 0);
 
 	if (!trace_seq_printf(s, "%d ?\n", entry->type))
@@ -1494,7 +1494,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->hex) {
+	if (event) {
 		enum print_line_t ret = event->hex(iter, 0);
 		if (ret != TRACE_TYPE_HANDLED)
 			return ret;
@@ -1536,10 +1536,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->binary)
-		return event->binary(iter, 0);
-
-	return TRACE_TYPE_HANDLED;
+	return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
 }
 
 static int trace_empty(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7ac72a4..297deb2 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -182,9 +182,6 @@ static struct trace_event trace_branch_event = {
 	.type	 	= TRACE_BRANCH,
 	.trace		= trace_branch_print,
 	.latency_trace	= trace_branch_print,
-	.raw		= trace_nop_print,
-	.hex		= trace_nop_print,
-	.binary		= trace_nop_print,
 };
 
 static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b7380ee..b6e99af 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -435,6 +435,17 @@ int register_ftrace_event(struct trace_event *event)
 	if (ftrace_find_event(event->type))
 		goto out;
 
+	if (event->trace == NULL)
+		event->trace = trace_nop_print;
+	if (event->latency_trace == NULL)
+		event->latency_trace = trace_nop_print;
+	if (event->raw == NULL)
+		event->raw = trace_nop_print;
+	if (event->hex == NULL)
+		event->hex = trace_nop_print;
+	if (event->binary == NULL)
+		event->binary = trace_nop_print;
+
 	key = event->type & (EVENT_HASHSIZE - 1);
 
 	hlist_add_head_rcu(&event->node, &event_hash[key]);
@@ -874,8 +885,6 @@ static struct trace_event trace_print_event = {
 	.trace		= trace_print_print,
 	.latency_trace	= trace_print_print,
 	.raw		= trace_print_raw,
-	.hex		= trace_nop_print,
-	.binary		= trace_nop_print,
 };
 
 static struct trace_event *events[] __initdata = {
-- 
cgit v1.1


From 97e5b191ae7dc0f4f5b82b9db29782928b103b4d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 01:13:36 -0500
Subject: trace_branch: Remove unused function

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 297deb2..027e836 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,23 +143,6 @@ static void branch_trace_reset(struct trace_array *tr)
 	stop_branch_trace(tr);
 }
 
-static int
-trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
-{
-	struct print_entry *field;
-
-	trace_assign_type(field, entry);
-
-	if (seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-
-	if (trace_seq_printf(s, ": %s", field->buf))
-		goto partial;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
 static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 					    int flags)
 {
-- 
cgit v1.1


From 7be421510b91491d5aa5a29fa1005712039b95af Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 01:13:37 -0500
Subject: trace: Remove unused trace_array_cpu parameter

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 47 +++++++++++++++------------------------
 kernel/trace/trace.h              |  4 ----
 kernel/trace/trace_functions.c    |  8 +++----
 kernel/trace/trace_irqsoff.c      | 10 ++++-----
 kernel/trace/trace_sched_switch.c |  4 ++--
 kernel/trace/trace_sched_wakeup.c | 12 +++++-----
 6 files changed, 34 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a5e4c0a..1d4ff56 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -776,7 +776,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 
 void
-trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
@@ -802,7 +802,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static void __trace_graph_entry(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct ftrace_graph_ent *trace,
 				unsigned long flags,
 				int pc)
@@ -826,7 +825,6 @@ static void __trace_graph_entry(struct trace_array *tr,
 }
 
 static void __trace_graph_return(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
@@ -856,11 +854,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        int pc)
 {
 	if (likely(!atomic_read(&data->disabled)))
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 }
 
 static void __ftrace_trace_stack(struct trace_array *tr,
-				 struct trace_array_cpu *data,
 				 unsigned long flags,
 				 int skip, int pc)
 {
@@ -891,27 +888,24 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 }
 
 static void ftrace_trace_stack(struct trace_array *tr,
-			       struct trace_array_cpu *data,
 			       unsigned long flags,
 			       int skip, int pc)
 {
 	if (!(trace_flags & TRACE_ITER_STACKTRACE))
 		return;
 
-	__ftrace_trace_stack(tr, data, flags, skip, pc);
+	__ftrace_trace_stack(tr, flags, skip, pc);
 }
 
 void __trace_stack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
 		   unsigned long flags,
 		   int skip, int pc)
 {
-	__ftrace_trace_stack(tr, data, flags, skip, pc);
+	__ftrace_trace_stack(tr, flags, skip, pc);
 }
 
 static void ftrace_trace_userstack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
-		   unsigned long flags, int pc)
+				   unsigned long flags, int pc)
 {
 #ifdef CONFIG_STACKTRACE
 	struct ring_buffer_event *event;
@@ -942,20 +936,17 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 #endif
 }
 
-void __trace_userstack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
-		   unsigned long flags)
+void __trace_userstack(struct trace_array *tr, unsigned long flags)
 {
-	ftrace_trace_userstack(tr, data, flags, preempt_count());
+	ftrace_trace_userstack(tr, flags, preempt_count());
 }
 
 static void
-ftrace_trace_special(void *__tr, void *__data,
+ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
 	struct ring_buffer_event *event;
-	struct trace_array_cpu *data = __data;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
 	unsigned long irq_flags;
@@ -971,8 +962,8 @@ ftrace_trace_special(void *__tr, void *__data,
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, data, irq_flags, 4, pc);
-	ftrace_trace_userstack(tr, data, irq_flags, pc);
+	ftrace_trace_stack(tr, irq_flags, 4, pc);
+	ftrace_trace_userstack(tr, irq_flags, pc);
 
 	trace_wake_up();
 }
@@ -981,12 +972,11 @@ void
 __trace_special(void *__tr, void *__data,
 		unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
-	ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
+	ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
 }
 
 void
 tracing_sched_switch_trace(struct trace_array *tr,
-			   struct trace_array_cpu *data,
 			   struct task_struct *prev,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
@@ -1010,13 +1000,12 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, data, flags, 5, pc);
-	ftrace_trace_userstack(tr, data, flags, pc);
+	ftrace_trace_stack(tr, flags, 5, pc);
+	ftrace_trace_userstack(tr, flags, pc);
 }
 
 void
 tracing_sched_wakeup_trace(struct trace_array *tr,
-			   struct trace_array_cpu *data,
 			   struct task_struct *wakee,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
@@ -1040,8 +1029,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, data, flags, 6, pc);
-	ftrace_trace_userstack(tr, data, flags, pc);
+	ftrace_trace_stack(tr, flags, 6, pc);
+	ftrace_trace_userstack(tr, flags, pc);
 
 	trace_wake_up();
 }
@@ -1064,7 +1053,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 	data = tr->data[cpu];
 
 	if (likely(atomic_inc_return(&data->disabled) == 1))
-		ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
+		ftrace_trace_special(tr, arg1, arg2, arg3, pc);
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
@@ -1092,7 +1081,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_graph_entry(tr, data, trace, flags, pc);
+		__trace_graph_entry(tr, trace, flags, pc);
 	}
 	/* Only do the atomic if it is not already set */
 	if (!test_tsk_trace_graph(current))
@@ -1118,7 +1107,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_graph_return(tr, data, trace, flags, pc);
+		__trace_graph_return(tr, trace, flags, pc);
 	}
 	if (!trace->depth)
 		clear_tsk_trace_graph(current);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f0c7a0f..df627a9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -419,14 +419,12 @@ void ftrace(struct trace_array *tr,
 			    unsigned long parent_ip,
 			    unsigned long flags, int pc);
 void tracing_sched_switch_trace(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct task_struct *prev,
 				struct task_struct *next,
 				unsigned long flags, int pc);
 void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct task_struct *wakee,
 				struct task_struct *cur,
 				unsigned long flags, int pc);
@@ -436,7 +434,6 @@ void trace_special(struct trace_array *tr,
 		   unsigned long arg2,
 		   unsigned long arg3, int pc);
 void trace_function(struct trace_array *tr,
-		    struct trace_array_cpu *data,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
@@ -462,7 +459,6 @@ void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 
 void __trace_stack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
 		   unsigned long flags,
 		   int skip, int pc);
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3a320f..d067cea 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -78,7 +78,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1))
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 
 	atomic_dec(&data->disabled);
 	ftrace_preempt_enable(resched);
@@ -108,7 +108,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 	}
 
 	atomic_dec(&data->disabled);
@@ -139,7 +139,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 		/*
 		 * skip over 5 funcs:
 		 *    __ftrace_trace_stack,
@@ -148,7 +148,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 		 *    ftrace_list_func
 		 *    ftrace_call
 		 */
-		__trace_stack(tr, data, flags, 5, pc);
+		__trace_stack(tr, flags, 5, pc);
 	}
 
 	atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ed344b0..c6b442d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1))
-		trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+		trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	atomic_dec(&data->disabled);
 }
@@ -153,7 +153,7 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out_unlock;
 
-	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 
 	latency = nsecs_to_usecs(delta);
 
@@ -177,7 +177,7 @@ out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_reset(tr, cpu);
-	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 }
 
 static inline void
@@ -210,7 +210,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 
 	local_save_flags(flags);
 
-	trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	per_cpu(tracing_cpu, cpu) = 1;
 
@@ -244,7 +244,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 
 	local_save_flags(flags);
-	trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index df175cb..c4f9add 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -43,7 +43,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	data = ctx_trace->data[cpu];
 
 	if (likely(!atomic_read(&data->disabled)))
-		tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
+		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
 
 	local_irq_restore(flags);
 }
@@ -66,7 +66,7 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	data = ctx_trace->data[cpu];
 
 	if (likely(!atomic_read(&data->disabled)))
-		tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
+		tracing_sched_wakeup_trace(ctx_trace, wakee, current,
 					   flags, pc);
 
 	local_irq_restore(flags);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index a48c9b4..96d7164 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -72,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	if (task_cpu(wakeup_task) != cpu)
 		goto unlock;
 
-	trace_function(tr, data, ip, parent_ip, flags, pc);
+	trace_function(tr, ip, parent_ip, flags, pc);
 
  unlock:
 	__raw_spin_unlock(&wakeup_lock);
@@ -152,8 +152,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
-	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
-	tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	/*
 	 * usecs conversion is slow so we try to delay the conversion
@@ -254,10 +254,8 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 
 	data = wakeup_trace->data[wakeup_cpu];
 	data->preempt_timestamp = ftrace_now(cpu);
-	tracing_sched_wakeup_trace(wakeup_trace, data, p, current,
-				   flags, pc);
-	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2,
-		       flags, pc);
+	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
 	__raw_spin_unlock(&wakeup_lock);
-- 
cgit v1.1


From dac74940289f350c2590bec92737833bad608541 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Feb 2009 01:13:38 -0500
Subject: trace: code style clean up

Ingo Molnar suggested using goto logic to keep the indentation
down and to be able to remove the nasty line breaks. This actually
makes the code a bit more readable.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1d4ff56..3536ef4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -522,23 +522,24 @@ int register_tracer(struct tracer *type)
 	tracing_selftest_running = false;
 	mutex_unlock(&trace_types_lock);
 
-	if (!ret && default_bootup_tracer) {
-		if (!strncmp(default_bootup_tracer, type->name,
-			     BOOTUP_TRACER_SIZE)) {
-			printk(KERN_INFO "Starting tracer '%s'\n",
-			       type->name);
-			/* Do we want this tracer to start on bootup? */
-			tracing_set_tracer(type->name);
-			default_bootup_tracer = NULL;
-			/* disable other selftests, since this will break it. */
-			tracing_selftest_disabled = 1;
+	if (ret || !default_bootup_tracer)
+		goto out_unlock;
+
+	if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
+		goto out_unlock;
+
+	printk(KERN_INFO "Starting tracer '%s'\n", type->name);
+	/* Do we want this tracer to start on bootup? */
+	tracing_set_tracer(type->name);
+	default_bootup_tracer = NULL;
+	/* disable other selftests, since this will break it. */
+	tracing_selftest_disabled = 1;
 #ifdef CONFIG_FTRACE_STARTUP_TEST
-			printk(KERN_INFO "Disabling FTRACE selftests due"
-			       " to running tracer '%s'\n", type->name);
+	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
+	       type->name);
 #endif
-		}
-	}
 
+ out_unlock:
 	lock_kernel();
 	return ret;
 }
-- 
cgit v1.1


From ce9dbe244bf2063c41792e40dae7745957b118e0 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Wed, 4 Feb 2009 20:35:48 -0800
Subject: softlockup: check all tasks in hung_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: extend the scope of hung-task checks

Changed the default value of hung_task_check_count to PID_MAX_LIMIT.
hung_task_batch_count added to put an upper bound on the critical
section. Every hung_task_batch_count checks, the rcu lock is never
held for a too long time.

Keeping the critical section small minimizes time preemption is disabled
and keeps rcu grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Suggested-by: Frédéric Weisbecker <fweisbec@gmail.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..481ca8b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,9 +17,18 @@
 #include <linux/sysctl.h>
 
 /*
- * Have a reasonable limit on the number of tasks checked:
+ * The number of tasks checked:
  */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs to be upper-bound.
+ */
+#define HUNG_TASK_BATCHING 1024
 
 /*
  * Zero means infinite timeout - no checking done:
@@ -110,6 +119,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 }
 
 /*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
+/*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
  * a warning.
@@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
+	int batch_count = HUNG_TASK_BATCHING;
 	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
 
@@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
+		if (!--batch_count) {
+			batch_count = HUNG_TASK_BATCHING;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
-- 
cgit v1.1


From 94be52dc075a32af4aa73d7e10f68734d62d6af2 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Thu, 5 Feb 2009 09:56:08 -0800
Subject: softlockup: convert read_lock in hung_task to rcu_read_lock

Since the tasklist is protected by rcu list operations, it is safe
to convert the read_lock()s to rcu_read_lock().

Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 481ca8b..3951a80 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	if (test_taint(TAINT_DIE) || did_panic)
 		return;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
@@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void update_poll_jiffies(void)
-- 
cgit v1.1


From 0a9877514c4fed10a70720293b37213dd172ee3e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 16:12:56 -0200
Subject: ring_buffer: remove unused flags parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: API change, cleanup

>From ring_buffer_{lock_reserve,unlock_commit}.

$ codiff /tmp/vmlinux.before /tmp/vmlinux.after
linux-2.6-tip/kernel/trace/trace.c:
  trace_vprintk              |  -14
  trace_graph_return         |  -14
  trace_graph_entry          |  -10
  trace_function             |   -8
  __ftrace_trace_stack       |   -8
  ftrace_trace_userstack     |   -8
  tracing_sched_switch_trace |   -8
  ftrace_trace_special       |  -12
  tracing_sched_wakeup_trace |   -8
 9 functions changed, 90 bytes removed, diff: -90

linux-2.6-tip/block/blktrace.c:
  __blk_add_trace |   -1
 1 function changed, 1 bytes removed, diff: -1

/tmp/vmlinux.after:
 10 functions changed, 91 bytes removed, diff: -91

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c         | 12 +++------
 kernel/trace/ring_buffer.c       |  9 ++-----
 kernel/trace/trace.c             | 56 ++++++++++++++--------------------------
 kernel/trace/trace_boot.c        | 12 +++------
 kernel/trace/trace_branch.c      |  7 +++--
 kernel/trace/trace_hw_branches.c |  6 ++---
 kernel/trace/trace_mmiotrace.c   | 12 +++------
 kernel/trace/trace_power.c       | 12 +++------
 8 files changed, 44 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index f04c062..256749d 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -272,13 +272,11 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	struct ring_buffer_event *event;
 	struct kmemtrace_alloc_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
-	unsigned long irq_flags;
 
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -292,7 +290,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	entry->gfp_flags = gfp_flags;
 	entry->node	=	node;
 
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -305,13 +303,11 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 	struct ring_buffer_event *event;
 	struct kmemtrace_free_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
-	unsigned long irq_flags;
 
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -322,7 +318,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d737..aee76b3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1257,7 +1257,6 @@ static DEFINE_PER_CPU(int, rb_need_resched);
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
  * @length: the length of the data to reserve (excluding event header)
- * @flags: a pointer to save the interrupt flags
  *
  * Returns a reseverd event on the ring buffer to copy directly to.
  * The user of this interface will need to get the body to write into
@@ -1270,9 +1269,7 @@ static DEFINE_PER_CPU(int, rb_need_resched);
  * If NULL is returned, then nothing has been allocated or locked.
  */
 struct ring_buffer_event *
-ring_buffer_lock_reserve(struct ring_buffer *buffer,
-			 unsigned long length,
-			 unsigned long *flags)
+ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
@@ -1339,15 +1336,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
  * ring_buffer_unlock_commit - commit a reserved
  * @buffer: The buffer to commit to
  * @event: The event pointer to commit.
- * @flags: the interrupt flags received from ring_buffer_lock_reserve.
  *
  * This commits the data to the ring buffer, and releases any locks held.
  *
  * Must be paired with ring_buffer_lock_reserve.
  */
 int ring_buffer_unlock_commit(struct ring_buffer *buffer,
-			      struct ring_buffer_event *event,
-			      unsigned long flags)
+			      struct ring_buffer_event *event)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu = raw_smp_processor_id();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3536ef4..eb453a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -783,14 +783,12 @@ trace_function(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
-	unsigned long irq_flags;
 
 	/* If we are reading the ring buffer, don't trace */
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -798,7 +796,7 @@ trace_function(struct trace_array *tr,
 	entry->ent.type			= TRACE_FN;
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -809,20 +807,18 @@ static void __trace_graph_entry(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent_entry *entry;
-	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
 	entry->ent.type			= TRACE_GRAPH_ENT;
 	entry->graph_ent			= *trace;
-	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 
 static void __trace_graph_return(struct trace_array *tr,
@@ -832,20 +828,18 @@ static void __trace_graph_return(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ret_entry *entry;
-	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
 	entry->ent.type			= TRACE_GRAPH_RET;
 	entry->ret				= *trace;
-	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
 
@@ -866,10 +860,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -884,7 +876,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -912,13 +904,11 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
-	unsigned long irq_flags;
 
 	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -933,7 +923,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -950,10 +940,8 @@ ftrace_trace_special(void *__tr,
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -962,9 +950,9 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, irq_flags, 4, pc);
-	ftrace_trace_userstack(tr, irq_flags, pc);
+	ring_buffer_unlock_commit(tr->buffer, event);
+	ftrace_trace_stack(tr, 0, 4, pc);
+	ftrace_trace_userstack(tr, 0, pc);
 
 	trace_wake_up();
 }
@@ -984,10 +972,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -1000,7 +986,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 5, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 }
@@ -1013,10 +999,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -1029,7 +1013,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_prio		= wakee->prio;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 
@@ -2841,7 +2825,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	trace_buf[len] = 0;
 
 	size = sizeof(*entry) + len + 1;
-	event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, size);
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
@@ -2852,7 +2836,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
 	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 1f07895..4e08deb 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -132,7 +132,6 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 {
 	struct ring_buffer_event *event;
 	struct trace_boot_call *entry;
-	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
 	if (!tr || !pre_initcalls_finished)
@@ -144,15 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_BOOT_CALL;
 	entry->boot_call = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
@@ -164,7 +162,6 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 {
 	struct ring_buffer_event *event;
 	struct trace_boot_ret *entry;
-	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
 	if (!tr || !pre_initcalls_finished)
@@ -173,15 +170,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_BOOT_RET;
 	entry->boot_ret = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 027e836..770e52a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -33,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
-	unsigned long flags, irq_flags;
+	unsigned long flags;
 	int cpu, pc;
 	const char *p;
 
@@ -52,8 +52,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 
@@ -75,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index fff3545..e720c00 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -175,7 +175,7 @@ void trace_hw_branch(u64 from, u64 to)
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
-	unsigned long irq1, irq2;
+	unsigned long irq1;
 	int cpu;
 
 	if (unlikely(!tr))
@@ -189,7 +189,7 @@ void trace_hw_branch(u64 from, u64 to)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
@@ -198,7 +198,7 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event, irq2);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index ec78e24..104ddeb 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,10 +307,8 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
-	unsigned long irq_flags;
 
-	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -319,7 +317,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_RW;
 	entry->rw			= *rw;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -337,10 +335,8 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
-	unsigned long irq_flags;
 
-	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -349,7 +345,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_MAP;
 	entry->map			= *map;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index faa6ab7..3b1a292 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -115,7 +115,6 @@ void trace_power_end(struct power_trace *it)
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
-	unsigned long irq_flags;
 	struct trace_array *tr = power_trace;
 
 	if (!trace_power_enabled)
@@ -125,15 +124,14 @@ void trace_power_end(struct power_trace *it)
 	it->end = ktime_get();
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
@@ -148,7 +146,6 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
-	unsigned long irq_flags;
 	struct trace_array *tr = power_trace;
 
 	if (!trace_power_enabled)
@@ -162,15 +159,14 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
 	it->end = it->stamp;
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
-- 
cgit v1.1


From 51a763dd84253bab1d0a1e68e11a7753d1b702ca Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 16:14:13 -0200
Subject: tracing: Introduce trace_buffer_{lock_reserve,unlock_commit}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: new API

These new functions do what previously was being open coded, reducing
the number of details ftrace plugin writers have to worry about.

It also standardizes the handling of stacktrace, userstacktrace and
other trace options we may introduce in the future.

With this patch, for instance, the blk tracer (and some others already
in the tree) can use the "userstacktrace" /d/tracing/trace_options
facility.

$ codiff /tmp/vmlinux.before /tmp/vmlinux.after
linux-2.6-tip/kernel/trace/trace.c:
  trace_vprintk              |   -5
  trace_graph_return         |  -22
  trace_graph_entry          |  -26
  trace_function             |  -45
  __ftrace_trace_stack       |  -27
  ftrace_trace_userstack     |  -29
  tracing_sched_switch_trace |  -66
  tracing_stop               |   +1
  trace_seq_to_user          |   -1
  ftrace_trace_special       |  -63
  ftrace_special             |   +1
  tracing_sched_wakeup_trace |  -70
  tracing_reset_online_cpus  |   -1
 13 functions changed, 2 bytes added, 355 bytes removed, diff: -353

linux-2.6-tip/block/blktrace.c:
  __blk_add_trace |  -58
 1 function changed, 58 bytes removed, diff: -58

linux-2.6-tip/kernel/trace/trace.c:
  trace_buffer_lock_reserve  |  +88
  trace_buffer_unlock_commit |  +86
 2 functions changed, 174 bytes added, diff: +174

/tmp/vmlinux.after:
 16 functions changed, 176 bytes added, 413 bytes removed, diff: -237

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c         | 19 +++-----
 kernel/trace/trace.c             | 94 ++++++++++++++++++++++------------------
 kernel/trace/trace.h             | 11 +++++
 kernel/trace/trace_boot.c        | 20 +++------
 kernel/trace/trace_branch.c      |  7 ++-
 kernel/trace/trace_hw_branches.c |  7 ++-
 kernel/trace/trace_mmiotrace.c   | 20 ++++-----
 kernel/trace/trace_power.c       | 20 +++------
 8 files changed, 96 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 256749d..ae201b3 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -276,13 +276,12 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
 
-	entry->ent.type = TRACE_KMEM_ALLOC;
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 	entry->bytes_req = bytes_req;
@@ -290,9 +289,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	entry->gfp_flags = gfp_flags;
 	entry->node	=	node;
 
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, 0);
 }
 EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
 
@@ -307,20 +304,16 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type = TRACE_KMEM_FREE;
 	entry->type_id	= type_id;
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, 0);
 }
 EXPORT_SYMBOL(kmemtrace_mark_free);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eb453a2..8fad377 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -776,6 +776,39 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
 
+struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
+						    unsigned char type,
+						    unsigned long len,
+						    unsigned long flags, int pc)
+{
+	struct ring_buffer_event *event;
+
+	event = ring_buffer_lock_reserve(tr->buffer, len);
+	if (event != NULL) {
+		struct trace_entry *ent = ring_buffer_event_data(event);
+
+		tracing_generic_entry_update(ent, flags, pc);
+		ent->type = type;
+	}
+
+	return event;
+}
+static void ftrace_trace_stack(struct trace_array *tr,
+			       unsigned long flags, int skip, int pc);
+static void ftrace_trace_userstack(struct trace_array *tr,
+				   unsigned long flags, int pc);
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc)
+{
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+	ftrace_trace_stack(tr, flags, 6, pc);
+	ftrace_trace_userstack(tr, flags, pc);
+	trace_wake_up();
+}
+
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -788,12 +821,11 @@ trace_function(struct trace_array *tr,
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
+					  flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_FN;
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 	ring_buffer_unlock_commit(tr->buffer, event);
@@ -811,12 +843,11 @@ static void __trace_graph_entry(struct trace_array *tr,
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_GRAPH_ENT;
 	entry->graph_ent			= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
@@ -832,12 +863,11 @@ static void __trace_graph_return(struct trace_array *tr,
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_GRAPH_RET;
 	entry->ret				= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
@@ -861,13 +891,11 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	struct stack_entry *entry;
 	struct stack_trace trace;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_STACK,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_STACK;
-
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
 	trace.nr_entries	= 0;
@@ -908,12 +936,11 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_USER_STACK;
 
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
@@ -941,20 +968,15 @@ ftrace_trace_special(void *__tr,
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL,
+					  sizeof(*entry), 0, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, pc);
-	entry->ent.type			= TRACE_SPECIAL;
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	ring_buffer_unlock_commit(tr->buffer, event);
-	ftrace_trace_stack(tr, 0, 4, pc);
-	ftrace_trace_userstack(tr, 0, pc);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
 void
@@ -973,12 +995,11 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_CTX,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_CTX;
 	entry->prev_pid			= prev->pid;
 	entry->prev_prio		= prev->prio;
 	entry->prev_state		= prev->state;
@@ -986,9 +1007,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
-	ring_buffer_unlock_commit(tr->buffer, event);
-	ftrace_trace_stack(tr, flags, 5, pc);
-	ftrace_trace_userstack(tr, flags, pc);
+	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -1000,12 +1019,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_WAKE;
 	entry->prev_pid			= curr->pid;
 	entry->prev_prio		= curr->prio;
 	entry->prev_state		= curr->state;
@@ -1013,11 +1031,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_prio		= wakee->prio;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
-	ring_buffer_unlock_commit(tr->buffer, event);
-	ftrace_trace_stack(tr, flags, 6, pc);
-	ftrace_trace_userstack(tr, flags, pc);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -2825,12 +2839,10 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	trace_buf[len] = 0;
 
 	size = sizeof(*entry) + len + 1;
-	event = ring_buffer_lock_reserve(tr->buffer, size);
+	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, irq_flags, pc);
-	entry->ent.type			= TRACE_PRINT;
 	entry->ip			= ip;
 	entry->depth			= depth;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index df627a9..e03f157 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -403,6 +403,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
+struct ring_buffer_event;
+
+struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
+						    unsigned char type,
+						    unsigned long len,
+						    unsigned long flags,
+						    int pc);
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc);
+
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 4e08deb..7a30fc4 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -143,17 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_BOOT_CALL;
 	entry->boot_call = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -170,17 +166,13 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_BOOT_RET;
 	entry->boot_ret = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 770e52a..48b2196 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -52,14 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	pc = preempt_count();
+	event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		goto out;
 
-	pc = preempt_count();
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_BRANCH;
 
 	/* Strip off the path, only save the file */
 	p = f->file + strlen(f->file);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e720c00..2aa1c9f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -189,16 +189,15 @@ void trace_hw_branch(u64 from, u64 to)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, from);
-	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event);
+	trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 104ddeb..c401b90 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,19 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
+	int pc = preempt_count();
 
-	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
+					  sizeof(*entry), 0, pc);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
 	}
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
-	entry->ent.type			= TRACE_MMIO_RW;
 	entry->rw			= *rw;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -335,19 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
+	int pc = preempt_count();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
+					  sizeof(*entry), 0, pc);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
 	}
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
-	entry->ent.type			= TRACE_MMIO_MAP;
 	entry->map			= *map;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
 void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 3b1a292..bfc21f8 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -124,17 +124,13 @@ void trace_power_end(struct power_trace *it)
 	it->end = ktime_get();
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -159,17 +155,13 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
 	it->end = it->stamp;
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
-- 
cgit v1.1


From b6f11df26fdc28324cf9c9e3b77f2dc985c1bb13 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 18:02:00 -0200
Subject: trace: Call tracing_reset_online_cpus before tracer->init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

To make it easy for ftrace plugin writers, as this was open coded in
the existing plugins

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 |  8 +++++++-
 kernel/trace/trace.h                 |  1 +
 kernel/trace/trace_branch.c          |  1 -
 kernel/trace/trace_functions.c       | 17 +++--------------
 kernel/trace/trace_functions_graph.c |  1 -
 kernel/trace/trace_hw_branches.c     |  1 -
 kernel/trace/trace_nop.c             |  1 -
 kernel/trace/trace_sched_switch.c    |  8 +-------
 kernel/trace/trace_selftest.c        | 18 +++++++++---------
 kernel/trace/trace_sysprof.c         | 14 ++++----------
 10 files changed, 25 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8fad377..ef4dbac 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2171,6 +2171,12 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+int tracer_init(struct tracer *t, struct trace_array *tr)
+{
+	tracing_reset_online_cpus(tr);
+	return t->init(tr);
+}
+
 static int tracing_set_tracer(const char *buf)
 {
 	struct trace_array *tr = &global_trace;
@@ -2195,7 +2201,7 @@ static int tracing_set_tracer(const char *buf)
 
 	current_trace = t;
 	if (t->init) {
-		ret = t->init(tr);
+		ret = tracer_init(t, tr);
 		if (ret)
 			goto out;
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e03f157..f2742fb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -395,6 +395,7 @@ struct trace_iterator {
 	cpumask_var_t		started;
 };
 
+int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 48b2196..f8ae2c5 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -131,7 +131,6 @@ static void stop_branch_trace(struct trace_array *tr)
 
 static int branch_trace_init(struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
 	start_branch_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index d067cea..36bf956 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -24,32 +24,21 @@ static struct trace_array	*func_trace;
 static void tracing_start_function_trace(void);
 static void tracing_stop_function_trace(void);
 
-static void start_function_trace(struct trace_array *tr)
+static int function_trace_init(struct trace_array *tr)
 {
 	func_trace = tr;
 	tr->cpu = get_cpu();
-	tracing_reset_online_cpus(tr);
 	put_cpu();
 
 	tracing_start_cmdline_record();
 	tracing_start_function_trace();
-}
-
-static void stop_function_trace(struct trace_array *tr)
-{
-	tracing_stop_function_trace();
-	tracing_stop_cmdline_record();
-}
-
-static int function_trace_init(struct trace_array *tr)
-{
-	start_function_trace(tr);
 	return 0;
 }
 
 static void function_trace_reset(struct trace_array *tr)
 {
-	stop_function_trace(tr);
+	tracing_stop_function_trace();
+	tracing_stop_cmdline_record();
 }
 
 static void function_trace_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c97594d..222f97d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -56,7 +56,6 @@ static int graph_trace_init(struct trace_array *tr)
 					&trace_graph_entry);
 	if (ret)
 		return ret;
-	tracing_reset_online_cpus(tr);
 	tracing_start_cmdline_record();
 
 	return 0;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 2aa1c9f..ca4bbcf 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -132,7 +132,6 @@ static int bts_trace_init(struct trace_array *tr)
 	hw_branch_trace = tr;
 
 	register_hotcpu_notifier(&bts_hotcpu_notifier);
-	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
 
 	return 0;
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 087b6cb..9aa84bd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -48,7 +48,6 @@ static void stop_nop_trace(struct trace_array *tr)
 static int nop_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
-	tracing_reset_online_cpus(tr);
 	start_nop_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c4f9add..30e14fe 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -185,12 +185,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
 	ctx_trace = tr;
 }
 
-static void start_sched_trace(struct trace_array *tr)
-{
-	tracing_reset_online_cpus(tr);
-	tracing_start_sched_switch_record();
-}
-
 static void stop_sched_trace(struct trace_array *tr)
 {
 	tracing_stop_sched_switch_record();
@@ -199,7 +193,7 @@ static void stop_sched_trace(struct trace_array *tr)
 static int sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
-	start_sched_trace(tr);
+	tracing_start_sched_switch_record();
 	return 0;
 }
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5013812..445700e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -115,7 +115,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	ftrace_set_filter(func_name, strlen(func_name), 1);
 
 	/* enable tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
@@ -189,7 +189,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	ftrace_enabled = 1;
 	tracer_enabled = 1;
 
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
@@ -236,7 +236,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -290,7 +290,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	}
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -344,7 +344,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	}
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
@@ -476,7 +476,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	wait_for_completion(&isrt);
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -537,7 +537,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -569,7 +569,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return 0;
@@ -596,7 +596,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index eaca5ad..84ca9d8 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -226,15 +226,6 @@ static void stop_stack_timers(void)
 		stop_stack_timer(cpu);
 }
 
-static void start_stack_trace(struct trace_array *tr)
-{
-	mutex_lock(&sample_timer_lock);
-	tracing_reset_online_cpus(tr);
-	start_stack_timers();
-	tracer_enabled = 1;
-	mutex_unlock(&sample_timer_lock);
-}
-
 static void stop_stack_trace(struct trace_array *tr)
 {
 	mutex_lock(&sample_timer_lock);
@@ -247,7 +238,10 @@ static int stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
-	start_stack_trace(tr);
+	mutex_lock(&sample_timer_lock);
+	start_stack_timers();
+	tracer_enabled = 1;
+	mutex_unlock(&sample_timer_lock);
 	return 0;
 }
 
-- 
cgit v1.1


From 1830b52d0de8c60c4f5dfbac134aa8f69d815801 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 7 Feb 2009 19:38:43 -0500
Subject: trace: remove deprecated entry->cpu

Impact: fix to prevent developers from using entry->cpu

With the new ring buffer infrastructure, the cpu for the entry is
implicit with which CPU buffer it is on.

The original code use to record the current cpu into the generic
entry header, which can be retrieved by entry->cpu. When the
ring buffer was introduced, the users were convert to use the
the cpu number of which cpu ring buffer was in use (this was passed
to the tracers by the iterator: iter->cpu).

Unfortunately, the cpu item in the entry structure was never removed.
This allowed for developers to use it instead of the proper iter->cpu,
unknowingly, using an uninitialized variable. This was not the fault
of the developers, since it would seem like the logical place to
retrieve the cpu identifier.

This patch removes the cpu item from the entry structure and fixes
all the users that should have been using iter->cpu.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c             | 2 +-
 kernel/trace/trace.h             | 1 -
 kernel/trace/trace_hw_branches.c | 3 +--
 kernel/trace/trace_output.c      | 6 +++---
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd51cf0..bd4d9f8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1531,7 +1531,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
 		SEQ_PUT_FIELD_RET(s, entry->pid);
-		SEQ_PUT_FIELD_RET(s, entry->cpu);
+		SEQ_PUT_FIELD_RET(s, iter->cpu);
 		SEQ_PUT_FIELD_RET(s, iter->ts);
 	}
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f0c7a0f..5efc4c7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -45,7 +45,6 @@ enum trace_type {
  */
 struct trace_entry {
 	unsigned char		type;
-	unsigned char		cpu;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index fff3545..549238a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -159,7 +159,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	trace_assign_type(it, entry);
 
 	if (entry->type == TRACE_HW_BRANCHES) {
-		if (trace_seq_printf(seq, "%4d  ", entry->cpu) &&
+		if (trace_seq_printf(seq, "%4d  ", iter->cpu) &&
 		    seq_print_ip_sym(seq, it->to, symflags) &&
 		    trace_seq_printf(seq, "\t  <-  ") &&
 		    seq_print_ip_sym(seq, it->from, symflags) &&
@@ -195,7 +195,6 @@ void trace_hw_branch(u64 from, u64 to)
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, from);
 	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
 	ring_buffer_unlock_commit(tr->buffer, event, irq2);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b7380ee..463a310 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter)
 	unsigned long secs = (unsigned long)t;
 
 	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
-				comm, entry->pid, entry->cpu, secs, usec_rem);
+				comm, entry->pid, iter->cpu, secs, usec_rem);
 }
 
 int trace_print_lat_context(struct trace_iterator *iter)
@@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 		char *comm = trace_find_cmdline(entry->pid);
 		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
-				       entry->pid, entry->cpu, entry->flags,
+				       entry->pid, iter->cpu, entry->flags,
 				       entry->preempt_count, iter->idx,
 				       ns2usecs(iter->ts),
 				       abs_usecs / USEC_PER_MSEC,
@@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 				       rel_usecs / USEC_PER_MSEC,
 				       rel_usecs % USEC_PER_MSEC);
 	} else {
-		ret = lat_print_generic(s, entry, entry->cpu);
+		ret = lat_print_generic(s, entry, iter->cpu);
 		if (ret)
 			ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
-- 
cgit v1.1


From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Feb 2009 18:43:07 -0500
Subject: ring-buffer: add NMI protection for spinlocks

Impact: prevent deadlock in NMI

The ring buffers are not yet totally lockless with writing to
the buffer. When a writer crosses a page, it grabs a per cpu spinlock
to protect against a reader. The spinlocks taken by a writer are not
to protect against other writers, since a writer can only write to
its own per cpu buffer. The spinlocks protect against readers that
can touch any cpu buffer. The writers are made to be reentrant
with the spinlocks disabling interrupts.

The problem arises when an NMI writes to the buffer, and that write
crosses a page boundary. If it grabs a spinlock, it can be racing
with another writer (since disabling interrupts does not protect
against NMIs) or with a reader on the same CPU. Luckily, most of the
users are not reentrant and protects against this issue. But if a
user of the ring buffer becomes reentrant (which is what the ring
buffers do allow), if the NMI also writes to the ring buffer then
we risk the chance of a deadlock.

This patch moves the ftrace_nmi_enter called by nmi_enter() to the
ring buffer code. It replaces the current ftrace_nmi_enter that is
used by arch specific code to arch_ftrace_nmi_enter and updates
the Kconfig to handle it.

When an NMI is called, it will set a per cpu variable in the ring buffer
code and will clear it when the NMI exits. If a write to the ring buffer
crosses page boundaries inside an NMI, a trylock is used on the spin
lock instead. If the spinlock fails to be acquired, then the entry
is discarded.

This bug appeared in the ftrace work in the RT tree, where event tracing
is reentrant. This workaround solved the deadlocks that appeared there.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Kconfig       |  8 ++++++++
 kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28f2644..25131a5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
 	bool
 
+config HAVE_FTRACE_NMI_ENTER
+	bool
+
 config HAVE_FUNCTION_TRACER
 	bool
 
@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
 config RING_BUFFER
 	bool
 
+config FTRACE_NMI_ENTER
+       bool
+       depends on HAVE_FTRACE_NMI_ENTER
+       default y
+
 config TRACING
 	bool
 	select DEBUG_FS
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d737..a60a6a8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
@@ -19,6 +20,35 @@
 #include "trace.h"
 
 /*
+ * Since the write to the buffer is still not fully lockless,
+ * we must be careful with NMIs. The locks in the writers
+ * are taken when a write crosses to a new page. The locks
+ * protect against races with the readers (this will soon
+ * be fixed with a lockless solution).
+ *
+ * Because we can not protect against NMIs, and we want to
+ * keep traces reentrant, we need to manage what happens
+ * when we are in an NMI.
+ */
+static DEFINE_PER_CPU(int, rb_in_nmi);
+
+void ftrace_nmi_enter(void)
+{
+	__get_cpu_var(rb_in_nmi)++;
+	/* call arch specific handler too */
+	arch_ftrace_nmi_enter();
+}
+
+void ftrace_nmi_exit(void)
+{
+	arch_ftrace_nmi_exit();
+	__get_cpu_var(rb_in_nmi)--;
+	/* NMIs are not recursive */
+	WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
+}
+
+
+/*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
  * prevents all ring buffers from being recorded to.
@@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 	unsigned long flags;
+	bool lock_taken = false;
 
 	commit_page = cpu_buffer->commit_page;
 	/* we just need to protect against interrupts */
@@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		struct buffer_page *next_page = tail_page;
 
 		local_irq_save(flags);
-		__raw_spin_lock(&cpu_buffer->lock);
+		/*
+		 * NMIs can happen after we take the lock.
+		 * If we are in an NMI, only take the lock
+		 * if it is not already taken. Otherwise
+		 * simply fail.
+		 */
+		if (unlikely(__get_cpu_var(rb_in_nmi))) {
+			if (!__raw_spin_trylock(&cpu_buffer->lock))
+				goto out_unlock;
+		} else
+			__raw_spin_lock(&cpu_buffer->lock);
+
+		lock_taken = true;
 
 		rb_inc_page(cpu_buffer, &next_page);
 
@@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
 
-	__raw_spin_unlock(&cpu_buffer->lock);
+	if (likely(lock_taken))
+		__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
 }
-- 
cgit v1.1


From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 6 Feb 2009 01:45:16 -0500
Subject: ring-buffer: use generic version of in_nmi

Impact: clean up

Now that a generic in_nmi is available, this patch removes the
special code in the ring_buffer and implements the in_nmi generic
version instead.

With this change, I was also able to rename the "arch_ftrace_nmi_enter"
back to "ftrace_nmi_enter" and remove the code from the ring buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------
 1 file changed, 13 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a60a6a8..5ee3444 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,6 +8,7 @@
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
+#include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -20,35 +21,6 @@
 #include "trace.h"
 
 /*
- * Since the write to the buffer is still not fully lockless,
- * we must be careful with NMIs. The locks in the writers
- * are taken when a write crosses to a new page. The locks
- * protect against races with the readers (this will soon
- * be fixed with a lockless solution).
- *
- * Because we can not protect against NMIs, and we want to
- * keep traces reentrant, we need to manage what happens
- * when we are in an NMI.
- */
-static DEFINE_PER_CPU(int, rb_in_nmi);
-
-void ftrace_nmi_enter(void)
-{
-	__get_cpu_var(rb_in_nmi)++;
-	/* call arch specific handler too */
-	arch_ftrace_nmi_enter();
-}
-
-void ftrace_nmi_exit(void)
-{
-	arch_ftrace_nmi_exit();
-	__get_cpu_var(rb_in_nmi)--;
-	/* NMIs are not recursive */
-	WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
-}
-
-
-/*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
  * prevents all ring buffers from being recorded to.
@@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 		local_irq_save(flags);
 		/*
+		 * Since the write to the buffer is still not
+		 * fully lockless, we must be careful with NMIs.
+		 * The locks in the writers are taken when a write
+		 * crosses to a new page. The locks protect against
+		 * races with the readers (this will soon be fixed
+		 * with a lockless solution).
+		 *
+		 * Because we can not protect against NMIs, and we
+		 * want to keep traces reentrant, we need to manage
+		 * what happens when we are in an NMI.
+		 *
 		 * NMIs can happen after we take the lock.
 		 * If we are in an NMI, only take the lock
 		 * if it is not already taken. Otherwise
 		 * simply fail.
 		 */
-		if (unlikely(__get_cpu_var(rb_in_nmi))) {
+		if (unlikely(in_nmi())) {
 			if (!__raw_spin_trylock(&cpu_buffer->lock))
 				goto out_unlock;
 		} else
-- 
cgit v1.1


From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Fri, 6 Feb 2009 17:33:27 +0800
Subject: trace: trivial fixes in comment typos.

Impact: clean up

Fixed several typos in the comments.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 6 +++---
 kernel/trace/trace.h  | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6861003..1796e01 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	 * it is not enabled then do nothing.
 	 *
 	 * If this record is not to be traced and
-	 * it is enabled then disabled it.
+	 * it is enabled then disable it.
 	 *
 	 */
 	if (rec->flags & FTRACE_FL_NOTRACE) {
@@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 		if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
 			return 0;
 
-		/* Record is not filtered and is not enabled do nothing */
+		/* Record is not filtered or enabled, do nothing */
 		if (!fl)
 			return 0;
 
@@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 		} else {
 
-			/* if record is not enabled do nothing */
+			/* if record is not enabled, do nothing */
 			if (!(rec->flags & FTRACE_FL_ENABLED))
 				return 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5efc4c7..f92aba5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -616,12 +616,12 @@ extern struct tracer nop_trace;
  * preempt_enable (after a disable), a schedule might take place
  * causing an infinite recursion.
  *
- * To prevent this, we read the need_recshed flag before
+ * To prevent this, we read the need_resched flag before
  * disabling preemption. When we want to enable preemption we
  * check the flag, if it is set, then we call preempt_enable_no_resched.
  * Otherwise, we call preempt_enable.
  *
- * The rational for doing the above is that if need resched is set
+ * The rational for doing the above is that if need_resched is set
  * and we have yet to reschedule, we are either in an atomic location
  * (where we do not need to check for scheduling) or we are inside
  * the scheduler and do not want to resched.
@@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void)
  *
  * This is a scheduler safe way to enable preemption and not miss
  * any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we were either inside an atomic or
+ * If resched is set, then we are either inside an atomic or
  * are inside the scheduler (we would have already scheduled
  * otherwise). In this case, we do not want to call normal
  * preempt_enable, but preempt_enable_no_resched instead.
-- 
cgit v1.1


From 2db270a80b8f2238e536876cfb3987af02684df8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Feb 2009 20:46:45 +0100
Subject: tracing/blktrace: move the tracing file to kernel/trace

Impact: cleanup

Move blktrace.c to kernel/trace, also move its config entry.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig    |   23 +
 kernel/trace/Makefile   |    1 +
 kernel/trace/blktrace.c | 1538 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1562 insertions(+)
 create mode 100644 kernel/trace/blktrace.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 25131a5..4fee43c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -302,6 +302,29 @@ config WORKQUEUE_TRACER
           For example it can help a developer to decide whether he should
           choose a per cpu workqueue instead of a singlethreaded one.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	depends on SYSFS
+	select RELAY
+	select DEBUG_FS
+	select TRACEPOINTS
+	select TRACING
+	select STACKTRACE
+	help
+	  Say Y here if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the userspace
+	  support tools needed), fetch the blktrace tools from:
+
+	  git://git.kernel.dk/blktrace.git
+
+	  Tracing also is possible using the ftrace interface, e.g.:
+
+	    echo 1 > /sys/block/sda/sda1/trace/enable
+	    echo blk > /sys/kernel/debug/tracing/current_tracer
+	    cat /sys/kernel/debug/tracing/trace_pipe
+
+	  If unsure, say N.
 
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f76d48f..627090b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
new file mode 100644
index 0000000..3b91da0
--- /dev/null
+++ b/kernel/trace/blktrace.c
@@ -0,0 +1,1538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/time.h>
+#include <trace/block.h>
+#include <linux/uaccess.h>
+#include "trace_output.h"
+
+static unsigned int blktrace_seq __read_mostly = 1;
+
+static struct trace_array *blk_tr;
+static int __read_mostly  blk_tracer_enabled;
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_BLK_OPT_CLASSIC 	0x1
+
+static struct tracer_opt blk_tracer_opts[] = {
+	/* Default disable the minimalistic output */
+	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+	{ }
+};
+
+static struct tracer_flags blk_tracer_flags = {
+	.val  = 0,
+	.opts = blk_tracer_opts,
+};
+
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
+/*
+ * Send out a notify message.
+ */
+static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+		       const void *data, size_t len)
+{
+	struct blk_io_trace *t;
+
+	if (!bt->rchan)
+		return;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + len);
+	if (t) {
+		const int cpu = smp_processor_id();
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->time = ktime_to_ns(ktime_get());
+		t->device = bt->dev;
+		t->action = action;
+		t->pid = pid;
+		t->cpu = cpu;
+		t->pdu_len = len;
+		memcpy((void *) t + sizeof(*t), data, len);
+	}
+}
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	tsk->btrace_seq = blktrace_seq;
+	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
+}
+
+static void trace_note_time(struct blk_trace *bt)
+{
+	struct timespec now;
+	unsigned long flags;
+	u32 words[2];
+
+	getnstimeofday(&now);
+	words[0] = now.tv_sec;
+	words[1] = now.tv_nsec;
+
+	local_irq_save(flags);
+	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+	local_irq_restore(flags);
+}
+
+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+	int n;
+	va_list args;
+	unsigned long flags;
+	char *buf;
+
+	if (blk_tr) {
+		va_start(args, fmt);
+		ftrace_vprintk(fmt, args);
+		va_end(args);
+		return;
+	}
+
+	if (!bt->msg_data)
+		return;
+
+	local_irq_save(flags);
+	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+	va_start(args, fmt);
+	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
+	va_end(args);
+
+	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ),
+					 BLK_TC_ACT(BLK_TC_WRITE) };
+
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
+	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct ring_buffer_event *event = NULL;
+	struct blk_io_trace *t;
+	unsigned long flags = 0;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu, pc = 0;
+
+	if (unlikely(bt->trace_state != Blktrace_running ||
+		     !blk_tracer_enabled))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= MASK_TC_BIT(rw, BARRIER);
+	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, AHEAD);
+	what |= MASK_TC_BIT(rw, META);
+	what |= MASK_TC_BIT(rw, DISCARD);
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+	cpu = raw_smp_processor_id();
+
+	if (blk_tr) {
+		tracing_record_cmdline(current);
+
+		pc = preempt_count();
+		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+						  sizeof(*t) + pdu_len,
+						  0, pc);
+		if (!event)
+			return;
+		t = ring_buffer_event_data(event);
+		goto record_it;
+	}
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes.
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = ktime_to_ns(ktime_get());
+record_it:
+		/*
+		 * These two are not needed in ftrace as they are in the
+		 * generic trace_entry, filled by tracing_generic_entry_update,
+		 * but for the trace_event->bin() synthesizer benefit we do it
+		 * here too.
+		 */
+		t->cpu = cpu;
+		t->pid = pid;
+
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->device = bt->dev;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+
+		if (blk_tr) {
+			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
+			return;
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+static struct dentry *blk_tree_root;
+static DEFINE_MUTEX(blk_tree_mutex);
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	debugfs_remove(bt->msg_file);
+	debugfs_remove(bt->dropped_file);
+	relay_close(bt->rchan);
+	free_percpu(bt->sequence);
+	free_percpu(bt->msg_data);
+	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+int blk_trace_remove(struct request_queue *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_remove);
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static const struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+static int blk_msg_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+
+	return 0;
+}
+
+static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	char *msg;
+	struct blk_trace *bt;
+
+	if (count > BLK_TN_MAX_MSG)
+		return -EINVAL;
+
+	msg = kmalloc(count, GFP_KERNEL);
+	if (msg == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(msg, buffer, count)) {
+		kfree(msg);
+		return -EFAULT;
+	}
+
+	bt = filp->private_data;
+	__trace_note_message(bt, "%s", msg);
+	kfree(msg);
+
+	return count;
+}
+
+static const struct file_operations blk_msg_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_msg_open,
+	.write =	blk_msg_write,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	debugfs_remove(dentry);
+
+	/*
+	* this will fail for all but the last file, but that is ok. what we
+	* care about is the top level buts->name directory going away, when
+	* the last trace file is gone. Then we don't have to rmdir() that
+	* manually on trace stop, so it nicely solves the issue with
+	* force killing of running traces.
+	*/
+
+	debugfs_remove(parent);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			struct blk_user_trace_setup *buts)
+{
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	int ret, i;
+
+	if (!buts->buf_size || !buts->buf_nr)
+		return -EINVAL;
+
+	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
+	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts->name); i++)
+		if (buts->name[i] == '/')
+			buts->name[i] = '_';
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	if (!bt->msg_data)
+		goto err;
+
+	ret = -ENOENT;
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			return -ENOMEM;
+	}
+
+	dir = debugfs_create_dir(buts->name, blk_tree_root);
+
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
+					       &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+	if (!bt->msg_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts->buf_size,
+				buts->buf_nr, &blk_relay_callbacks, bt);
+	if (!bt->rchan)
+		goto err;
+
+	bt->act_mask = buts->act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts->start_lba;
+	bt->end_lba = buts->end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts->pid;
+	bt->trace_state = Blktrace_setup;
+
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1) {
+		ret = blk_register_tracepoints();
+		if (ret)
+			goto probe_err;
+	}
+	mutex_unlock(&blk_probe_mutex);
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+probe_err:
+	atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
+err:
+	if (bt) {
+		if (bt->msg_file)
+			debugfs_remove(bt->msg_file);
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		free_percpu(bt->sequence);
+		free_percpu(bt->msg_data);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	int ret;
+
+	ret = copy_from_user(&buts, arg, sizeof(buts));
+	if (ret)
+		return -EFAULT;
+
+	ret = do_blk_trace_setup(q, name, dev, &buts);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_setup);
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+	int ret;
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+
+			trace_note_time(bt);
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_trace_startstop);
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	struct request_queue *q;
+	int ret, start = 0;
+	char b[BDEVNAME_SIZE];
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		bdevname(bdev, b);
+		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(struct request_queue *q)
+{
+	if (q->blk_trace) {
+		blk_trace_startstop(q, 0);
+		blk_trace_remove(q);
+	}
+}
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+				sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				rw, what, rq->errors, 0, NULL);
+	}
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q,
+				     struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q,
+				      struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q,
+					struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+					 struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q,
+				struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+	}
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q,
+				  struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+					0, 0, NULL);
+	}
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt)
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+				unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:		queue the io is for
+ * @rq:		io request
+ * @data:	driver-specific data
+ * @len:	length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+			 struct request *rq,
+			 void *data, size_t len)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq))
+		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+				rq->errors, len, data);
+	else
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static int blk_register_tracepoints(void)
+{
+	int ret;
+
+	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+	WARN_ON(ret);
+	ret = register_trace_block_getrq(blk_add_trace_getrq);
+	WARN_ON(ret);
+	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+	WARN_ON(ret);
+	ret = register_trace_block_plug(blk_add_trace_plug);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+	WARN_ON(ret);
+	ret = register_trace_block_split(blk_add_trace_split);
+	WARN_ON(ret);
+	ret = register_trace_block_remap(blk_add_trace_remap);
+	WARN_ON(ret);
+	return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+	unregister_trace_block_remap(blk_add_trace_remap);
+	unregister_trace_block_split(blk_add_trace_split);
+	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	unregister_trace_block_plug(blk_add_trace_plug);
+	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+	unregister_trace_block_getrq(blk_add_trace_getrq);
+	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+	tracepoint_synchronize_unregister();
+}
+
+/*
+ * struct blk_io_tracer formatting routines
+ */
+
+static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+{
+	int i = 0;
+
+	if (t->action & BLK_TC_DISCARD)
+		rwbs[i++] = 'D';
+	else if (t->action & BLK_TC_WRITE)
+		rwbs[i++] = 'W';
+	else if (t->bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (t->action & BLK_TC_AHEAD)
+		rwbs[i++] = 'A';
+	if (t->action & BLK_TC_BARRIER)
+		rwbs[i++] = 'B';
+	if (t->action & BLK_TC_SYNC)
+		rwbs[i++] = 'S';
+	if (t->action & BLK_TC_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+static inline
+const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+{
+	return (const struct blk_io_trace *)ent;
+}
+
+static inline const void *pdu_start(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent) + 1;
+}
+
+static inline u32 t_sec(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes >> 9;
+}
+
+static inline unsigned long long t_sector(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->sector;
+}
+
+static inline __u16 t_error(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->sector;
+}
+
+static __u64 get_pdu_int(const struct trace_entry *ent)
+{
+	const __u64 *val = pdu_start(ent);
+	return be64_to_cpu(*val);
+}
+
+static void get_pdu_remap(const struct trace_entry *ent,
+			  struct blk_io_trace_remap *r)
+{
+	const struct blk_io_trace_remap *__r = pdu_start(ent);
+	__u64 sector = __r->sector;
+
+	r->device = be32_to_cpu(__r->device);
+	r->device_from = be32_to_cpu(__r->device_from);
+	r->sector = be64_to_cpu(sector);
+}
+
+static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
+{
+	char rwbs[6];
+	unsigned long long ts  = ns2usecs(iter->ts);
+	unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
+	unsigned secs	       = (unsigned long)ts;
+	const struct trace_entry *ent = iter->ent;
+	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
+
+	fill_rwbs(rwbs, t);
+
+	return trace_seq_printf(&iter->seq,
+				"%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
+				MAJOR(t->device), MINOR(t->device), iter->cpu,
+				secs, usec_rem, ent->pid, act, rwbs);
+}
+
+static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
+			      const char *act)
+{
+	char rwbs[6];
+	fill_rwbs(rwbs, t);
+	return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
+				MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const char *cmd = trace_find_cmdline(ent->pid);
+
+	if (t_sec(ent))
+		return trace_seq_printf(s, "%llu + %u [%s]\n",
+					t_sector(ent), t_sec(ent), cmd);
+	return trace_seq_printf(s, "[%s]\n", cmd);
+}
+
+static int blk_log_with_error(struct trace_seq *s,
+			      const struct trace_entry *ent)
+{
+	if (t_sec(ent))
+		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
+					t_sec(ent), t_error(ent));
+	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+}
+
+static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+{
+	struct blk_io_trace_remap r = { .device = 0, };
+
+	get_pdu_remap(ent, &r);
+	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+			       t_sector(ent),
+			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
+			       (unsigned long long)r.sector);
+}
+
+static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
+}
+
+static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
+				get_pdu_int(ent));
+}
+
+static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+				get_pdu_int(ent), trace_find_cmdline(ent->pid));
+}
+
+/*
+ * struct tracer operations
+ */
+
+static void blk_tracer_print_header(struct seq_file *m)
+{
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+		return;
+	seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
+		    "#  |     |     |           |   |   |\n");
+}
+
+static void blk_tracer_start(struct trace_array *tr)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		if (blk_register_tracepoints())
+			atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
+	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+}
+
+static int blk_tracer_init(struct trace_array *tr)
+{
+	blk_tr = tr;
+	blk_tracer_start(tr);
+	mutex_lock(&blk_probe_mutex);
+	blk_tracer_enabled++;
+	mutex_unlock(&blk_probe_mutex);
+	return 0;
+}
+
+static void blk_tracer_stop(struct trace_array *tr)
+{
+	trace_flags |= TRACE_ITER_CONTEXT_INFO;
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static void blk_tracer_reset(struct trace_array *tr)
+{
+	if (!atomic_read(&blk_probes_ref))
+		return;
+
+	mutex_lock(&blk_probe_mutex);
+	blk_tracer_enabled--;
+	WARN_ON(blk_tracer_enabled < 0);
+	mutex_unlock(&blk_probe_mutex);
+
+	blk_tracer_stop(tr);
+}
+
+static struct {
+	const char *act[2];
+	int 	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+} what2act[] __read_mostly = {
+	[__BLK_TA_QUEUE]	= {{  "Q", "queue" }, 	   blk_log_generic },
+	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
+	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
+	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
+	[__BLK_TA_SLEEPRQ]	= {{  "S", "sleeprq" },	   blk_log_generic },
+	[__BLK_TA_REQUEUE]	= {{  "R", "requeue" },	   blk_log_with_error },
+	[__BLK_TA_ISSUE]	= {{  "D", "issue" },	   blk_log_generic },
+	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
+	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
+	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
+	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
+	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
+	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
+	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
+	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
+};
+
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+					       int flags)
+{
+	struct trace_seq *s = &iter->seq;
+	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
+	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	int ret;
+
+	if (!trace_print_context(iter))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+	else {
+		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
+		if (ret)
+			ret = what2act[what].print(s, iter->ent);
+	}
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
+	const int offset = offsetof(struct blk_io_trace, sector);
+	struct blk_io_trace old = {
+		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
+		.time     = ns2usecs(iter->ts),
+	};
+
+	if (!trace_seq_putmem(s, &old, offset))
+		return 0;
+	return trace_seq_putmem(s, &t->sector,
+				sizeof(old) - offset + t->pdu_len);
+}
+
+static enum print_line_t
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+{
+	return blk_trace_synthesize_old_trace(iter) ?
+			TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
+{
+	const struct blk_io_trace *t;
+	u16 what;
+	int ret;
+
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+		return TRACE_TYPE_UNHANDLED;
+
+	t = (const struct blk_io_trace *)iter->ent;
+	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+
+	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
+	else {
+		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+		ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
+		if (ret)
+			ret = what2act[what].print(&iter->seq, iter->ent);
+	}
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct tracer blk_tracer __read_mostly = {
+	.name		= "blk",
+	.init		= blk_tracer_init,
+	.reset		= blk_tracer_reset,
+	.start		= blk_tracer_start,
+	.stop		= blk_tracer_stop,
+	.print_header	= blk_tracer_print_header,
+	.print_line	= blk_tracer_print_line,
+	.flags		= &blk_tracer_flags,
+};
+
+static struct trace_event trace_blk_event = {
+	.type	 	= TRACE_BLK,
+	.trace		= blk_trace_event_print,
+	.latency_trace	= blk_trace_event_print,
+	.binary		= blk_trace_event_print_binary,
+};
+
+static int __init init_blk_tracer(void)
+{
+	if (!register_ftrace_event(&trace_blk_event)) {
+		pr_warning("Warning: could not register block events\n");
+		return 1;
+	}
+
+	if (register_tracer(&blk_tracer) != 0) {
+		pr_warning("Warning: could not register the block tracer\n");
+		unregister_ftrace_event(&trace_blk_event);
+		return 1;
+	}
+
+	return 0;
+}
+
+device_initcall(init_blk_tracer);
+
+static int blk_trace_remove_queue(struct request_queue *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (bt == NULL)
+		return -EINVAL;
+
+	kfree(bt);
+	return 0;
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+{
+	struct blk_trace *old_bt, *bt = NULL;
+	int ret;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->dev = dev;
+	bt->act_mask = (u16)-1;
+	bt->end_lba = -1ULL;
+	bt->trace_state = Blktrace_running;
+
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt != NULL) {
+		(void)xchg(&q->blk_trace, old_bt);
+		kfree(bt);
+		ret = -EBUSY;
+	}
+	return 0;
+err:
+	return ret;
+}
+
+/*
+ * sysfs interface to enable and configure tracing
+ */
+
+static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct block_device *bdev;
+	ssize_t ret = -ENXIO;
+
+	lock_kernel();
+	bdev = bdget(part_devt(p));
+	if (bdev != NULL) {
+		struct request_queue *q = bdev_get_queue(bdev);
+
+		if (q != NULL) {
+			mutex_lock(&bdev->bd_mutex);
+			ret = sprintf(buf, "%u\n", !!q->blk_trace);
+			mutex_unlock(&bdev->bd_mutex);
+		}
+
+		bdput(bdev);
+	}
+
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	struct request_queue *q;
+	struct hd_struct *p;
+	int value;
+	ssize_t ret = -ENXIO;
+
+	if (count == 0 || sscanf(buf, "%d", &value) != 1)
+		goto out;
+
+	lock_kernel();
+	p = dev_to_part(dev);
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+
+	mutex_lock(&bdev->bd_mutex);
+	if (value)
+		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+	else
+		ret = blk_trace_remove_queue(q);
+	mutex_unlock(&bdev->bd_mutex);
+
+	if (ret == 0)
+		ret = count;
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+out:
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count);
+#define BLK_TRACE_DEVICE_ATTR(_name) \
+	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
+		    sysfs_blk_trace_attr_show, \
+		    sysfs_blk_trace_attr_store)
+
+static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
+		   sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
+static BLK_TRACE_DEVICE_ATTR(act_mask);
+static BLK_TRACE_DEVICE_ATTR(pid);
+static BLK_TRACE_DEVICE_ATTR(start_lba);
+static BLK_TRACE_DEVICE_ATTR(end_lba);
+
+static struct attribute *blk_trace_attrs[] = {
+	&dev_attr_enable.attr,
+	&dev_attr_act_mask.attr,
+	&dev_attr_pid.attr,
+	&dev_attr_start_lba.attr,
+	&dev_attr_end_lba.attr,
+	NULL
+};
+
+struct attribute_group blk_trace_attr_group = {
+	.name  = "trace",
+	.attrs = blk_trace_attrs,
+};
+
+static int blk_str2act_mask(const char *str)
+{
+	int mask = 0;
+	char *copy = kstrdup(str, GFP_KERNEL), *s;
+
+	if (copy == NULL)
+		return -ENOMEM;
+
+	s = strstrip(copy);
+
+	while (1) {
+		char *sep = strchr(s, ',');
+
+		if (sep != NULL)
+			*sep = '\0';
+
+		if (strcasecmp(s, "barrier") == 0)
+			mask |= BLK_TC_BARRIER;
+		else if (strcasecmp(s, "complete") == 0)
+			mask |= BLK_TC_COMPLETE;
+		else if (strcasecmp(s, "fs") == 0)
+			mask |= BLK_TC_FS;
+		else if (strcasecmp(s, "issue") == 0)
+			mask |= BLK_TC_ISSUE;
+		else if (strcasecmp(s, "pc") == 0)
+			mask |= BLK_TC_PC;
+		else if (strcasecmp(s, "queue") == 0)
+			mask |= BLK_TC_QUEUE;
+		else if (strcasecmp(s, "read") == 0)
+			mask |= BLK_TC_READ;
+		else if (strcasecmp(s, "requeue") == 0)
+			mask |= BLK_TC_REQUEUE;
+		else if (strcasecmp(s, "sync") == 0)
+			mask |= BLK_TC_SYNC;
+		else if (strcasecmp(s, "write") == 0)
+			mask |= BLK_TC_WRITE;
+
+		if (sep == NULL)
+			break;
+
+		s = sep + 1;
+	}
+	kfree(copy);
+
+	return mask;
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct request_queue *q;
+	struct block_device *bdev;
+	ssize_t ret = -ENXIO;
+
+	lock_kernel();
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+	mutex_lock(&bdev->bd_mutex);
+	if (q->blk_trace == NULL)
+		ret = sprintf(buf, "disabled\n");
+	else if (attr == &dev_attr_act_mask)
+		ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
+	else if (attr == &dev_attr_pid)
+		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+	else if (attr == &dev_attr_start_lba)
+		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+	else if (attr == &dev_attr_end_lba)
+		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+	mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	struct request_queue *q;
+	struct hd_struct *p;
+	u64 value;
+	ssize_t ret = -ENXIO;
+
+	if (count == 0)
+		goto out;
+
+	if (attr == &dev_attr_act_mask) {
+		if (sscanf(buf, "%llx", &value) != 1) {
+			/* Assume it is a list of trace category names */
+			value = blk_str2act_mask(buf);
+			if (value < 0)
+				goto out;
+		}
+	} else if (sscanf(buf, "%llu", &value) != 1)
+		goto out;
+
+	lock_kernel();
+	p = dev_to_part(dev);
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+
+	mutex_lock(&bdev->bd_mutex);
+	ret = 0;
+	if (q->blk_trace == NULL)
+		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+
+	if (ret == 0) {
+		if (attr == &dev_attr_act_mask)
+			q->blk_trace->act_mask = value;
+		else if (attr == &dev_attr_pid)
+			q->blk_trace->pid = value;
+		else if (attr == &dev_attr_start_lba)
+			q->blk_trace->start_lba = value;
+		else if (attr == &dev_attr_end_lba)
+			q->blk_trace->end_lba = value;
+		ret = count;
+	}
+	mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+out:
+	return ret;
+}
-- 
cgit v1.1


From 7447dce96f2233d250bc39a4a10a42f7c3dd46fc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Feb 2009 21:33:57 +0100
Subject: tracing/function-graph-tracer: provide a selftest for the function
 graph tracer

Making it more easy to do a basic regression test for this tracer.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h                 |  2 ++
 kernel/trace/trace_functions_graph.c |  3 +++
 kernel/trace/trace_selftest.c        | 50 ++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b9838f4..a011ec0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -500,6 +500,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 extern int trace_selftest_startup_function(struct tracer *trace,
 					   struct trace_array *tr);
+extern int trace_selftest_startup_function_graph(struct tracer *trace,
+						 struct trace_array *tr);
 extern int trace_selftest_startup_irqsoff(struct tracer *trace,
 					  struct trace_array *tr);
 extern int trace_selftest_startup_preemptoff(struct tracer *trace,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 222f97d..88f8d9d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -750,6 +750,9 @@ static struct tracer graph_trace __read_mostly = {
 	.print_line	= print_graph_function,
 	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_function_graph,
+#endif
 };
 
 static __init int init_graph_trace(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 445700e..0c9aa14 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,8 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_PRINT:
 	case TRACE_SPECIAL:
 	case TRACE_BRANCH:
+	case TRACE_GRAPH_ENT:
+	case TRACE_GRAPH_RET:
 		return 1;
 	}
 	return 0;
@@ -227,6 +229,54 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_FUNCTION_TRACER */
 
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * Pretty much the same than for the function tracer from which the selftest
+ * has been borrowed.
+ */
+int
+trace_selftest_startup_function_graph(struct tracer *trace,
+					struct trace_array *tr)
+{
+	int ret;
+	unsigned long count;
+
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		goto out;
+	}
+
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+
+	tracing_stop();
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+
+	trace->reset(tr);
+	tracing_start();
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	/* Don't test dynamic tracing, the function tracer already did */
+
+out:
+	/* Stop it if we failed */
+	if (ret)
+		ftrace_graph_stop();
+
+	return ret;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+
 #ifdef CONFIG_IRQSOFF_TRACER
 int
 trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
-- 
cgit v1.1


From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Feb 2009 22:16:12 +0100
Subject: tracing/power: move the power trace headers to a dedicated file

Impact: cleanup

Move the power tracer headers to trace/power.h to keep ftrace.h and power bits
more easy to maintain as separated topics.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h       | 1 +
 kernel/trace/trace_power.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a011ec0..1ecfb9d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -10,6 +10,7 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <trace/kmemtrace.h>
+#include <trace/power.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bfc21f8..b1d0d08 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,7 +11,7 @@
 
 #include <linux/init.h>
 #include <linux/debugfs.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
 #include <linux/kallsyms.h>
 #include <linux/module.h>
 
-- 
cgit v1.1


From 3861a17bcc0af815f684c6178bc9ec2d790c350e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 8 Feb 2009 00:04:02 +0100
Subject: tracing/function-graph-tracer: drop the kernel_text_address check

When the function graph tracer picks a return address, it ensures this address
is really a kernel text one by calling __kernel_text_address()

Actually this path has never been taken.Its role was more likely to debug the tracer
on the beginning of its development but this function is wasteful since it is called
for every traced function.

The fault check is already sufficient.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 4 ++--
 kernel/module.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8..0df6253 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
-__notrace_funcgraph int core_kernel_text(unsigned long addr)
+int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
 	    addr <= (unsigned long)_etext)
@@ -54,7 +54,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
 	return 0;
 }
 
-__notrace_funcgraph int __kernel_text_address(unsigned long addr)
+int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
diff --git a/kernel/module.c b/kernel/module.c
index ba22484..22d7379 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2735,7 +2735,7 @@ int is_module_address(unsigned long addr)
 
 
 /* Is this a valid kernel address? */
-__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
+struct module *__module_text_address(unsigned long addr)
 {
 	struct module *mod;
 
-- 
cgit v1.1


From b5db03c4355e568f1567758287b30a6a262d5057 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 7 Feb 2009 18:52:59 -0200
Subject: tracing: handle unregistering the current tracer

Impact: simplification

Instead of requiring that plugins have the sequence:

  my_tracer_stop(my_trace_array);
  unregister_tracer(my_tracer);

it should be possible just do a:

  unregister_tracer(my_tracer);

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 03fbd4c..93040f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -559,6 +559,15 @@ void unregister_tracer(struct tracer *type)
 
  found:
 	*t = (*t)->next;
+
+	if (type == current_trace && tracer_enabled) {
+		tracer_enabled = 0;
+		tracing_stop();
+		if (current_trace->stop)
+			current_trace->stop(&global_trace);
+		current_trace = &nop_trace;
+	}
+
 	if (strlen(type->name) != max_tracer_type_len)
 		goto out;
 
-- 
cgit v1.1


From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Fri, 6 Feb 2009 15:37:47 -0800
Subject: softlockup: remove timestamp checking from hung_task

Impact: saves sizeof(long) bytes per task_struct

By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between
tasklist scans we can avoid using timestamps.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c      |  8 +++-----
 kernel/hung_task.c | 48 +++++++++---------------------------------------
 2 files changed, 12 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index fb94442..bf582f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
 
 	tsk->mm = NULL;
 	tsk->active_mm = NULL;
@@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
-#ifdef CONFIG_DETECT_HUNG_TASK
-	p->last_switch_count = 0;
-	p->last_switch_timestamp = 0;
-#endif
-
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 3951a80..0c924de 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
  * Zero means infinite timeout - no checking done:
  */
 unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
-static unsigned long __read_mostly hung_task_poll_jiffies;
 
 unsigned long __read_mostly sysctl_hung_task_warnings = 10;
 
@@ -69,33 +68,17 @@ static struct notifier_block panic_block = {
 	.notifier_call = hung_task_panic,
 };
 
-/*
- * Returns seconds, approximately.  We don't need nanosecond
- * resolution, and we don't need to waste time with a big divide when
- * 2^30ns == 1.074s.
- */
-static unsigned long get_timestamp(void)
-{
-	int this_cpu = raw_smp_processor_id();
-
-	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
-}
-
-static void check_hung_task(struct task_struct *t, unsigned long now,
-			    unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
 
 	if (t->flags & PF_FROZEN)
 		return;
 
-	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+	if (switch_count != t->last_switch_count) {
 		t->last_switch_count = switch_count;
-		t->last_switch_timestamp = now;
 		return;
 	}
-	if ((long)(now - t->last_switch_timestamp) < timeout)
-		return;
 	if (!sysctl_hung_task_warnings)
 		return;
 	sysctl_hung_task_warnings--;
@@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 	sched_show_task(t);
 	__debug_show_held_locks(t);
 
-	t->last_switch_timestamp = now;
 	touch_nmi_watchdog();
 
 	if (sysctl_hung_task_panic)
@@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
 	int batch_count = HUNG_TASK_BATCHING;
-	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
 
 	/*
@@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, now, timeout);
+			check_hung_task(t, timeout);
 	} while_each_thread(g, t);
  unlock:
 	rcu_read_unlock();
 }
 
-static void update_poll_jiffies(void)
+static unsigned long timeout_jiffies(unsigned long timeout)
 {
 	/* timeout of 0 will disable the watchdog */
-	if (sysctl_hung_task_timeout_secs == 0)
-		hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
-	else
-		hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+	return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
 }
 
 /*
@@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 	if (ret || !write)
 		goto out;
 
-	update_poll_jiffies();
-
 	wake_up_process(watchdog_task);
 
  out:
@@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 static int watchdog(void *dummy)
 {
 	set_user_nice(current, 0);
-	update_poll_jiffies();
 
 	for ( ; ; ) {
-		unsigned long timeout;
+		unsigned long timeout = sysctl_hung_task_timeout_secs;
 
-		while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+		while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+			timeout = sysctl_hung_task_timeout_secs;
 
-		/*
-		 * Need to cache timeout here to avoid timeout being set
-		 * to 0 via sysctl while inside check_hung_*_tasks().
-		 */
-		timeout = sysctl_hung_task_timeout_secs;
-		if (timeout)
-			check_hung_uninterruptible_tasks(timeout);
+		check_hung_uninterruptible_tasks(timeout);
 	}
 
 	return 0;
-- 
cgit v1.1


From 1dfba05d0f1a9b4245bb242a7c17fe448811a520 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Feb 2009 12:06:54 +0100
Subject: tracing/blktrace: move the tracing file to kernel/trace, fix

Impact: build fix

The BLK_DEV_IO_TRACE entry used to be in block/Kconfig - which
file itself was dependent on CONFIG_BLOCK. But now the entry is
in kernel/trace/Kconfig - which is present even on !CONFIG_BLOCK.

So add a 'depends on BLOCK' to BLK_DEV_IO_TRACE.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4fee43c..3a33128 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -305,6 +305,7 @@ config WORKQUEUE_TRACER
 config BLK_DEV_IO_TRACE
 	bool "Support for tracing block io actions"
 	depends on SYSFS
+	depends on BLOCK
 	select RELAY
 	select DEBUG_FS
 	select TRACEPOINTS
-- 
cgit v1.1


From b91facc367366b3f71375f337eb5997ec9ab4e69 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 6 Feb 2009 18:30:44 +0100
Subject: tracing/function-graph-tracer: handle the leaf functions from
 trace_pipe

When one cats the trace file, the leaf functions are printed without brackets:

 function();

whereas in the trace_pipe file we'll see the following:

 function() {
 }

This is because the ring_buffer handling is not the same between those two files.
On the trace file, when an entry is printed, the iterator advanced and then we can
check the next entry.

There is no iterator with trace_pipe, the current entry to print has been peeked
and not consumed. So checking the next entry will still return the current one while
we don't consume it.

This patch introduces a new value for the output callbacks to ask the tracing
core to not consume the current entry after printing it.

We need it because we will have to consume the current entry ourself to check
the next one.

Now the trace_pipe is able to handle well the leaf functions.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 |  4 +--
 kernel/trace/trace.h                 |  7 +++---
 kernel/trace/trace_functions_graph.c | 48 ++++++++++++++++++++++--------------
 3 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 93040f1..5b1e9a9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2468,8 +2468,8 @@ waitagain:
 			iter->seq.len = len;
 			break;
 		}
-
-		trace_consume(iter);
+		if (ret != TRACE_TYPE_NO_CONSUME)
+			trace_consume(iter);
 
 		if (iter->seq.len >= cnt)
 			break;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1ecfb9d..7b0518a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -63,13 +63,13 @@ struct ftrace_entry {
 
 /* Function call entry */
 struct ftrace_graph_ent_entry {
-	struct trace_entry			ent;
+	struct trace_entry		ent;
 	struct ftrace_graph_ent		graph_ent;
 };
 
 /* Function return entry */
 struct ftrace_graph_ret_entry {
-	struct trace_entry			ent;
+	struct trace_entry		ent;
 	struct ftrace_graph_ret		ret;
 };
 extern struct tracer boot_tracer;
@@ -309,7 +309,8 @@ extern void __ftrace_bad_type(void);
 enum print_line_t {
 	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
 	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2	/* Relay to other output functions */
+	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
+	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
 };
 
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 88f8d9d..782ec0f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -212,8 +212,8 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
 	return ret;
 }
 
-static bool
-trace_branch_is_leaf(struct trace_iterator *iter,
+static struct ftrace_graph_ret_entry *
+get_return_for_leaf(struct trace_iterator *iter,
 		struct ftrace_graph_ent_entry *curr)
 {
 	struct ring_buffer_iter *ring_iter;
@@ -222,24 +222,33 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 
 	ring_iter = iter->buffer_iter[iter->cpu];
 
-	if (!ring_iter)
-		return false;
-
-	event = ring_buffer_iter_peek(ring_iter, NULL);
+	/* First peek to compare current entry and the next one */
+	if (ring_iter)
+		event = ring_buffer_iter_peek(ring_iter, NULL);
+	else {
+	/* We need to consume the current entry to see the next one */
+		ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+		event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+					NULL);
+	}
 
 	if (!event)
-		return false;
+		return NULL;
 
 	next = ring_buffer_event_data(event);
 
 	if (next->ent.type != TRACE_GRAPH_RET)
-		return false;
+		return NULL;
 
 	if (curr->ent.pid != next->ent.pid ||
 			curr->graph_ent.func != next->ret.func)
-		return false;
+		return NULL;
 
-	return true;
+	/* this is a leaf, now advance the iterator */
+	if (ring_iter)
+		ring_buffer_read(ring_iter, NULL);
+
+	return next;
 }
 
 /* Signal a overhead of time execution to the output */
@@ -376,18 +385,15 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
 /* Case of a leaf function on its call entry */
 static enum print_line_t
 print_graph_entry_leaf(struct trace_iterator *iter,
-		struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
+		struct ftrace_graph_ent_entry *entry,
+		struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
 {
-	struct ftrace_graph_ret_entry *ret_entry;
 	struct ftrace_graph_ret *graph_ret;
-	struct ring_buffer_event *event;
 	struct ftrace_graph_ent *call;
 	unsigned long long duration;
 	int ret;
 	int i;
 
-	event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
-	ret_entry = ring_buffer_event_data(event);
 	graph_ret = &ret_entry->ret;
 	call = &entry->graph_ent;
 	duration = graph_ret->rettime - graph_ret->calltime;
@@ -457,7 +463,11 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return TRACE_TYPE_HANDLED;
+	/*
+	 * we already consumed the current entry to check the next one
+	 * and see if this is a leaf.
+	 */
+	return TRACE_TYPE_NO_CONSUME;
 }
 
 static enum print_line_t
@@ -469,6 +479,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	pid_t *last_entry = iter->private;
 	struct trace_entry *ent = iter->ent;
 	struct ftrace_graph_ent *call = &field->graph_ent;
+	struct ftrace_graph_ret_entry *leaf_ret;
 
 	/* Pid */
 	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
@@ -504,8 +515,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	if (trace_branch_is_leaf(iter, field))
-		return print_graph_entry_leaf(iter, field, s);
+	leaf_ret = get_return_for_leaf(iter, field);
+	if (leaf_ret)
+		return print_graph_entry_leaf(iter, field, leaf_ret, s);
 	else
 		return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
 
-- 
cgit v1.1


From 3c56819b14b00dd449bd776303e61f8532fad09f Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 9 Feb 2009 08:15:56 +0200
Subject: tracing: splice support for tracing_pipe

Added and implemented tracing_pipe_fops->splice_read(). This allows
userspace programs to get tracing data more efficiently.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h |   6 +++
 2 files changed, 142 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5b1e9a9..9e29fdb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -31,6 +31,7 @@
 #include <linux/fs.h>
 #include <linux/kprobes.h>
 #include <linux/writeback.h>
+#include <linux/splice.h>
 
 #include <linux/stacktrace.h>
 #include <linux/ring_buffer.h>
@@ -364,6 +365,25 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	return cnt;
 }
 
+ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
+{
+	int len;
+	void *ret;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = memcpy(buf, s->buffer + s->readpos, cnt);
+	if (!ret)
+		return -EFAULT;
+
+	s->readpos += len;
+	return cnt;
+}
+
 static void
 trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
@@ -2493,6 +2513,121 @@ out:
 	return sret;
 }
 
+static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	__free_page(buf->page);
+}
+
+static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
+				     unsigned int idx)
+{
+	__free_page(spd->pages[idx]);
+}
+
+static struct pipe_buf_operations tracing_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = tracing_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t tracing_splice_read_pipe(struct file *filp,
+					loff_t *ppos,
+					struct pipe_inode_info *pipe,
+					size_t len,
+					unsigned int flags)
+{
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct trace_iterator *iter = filp->private_data;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.nr_pages = 0, /* This gets updated below. */
+		.flags = flags,
+		.ops = &tracing_pipe_buf_ops,
+		.spd_release = tracing_spd_release_pipe,
+	};
+	ssize_t ret;
+	size_t count, rem;
+	unsigned int i;
+
+	mutex_lock(&trace_types_lock);
+
+	if (iter->trace->splice_read) {
+		ret = iter->trace->splice_read(iter, filp,
+					       ppos, pipe, len, flags);
+		if (ret)
+			goto out;
+	}
+
+	ret = tracing_wait_pipe(filp);
+	if (ret <= 0)
+		goto out;
+
+	if (!iter->ent && !find_next_entry_inc(iter)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Fill as many pages as possible. */
+	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+
+		/* Seq buffer is page-sized, exactly what we need. */
+		for (;;) {
+			count = iter->seq.len;
+			ret = print_trace_line(iter);
+			count = iter->seq.len - count;
+			if (rem < count) {
+				rem = 0;
+				iter->seq.len -= count;
+				break;
+			}
+			if (ret == TRACE_TYPE_PARTIAL_LINE) {
+				iter->seq.len -= count;
+				break;
+			}
+
+			trace_consume(iter);
+			rem -= count;
+			if (!find_next_entry_inc(iter))	{
+				rem = 0;
+				iter->ent = NULL;
+				break;
+			}
+		}
+
+		/* Copy the data into the page, so we can start over. */
+		ret = trace_seq_to_buffer(&iter->seq,
+					  page_address(pages[i]),
+					  iter->seq.len);
+		if (ret < 0) {
+			__free_page(pages[i]);
+			break;
+		}
+		partial[i].offset = 0;
+		partial[i].len = iter->seq.len;
+
+		trace_seq_reset(&iter->seq);
+	}
+
+	mutex_unlock(&trace_types_lock);
+
+	spd.nr_pages = i;
+
+	return splice_to_pipe(pipe, &spd);
+
+out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
 static ssize_t
 tracing_entries_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
@@ -2656,6 +2791,7 @@ static struct file_operations tracing_pipe_fops = {
 	.open		= tracing_open_pipe,
 	.poll		= tracing_poll_pipe,
 	.read		= tracing_read_pipe,
+	.splice_read	= tracing_splice_read_pipe,
 	.release	= tracing_release_pipe,
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7b0518a..dbff020 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -353,6 +353,12 @@ struct tracer {
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
 					size_t cnt, loff_t *ppos);
+	ssize_t			(*splice_read)(struct trace_iterator *iter,
+					       struct file *filp,
+					       loff_t *ppos,
+					       struct pipe_inode_info *pipe,
+					       size_t len,
+					       unsigned int flags);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	int			(*selftest)(struct tracer *trace,
 					    struct trace_array *tr);
-- 
cgit v1.1


From ff98781bab2735e6c89793034173e0cb5007a7e5 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 9 Feb 2009 08:15:55 +0200
Subject: tracing: Move pipe waiting code out of tracing_read_pipe().

This moves the pipe waiting code from tracing_read_pipe() into
tracing_wait_pipe(), which is useful to implement other fops, like
splice_read.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 69 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9e29fdb..11fde0a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2388,37 +2388,15 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	}
 }
 
-/*
- * Consumer reader.
- */
-static ssize_t
-tracing_read_pipe(struct file *filp, char __user *ubuf,
-		  size_t cnt, loff_t *ppos)
+/* Must be called with trace_types_lock mutex held. */
+static int tracing_wait_pipe(struct file *filp)
 {
 	struct trace_iterator *iter = filp->private_data;
-	ssize_t sret;
-
-	/* return any leftover data */
-	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
-	if (sret != -EBUSY)
-		return sret;
-
-	trace_seq_reset(&iter->seq);
 
-	mutex_lock(&trace_types_lock);
-	if (iter->trace->read) {
-		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
-		if (sret)
-			goto out;
-	}
-
-waitagain:
-	sret = 0;
 	while (trace_empty(iter)) {
 
 		if ((filp->f_flags & O_NONBLOCK)) {
-			sret = -EAGAIN;
-			goto out;
+			return -EAGAIN;
 		}
 
 		/*
@@ -2443,12 +2421,11 @@ waitagain:
 		iter->tr->waiter = NULL;
 
 		if (signal_pending(current)) {
-			sret = -EINTR;
-			goto out;
+			return -EINTR;
 		}
 
 		if (iter->trace != current_trace)
-			goto out;
+			return 0;
 
 		/*
 		 * We block until we read something and tracing is disabled.
@@ -2465,9 +2442,43 @@ waitagain:
 		continue;
 	}
 
+	return 1;
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	ssize_t sret;
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+
+	trace_seq_reset(&iter->seq);
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
+			goto out;
+	}
+
+waitagain:
+	sret = tracing_wait_pipe(filp);
+	if (sret <= 0)
+		goto out;
+
 	/* stop when tracing is finished */
-	if (trace_empty(iter))
+	if (trace_empty(iter)) {
+		sret = 0;
 		goto out;
+	}
 
 	if (cnt >= PAGE_SIZE)
 		cnt = PAGE_SIZE - 1;
-- 
cgit v1.1


From 34cd4998d38f9bd04f34b78a7cb0c7f1bee00bd9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Feb 2009 12:06:29 -0500
Subject: tracing: clean up splice code

Ingo Molnar suggested a series of clean ups for the splice code.
This patch implements those suggestions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 96 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 11fde0a..d898212 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2537,15 +2537,49 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
 }
 
 static struct pipe_buf_operations tracing_pipe_buf_ops = {
-	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
-	.confirm = generic_pipe_buf_confirm,
-	.release = tracing_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
-	.get = generic_pipe_buf_get,
+	.can_merge		= 0,
+	.map			= generic_pipe_buf_map,
+	.unmap			= generic_pipe_buf_unmap,
+	.confirm		= generic_pipe_buf_confirm,
+	.release		= tracing_pipe_buf_release,
+	.steal			= generic_pipe_buf_steal,
+	.get			= generic_pipe_buf_get,
 };
 
+static size_t
+tracing_fill_pipe_page(struct page *pages, size_t rem,
+			struct trace_iterator *iter)
+{
+	size_t count;
+	int ret;
+
+	/* Seq buffer is page-sized, exactly what we need. */
+	for (;;) {
+		count = iter->seq.len;
+		ret = print_trace_line(iter);
+		count = iter->seq.len - count;
+		if (rem < count) {
+			rem = 0;
+			iter->seq.len -= count;
+			break;
+		}
+		if (ret == TRACE_TYPE_PARTIAL_LINE) {
+			iter->seq.len -= count;
+			break;
+		}
+
+		trace_consume(iter);
+		rem -= count;
+		if (!find_next_entry_inc(iter))	{
+			rem = 0;
+			iter->ent = NULL;
+			break;
+		}
+	}
+
+	return rem;
+}
+
 static ssize_t tracing_splice_read_pipe(struct file *filp,
 					loff_t *ppos,
 					struct pipe_inode_info *pipe,
@@ -2556,15 +2590,15 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	struct partial_page partial[PIPE_BUFFERS];
 	struct trace_iterator *iter = filp->private_data;
 	struct splice_pipe_desc spd = {
-		.pages = pages,
-		.partial = partial,
-		.nr_pages = 0, /* This gets updated below. */
-		.flags = flags,
-		.ops = &tracing_pipe_buf_ops,
-		.spd_release = tracing_spd_release_pipe,
+		.pages		= pages,
+		.partial	= partial,
+		.nr_pages	= 0, /* This gets updated below. */
+		.flags		= flags,
+		.ops		= &tracing_pipe_buf_ops,
+		.spd_release	= tracing_spd_release_pipe,
 	};
 	ssize_t ret;
-	size_t count, rem;
+	size_t rem;
 	unsigned int i;
 
 	mutex_lock(&trace_types_lock);
@@ -2573,45 +2607,25 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		ret = iter->trace->splice_read(iter, filp,
 					       ppos, pipe, len, flags);
 		if (ret)
-			goto out;
+			goto out_err;
 	}
 
 	ret = tracing_wait_pipe(filp);
 	if (ret <= 0)
-		goto out;
+		goto out_err;
 
 	if (!iter->ent && !find_next_entry_inc(iter)) {
 		ret = -EFAULT;
-		goto out;
+		goto out_err;
 	}
 
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			break;
 
-		/* Seq buffer is page-sized, exactly what we need. */
-		for (;;) {
-			count = iter->seq.len;
-			ret = print_trace_line(iter);
-			count = iter->seq.len - count;
-			if (rem < count) {
-				rem = 0;
-				iter->seq.len -= count;
-				break;
-			}
-			if (ret == TRACE_TYPE_PARTIAL_LINE) {
-				iter->seq.len -= count;
-				break;
-			}
-
-			trace_consume(iter);
-			rem -= count;
-			if (!find_next_entry_inc(iter))	{
-				rem = 0;
-				iter->ent = NULL;
-				break;
-			}
-		}
+		rem = tracing_fill_pipe_page(pages[i], rem, iter);
 
 		/* Copy the data into the page, so we can start over. */
 		ret = trace_seq_to_buffer(&iter->seq,
@@ -2633,7 +2647,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 
 	return splice_to_pipe(pipe, &spd);
 
-out:
+out_err:
 	mutex_unlock(&trace_types_lock);
 
 	return ret;
-- 
cgit v1.1


From b85fa01ed958ca59523a2db3c2ee647b98745d6a Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 9 Feb 2009 14:21:14 +0800
Subject: ring_buffer: fix typing mistake

Impact: Fix bug

I found several very very curious line.
It's so curious that it may be brought by typing mistake.

When (cpu_buffer->reader_page == cpu_buffer->commit_page):

1) We haven't copied it for bpage is changed:
   bpage = cpu_buffer->reader_page->page;
   memcpy(bpage->data, cpu_buffer->reader_page->page->data + read ... )
2) We need update cpu_buffer->reader_page->read, but
   "cpu_buffer->reader_page += read;" is not right.

[
  This bug was a typo. The commit->reader_page is a page pointer
  and not an index into the page. The line should have been
  commit->reader_page->read += read.  The other changes
  by Lai are nice clean ups to the code.  - SDR
]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 53ba3a6..eca2827 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2406,7 +2406,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  * to swap with a page in the ring buffer.
  *
  * for example:
- *	rpage = ring_buffer_alloc_page(buffer);
+ *	rpage = ring_buffer_alloc_read_page(buffer);
  *	if (!rpage)
  *		return error;
  *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
@@ -2461,18 +2461,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	 */
 	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
 		unsigned int read = cpu_buffer->reader_page->read;
+		unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
 
 		if (full)
 			goto out;
 		/* The writer is still on the reader page, we must copy */
-		bpage = cpu_buffer->reader_page->page;
 		memcpy(bpage->data,
 		       cpu_buffer->reader_page->page->data + read,
-		       local_read(&bpage->commit) - read);
+		       commit - read);
 
 		/* consume what was read */
-		cpu_buffer->reader_page += read;
-
+		cpu_buffer->reader_page->read = commit;
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
-- 
cgit v1.1


From 667d24125839b6f3363d8177d7ed9fab8a40e45f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 9 Feb 2009 14:21:17 +0800
Subject: ring_buffer: fix ring_buffer_read_page()

Impact: change API and init bpage when copy

ring_buffer_read_page()/rb_remove_entries() may be called for
a partially consumed page.

Add a parameter for rb_remove_entries() and make it update
cpu_buffer->entries correctly for partially consumed pages.

ring_buffer_read_page() now returns the offset to the next event.

Init the bpage's time_stamp when return value is 0.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index eca2827..10d202e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2332,13 +2332,14 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
 static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-			      struct buffer_data_page *bpage)
+			      struct buffer_data_page *bpage,
+			      unsigned int offset)
 {
 	struct ring_buffer_event *event;
 	unsigned long head;
 
 	__raw_spin_lock(&cpu_buffer->lock);
-	for (head = 0; head < local_read(&bpage->commit);
+	for (head = offset; head < local_read(&bpage->commit);
 	     head += rb_event_length(event)) {
 
 		event = __rb_data_page_index(bpage, head);
@@ -2410,8 +2411,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *	if (!rpage)
  *		return error;
  *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
- *	if (ret)
- *		process_page(rpage);
+ *	if (ret >= 0)
+ *		process_page(rpage, ret);
  *
  * When @full is set, the function will not return true unless
  * the writer is off the reader page.
@@ -2422,8 +2423,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *  responsible for that.
  *
  * Returns:
- *  1 if data has been transferred
- *  0 if no data has been transferred.
+ *  >=0 if data has been transferred, returns the offset of consumed data.
+ *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct ring_buffer *buffer,
 			    void **data_page, int cpu, int full)
@@ -2432,7 +2433,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct buffer_data_page *bpage;
 	unsigned long flags;
-	int ret = 0;
+	unsigned int read;
+	int ret = -1;
 
 	if (!data_page)
 		return 0;
@@ -2454,24 +2456,29 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	/* check for data */
 	if (!local_read(&cpu_buffer->reader_page->page->commit))
 		goto out;
+
+	read = cpu_buffer->reader_page->read;
 	/*
 	 * If the writer is already off of the read page, then simply
 	 * switch the read page with the given page. Otherwise
 	 * we need to copy the data from the reader to the writer.
 	 */
 	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
-		unsigned int read = cpu_buffer->reader_page->read;
 		unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
+		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
 
 		if (full)
 			goto out;
 		/* The writer is still on the reader page, we must copy */
-		memcpy(bpage->data,
-		       cpu_buffer->reader_page->page->data + read,
-		       commit - read);
+		memcpy(bpage->data + read, rpage->data + read, commit - read);
 
 		/* consume what was read */
 		cpu_buffer->reader_page->read = commit;
+
+		/* update bpage */
+		local_set(&bpage->commit, commit);
+		if (!read)
+			bpage->time_stamp = rpage->time_stamp;
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
@@ -2480,10 +2487,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		cpu_buffer->reader_page->read = 0;
 		*data_page = bpage;
 	}
-	ret = 1;
+	ret = read;
 
 	/* update the entry counter */
-	rb_remove_entries(cpu_buffer, bpage);
+	rb_remove_entries(cpu_buffer, bpage, read);
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
cgit v1.1


From 4543ae7ce1cb8b5ff27a59009e7991ea63791a71 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Mon, 9 Feb 2009 23:09:32 +0100
Subject: tracing: storage class should be before const qualifier

The C99 specification states in section 6.11.5:

The placement of a storage-class specifier other than at the beginning
of the declaration specifiers in a declaration is an obsolescent
feature.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_sysprof.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 84ca9d8..4e2de4d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 	}
 }
 
-const static struct stacktrace_ops backtrace_ops = {
+static const struct stacktrace_ops backtrace_ops = {
 	.warning		= backtrace_warning,
 	.warning_symbol		= backtrace_warning_symbol,
 	.stack			= backtrace_stack,
-- 
cgit v1.1


From f54fc98aa656f334c1571df6e3ca9178ea223847 Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 10 Feb 2009 01:02:46 -0500
Subject: tracing: remove unneeded variable

Impact: clean up.

Remove the unnecessary variable ret.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_branch.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index f8ae2c5..c2e68d4 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -91,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 
 int enable_branch_tracing(struct trace_array *tr)
 {
-	int ret = 0;
-
 	mutex_lock(&branch_tracing_mutex);
 	branch_tracer = tr;
 	/*
@@ -103,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
 	branch_tracing_enabled++;
 	mutex_unlock(&branch_tracing_mutex);
 
-	return ret;
+	return 0;
 }
 
 void disable_branch_tracing(void)
-- 
cgit v1.1


From 810dc73265cd690b2bc6010489b4317bba2cda39 Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 10 Feb 2009 01:03:05 -0500
Subject: tracing: provide correct return value after outputting the event

This patch is to make the function return early on failure, and give
correct return value on success.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 782ec0f..519a0ca 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -186,30 +186,30 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
 	ret = trace_seq_printf(s,
 		" ------------------------------------------\n");
 	if (!ret)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = print_graph_cpu(s, cpu);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = print_graph_proc(s, prev_pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = trace_seq_printf(s, " => ");
 	if (!ret)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = print_graph_proc(s, pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = trace_seq_printf(s,
 		"\n ------------------------------------------\n\n");
 	if (!ret)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
-	return ret;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct ftrace_graph_ret_entry *
-- 
cgit v1.1


From c3706f005c3aaf570e71f0f083fdbb59a5a9fa2e Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 10 Feb 2009 01:03:18 -0500
Subject: tracing: fix typos in comments

Impact: clean up.

Fix typos in the comments.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c       | 8 ++++----
 kernel/trace/trace.c             | 2 +-
 kernel/trace/trace_hw_branches.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10d202e..fa64e1f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(tracing_off);
  * tracing_off_permanent - permanently disable ring buffers
  *
  * This function, once called, will disable all ring buffers
- * permanenty.
+ * permanently.
  */
 void tracing_off_permanent(void)
 {
@@ -210,7 +210,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
-	local_t		 commit;	/* write commited index */
+	local_t		 commit;	/* write committed index */
 	unsigned char	 data[];	/* data of buffer page */
 };
 
@@ -260,7 +260,7 @@ struct ring_buffer_per_cpu {
 	struct list_head		pages;
 	struct buffer_page		*head_page;	/* read from head */
 	struct buffer_page		*tail_page;	/* write to tail */
-	struct buffer_page		*commit_page;	/* commited pages */
+	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
 	unsigned long			overrun;
 	unsigned long			entries;
@@ -303,7 +303,7 @@ struct ring_buffer_iter {
  * check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
  *
- * As a safty measure we check to make sure the data pages have not
+ * As a safety measure we check to make sure the data pages have not
  * been corrupted.
  */
 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d898212..d7c175a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1963,7 +1963,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	struct tracer_opt *trace_opts = current_trace->flags->opts;
 
 
-	/* calulate max size */
+	/* calculate max size */
 	for (i = 0; trace_options[i]; i++) {
 		len += strlen(trace_options[i]);
 		len += 3; /* "no" and space */
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e3e7db6..0794dd3 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -75,7 +75,7 @@ static void bts_trace_start(struct trace_array *tr)
 }
 
 /*
- * Start tracing on the current cpu.
+ * Stop tracing on the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_mutex must be locked.
-- 
cgit v1.1


From 4fd2735881bf4d8bf5e30979f31fc2f1b1d505fa Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Feb 2009 19:44:12 +0100
Subject: tracing: fix sparse warnings: make symbols static

Impact: make global variables and a global function static

The function '__trace_userstack' does not seem to have a caller, so it
is commented out.

Fix this sparse warnings:
  kernel/trace/trace.c:82:5: warning: symbol 'tracing_disabled' was not declared. Should it be static?
  kernel/trace/trace.c:600:10: warning: symbol 'trace_record_cmdline_disabled' was not declared. Should it be static?
  kernel/trace/trace.c:957:6: warning: symbol '__trace_userstack' was not declared. Should it be static?
  kernel/trace/trace.c:1694:5: warning: symbol 'tracing_release' was not declared. Should it be static?

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7c175a..c157ba7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -80,7 +80,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
  * of the tracer is successful. But that is the only place that sets
  * this back to zero.
  */
-int tracing_disabled = 1;
+static int tracing_disabled = 1;
 
 static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
 
@@ -626,7 +626,7 @@ static int cmdline_idx;
 static DEFINE_SPINLOCK(trace_cmdline_lock);
 
 /* temporary disable recording */
-atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_cmdline_disabled __read_mostly;
 
 static void trace_init_cmdlines(void)
 {
@@ -983,10 +983,12 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 #endif
 }
 
-void __trace_userstack(struct trace_array *tr, unsigned long flags)
+#ifdef UNUSED
+static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 {
 	ftrace_trace_userstack(tr, flags, preempt_count());
 }
+#endif /* UNUSED */
 
 static void
 ftrace_trace_special(void *__tr,
@@ -1720,7 +1722,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int tracing_release(struct inode *inode, struct file *file)
+static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
 	struct trace_iterator *iter = m->private;
-- 
cgit v1.1


From 5e39841c45cf5e6ea930ede1b0303309e03037a2 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Feb 2009 19:44:34 +0100
Subject: tracing: fix sparse warnings: fix (un-)signedness

Fix these sparse warnings:

  kernel/trace/ring_buffer.c:70:37: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:84:39: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:96:43: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2500:40: warning: incorrect type in argument 3 (different signedness)
  kernel/trace/ring_buffer.c:2505:44: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2507:46: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/trace.c:2130:40: warning: incorrect type in argument 3 (different signedness)
  kernel/trace/trace.c:2280:40: warning: incorrect type in argument 3 (different signedness)

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 kernel/trace/trace.c       | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fa64e1f..dc18b5b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -59,7 +59,7 @@ enum {
 	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT,
 };
 
-static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
+static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
 /**
  * tracing_on - enable all tracing buffers
@@ -2501,7 +2501,7 @@ static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
 {
-	long *p = filp->private_data;
+	unsigned long *p = filp->private_data;
 	char buf[64];
 	int r;
 
@@ -2517,9 +2517,9 @@ static ssize_t
 rb_simple_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
-	long *p = filp->private_data;
+	unsigned long *p = filp->private_data;
 	char buf[64];
-	long val;
+	unsigned long val;
 	int ret;
 
 	if (cnt >= sizeof(buf))
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c157ba7..e243499 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2147,7 +2147,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 {
 	struct trace_array *tr = filp->private_data;
 	char buf[64];
-	long val;
+	unsigned long val;
 	int ret;
 
 	if (cnt >= sizeof(buf))
@@ -2295,9 +2295,9 @@ static ssize_t
 tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 		      size_t cnt, loff_t *ppos)
 {
-	long *ptr = filp->private_data;
+	unsigned long *ptr = filp->private_data;
 	char buf[64];
-	long val;
+	unsigned long val;
 	int ret;
 
 	if (cnt >= sizeof(buf))
-- 
cgit v1.1


From e7669b8e329255bbcb40af65b38e342825d97a46 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Feb 2009 19:44:45 +0100
Subject: tracing: fix sparse warning: attribute function with
 __acquires/__releases

Fix this sparse warning:

  kernel/trace/trace.c:458:9: warning: context imbalance in 'register_tracer' - unexpected unlock

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e243499..95f99a7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -459,6 +459,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
  * Register a new plugin tracer.
  */
 int register_tracer(struct tracer *type)
+__releases(kernel_lock)
+__acquires(kernel_lock)
 {
 	struct tracer *t;
 	int len;
-- 
cgit v1.1


From cf2592f59c0e8ed4308adbdb2e0a88655379d579 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 10 Feb 2009 16:52:37 +0100
Subject: softlockup: ensure the task has been switched out once

When we check if a task has been switched out since the last scan, we might
have a race condition on the following scenario:

- the task is freshly created and scheduled

- it puts its state to TASK_UNINTERRUPTIBLE and is not yet switched out

- check_hung_task() scans this task and will report a false positive because
  t->nvcsw + t->nivcsw == t->last_switch_count == 0

Add a check for such cases.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c924de..022a492 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -72,7 +72,13 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
 
-	if (t->flags & PF_FROZEN)
+	/*
+	 * Ensure the task is not frozen.
+	 * Also, when a freshly created task is scheduled once, changes
+	 * its state to TASK_UNINTERRUPTIBLE without having ever been
+	 * switched out once, it musn't be checked.
+	 */
+	if (unlikely(t->flags & PF_FROZEN || !switch_count))
 		return;
 
 	if (switch_count != t->last_switch_count) {
-- 
cgit v1.1


From b22f4858126a6aa852ad745b94f6b25dbdea708e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 10 Feb 2009 15:49:11 +0100
Subject: tracing/sysprof: add missing tracing_{start,stop}_record_cmdline()

Add the missing pair tracing_{start,stop}_record_cmdline() to record well
the cmdline associated with pid.

Changes in v2:

- fix a build error, the sched_switch tracer is needed to record the
  cmdline.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig         | 1 +
 kernel/trace/trace_sysprof.c | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3a33128..6ff928a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -134,6 +134,7 @@ config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	depends on X86
 	select TRACING
+	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
 	  tool.
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 84ca9d8..9902c15 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -238,6 +238,8 @@ static int stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
+	tracing_start_cmdline_record();
+
 	mutex_lock(&sample_timer_lock);
 	start_stack_timers();
 	tracer_enabled = 1;
@@ -247,6 +249,7 @@ static int stack_trace_init(struct trace_array *tr)
 
 static void stack_trace_reset(struct trace_array *tr)
 {
+	tracing_stop_cmdline_record();
 	stop_stack_trace(tr);
 }
 
-- 
cgit v1.1


From 00f62f614bb713027b9296068d1879fbca511eb7 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 9 Feb 2009 17:04:06 -0200
Subject: ring_buffer: pahole struct ring_buffer

While fixing some bugs in pahole (built-in.o files were not being
processed due to relocation problems) I found out about these packable
structures:

$ pahole --packable kernel/trace/ring_buffer.o  | grep ring
ring_buffer	72	64	8
ring_buffer_per_cpu	112	104	8

If we take a look at the current layout of struct ring_buffer we can see
that we have two 4 bytes holes.

$ pahole -C ring_buffer kernel/trace/ring_buffer.o
struct ring_buffer {
	unsigned int               pages;           /*     0     4 */
	unsigned int               flags;           /*     4     4 */
	int                        cpus;            /*     8     4 */

	/* XXX 4 bytes hole, try to pack */

	cpumask_var_t              cpumask;         /*    16     8 */
	atomic_t                   record_disabled; /*    24     4 */

	/* XXX 4 bytes hole, try to pack */

	struct mutex               mutex;           /*    32    32 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct ring_buffer_per_cpu * * buffers;     /*    64     8 */

	/* size: 72, cachelines: 2, members: 7 */
	/* sum members: 64, holes: 2, sum holes: 8 */
	/* last cacheline: 8 bytes */
};

So, if I ask pahole to reorganize it:

$ pahole -C ring_buffer --reorganize kernel/trace/ring_buffer.o

struct ring_buffer {
	unsigned int               pages;           /*     0     4 */
	unsigned int               flags;           /*     4     4 */
	int                        cpus;            /*     8     4 */
	atomic_t                   record_disabled; /*    12     4 */
	cpumask_var_t              cpumask;         /*    16     8 */
	struct mutex               mutex;           /*    24    32 */
	struct ring_buffer_per_cpu * * buffers;     /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */

	/* size: 64, cachelines: 1, members: 7 */
};   /* saved 8 bytes and 1 cacheline! */

We get it using just one 64 bytes cacheline.

To see what it did:

$ pahole -C ring_buffer --reorganize --show_reorg_steps \
	kernel/trace/ring_buffer.o | grep \/
/* Moving 'record_disabled' from after 'cpumask' to after 'cpus' */

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 53ba3a6..27ef3bf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -273,8 +273,8 @@ struct ring_buffer {
 	unsigned			pages;
 	unsigned			flags;
 	int				cpus;
-	cpumask_var_t			cpumask;
 	atomic_t			record_disabled;
+	cpumask_var_t			cpumask;
 
 	struct mutex			mutex;
 
-- 
cgit v1.1


From 9f8d979f082a3ee1b27f32b7e0811b51c3ad1d15 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 12 Feb 2009 13:10:17 +0100
Subject: softlockup: move 'one' to the softlockup section in sysctl.c

CONFIG_SOFTLOCKUP=y || CONFIG_DETECT_HUNG_TASKS=y is now the only user
of the 'one' constant in kernel/sysctl.c. Move it to the softlockup
block of constants.

This fixes a GCC warning.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b6b54c..6d2aeff 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -90,6 +90,9 @@ extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
 /* Constants used for minimum and  maximum */
+#if defined(CONFIG_DETECT_HUNG_TASK) || defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
+static int one = 1;
+#endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
 static int neg_one = -1;
@@ -100,7 +103,6 @@ static int two = 2;
 #endif
 
 static int zero;
-static int one = 1;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
-- 
cgit v1.1


From 45141d4667d208421ca787a3301542b6a5e0b112 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Feb 2009 13:19:48 -0500
Subject: ring-buffer: rename label out_unlock to out_reset

Impact: clean up

While reviewing the ring buffer code, I thougth I saw a bug with

	if (!__raw_spin_trylock(&cpu_buffer->lock))
		goto out_unlock;

But I forgot that we use a variable "lock_taken" that is set if
the spinlock is taken, and only unlock it if that variable is set.

To avoid further confusion from other reviewers, this patch
renames the label out_unlock with out_reset, which is the more
appropriate name.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc18b5b..f39d7e9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1017,7 +1017,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (unlikely(in_nmi())) {
 			if (!__raw_spin_trylock(&cpu_buffer->lock))
-				goto out_unlock;
+				goto out_reset;
 		} else
 			__raw_spin_lock(&cpu_buffer->lock);
 
@@ -1030,7 +1030,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 		/* we grabbed the lock before incrementing */
 		if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-			goto out_unlock;
+			goto out_reset;
 
 		/*
 		 * If for some reason, we had an interrupt storm that made
@@ -1039,12 +1039,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (unlikely(next_page == commit_page)) {
 			WARN_ON_ONCE(1);
-			goto out_unlock;
+			goto out_reset;
 		}
 
 		if (next_page == head_page) {
 			if (!(buffer->flags & RB_FL_OVERWRITE))
-				goto out_unlock;
+				goto out_reset;
 
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
@@ -1118,7 +1118,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	return event;
 
- out_unlock:
+ out_reset:
 	/* reset write */
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
-- 
cgit v1.1


From b5f9fd0f8a05c9bafb91a9a85b9110938d8e585b Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Wed, 11 Feb 2009 13:57:25 -0500
Subject: tracing: convert c/p state power tracer to use tracepoints

Convert the c/p state "power" tracer to use tracepoints. Avoids a
function call when the tracer is disabled.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_power.c | 173 ++++++++++++++++++++++++++-------------------
 1 file changed, 102 insertions(+), 71 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index b1d0d08..91ce672 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -21,15 +21,116 @@
 static struct trace_array *power_trace;
 static int __read_mostly trace_power_enabled;
 
+static void probe_power_start(struct power_trace *it, unsigned int type,
+				unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+
+
+static void probe_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->state_data = *it;
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+ out:
+	preempt_enable();
+}
+
+static void probe_power_mark(struct power_trace *it, unsigned int type,
+				unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->state_data = *it;
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+ out:
+	preempt_enable();
+}
+
+static int tracing_power_register(void)
+{
+	int ret;
+
+	ret = register_trace_power_start(probe_power_start);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_start\n");
+		return ret;
+	}
+	ret = register_trace_power_end(probe_power_end);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_end\n");
+		goto fail_start;
+	}
+	ret = register_trace_power_mark(probe_power_mark);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_mark\n");
+		goto fail_end;
+	}
+	return ret;
+fail_end:
+	unregister_trace_power_end(probe_power_end);
+fail_start:
+	unregister_trace_power_start(probe_power_start);
+	return ret;
+}
 
 static void start_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 1;
+	tracing_power_register();
 }
 
 static void stop_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 0;
+	unregister_trace_power_start(probe_power_start);
+	unregister_trace_power_end(probe_power_end);
+	unregister_trace_power_mark(probe_power_mark);
 }
 
 
@@ -39,6 +140,7 @@ static int power_trace_init(struct trace_array *tr)
 	power_trace = tr;
 
 	trace_power_enabled = 1;
+	tracing_power_register();
 
 	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
@@ -95,74 +197,3 @@ static int init_power_trace(void)
 	return register_tracer(&power_tracer);
 }
 device_initcall(init_power_trace);
-
-void trace_power_start(struct power_trace *it, unsigned int type,
-			 unsigned int level)
-{
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-}
-EXPORT_SYMBOL_GPL(trace_power_start);
-
-
-void trace_power_end(struct power_trace *it)
-{
-	struct ring_buffer_event *event;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	preempt_disable();
-	it->end = ktime_get();
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_end);
-
-void trace_power_mark(struct power_trace *it, unsigned int type,
-			 unsigned int level)
-{
-	struct ring_buffer_event *event;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-	preempt_disable();
-	it->end = it->stamp;
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_mark);
-- 
cgit v1.1


From 6f2b9b9a9d750a9175dc79c74bfed5add840983c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 29 Jan 2009 16:03:20 +0100
Subject: timer: implement lockdep deadlock detection

This modifies the timer code in a way to allow lockdep to detect
deadlocks resulting from a lock being taken in the timer function
as well as around the del_timer_sync() call.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 kernel/timer.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 57 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 13dd64f..ef1c385 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)
 	debug_object_free(timer, &timer_debug_descr);
 }
 
-static void __init_timer(struct timer_list *timer);
+static void __init_timer(struct timer_list *timer,
+			 const char *name,
+			 struct lock_class_key *key);
 
-void init_timer_on_stack(struct timer_list *timer)
+void init_timer_on_stack_key(struct timer_list *timer,
+			     const char *name,
+			     struct lock_class_key *key)
 {
 	debug_object_init_on_stack(timer, &timer_debug_descr);
-	__init_timer(timer);
+	__init_timer(timer, name, key);
 }
-EXPORT_SYMBOL_GPL(init_timer_on_stack);
+EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 
 void destroy_timer_on_stack(struct timer_list *timer)
 {
@@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
 static inline void debug_timer_deactivate(struct timer_list *timer) { }
 #endif
 
-static void __init_timer(struct timer_list *timer)
+static void __init_timer(struct timer_list *timer,
+			 const char *name,
+			 struct lock_class_key *key)
 {
 	timer->entry.next = NULL;
 	timer->base = __raw_get_cpu_var(tvec_bases);
@@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)
 	timer->start_pid = -1;
 	memset(timer->start_comm, 0, TASK_COMM_LEN);
 #endif
+	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
 
 /**
@@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)
  * init_timer() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
-void init_timer(struct timer_list *timer)
+void init_timer_key(struct timer_list *timer,
+		    const char *name,
+		    struct lock_class_key *key)
 {
 	debug_timer_init(timer);
-	__init_timer(timer);
+	__init_timer(timer, name, key);
 }
-EXPORT_SYMBOL(init_timer);
+EXPORT_SYMBOL(init_timer_key);
 
-void init_timer_deferrable(struct timer_list *timer)
+void init_timer_deferrable_key(struct timer_list *timer,
+			       const char *name,
+			       struct lock_class_key *key)
 {
-	init_timer(timer);
+	init_timer_key(timer, name, key);
 	timer_set_deferrable(timer);
 }
-EXPORT_SYMBOL(init_timer_deferrable);
+EXPORT_SYMBOL(init_timer_deferrable_key);
 
 static inline void detach_timer(struct timer_list *timer,
 				int clear_pending)
@@ -789,6 +800,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  */
 int del_timer_sync(struct timer_list *timer)
 {
+#ifdef CONFIG_LOCKDEP
+	unsigned long flags;
+
+	local_irq_save(flags);
+	lock_map_acquire(&timer->lockdep_map);
+	lock_map_release(&timer->lockdep_map);
+	local_irq_restore(flags);
+#endif
+
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -861,10 +881,36 @@ static inline void __run_timers(struct tvec_base *base)
 
 			set_running_timer(base, timer);
 			detach_timer(timer, 1);
+
 			spin_unlock_irq(&base->lock);
 			{
 				int preempt_count = preempt_count();
+
+#ifdef CONFIG_LOCKDEP
+				/*
+				 * It is permissible to free the timer from
+				 * inside the function that is called from
+				 * it, this we need to take into account for
+				 * lockdep too. To avoid bogus "held lock
+				 * freed" warnings as well as problems when
+				 * looking into timer->lockdep_map, make a
+				 * copy and use that here.
+				 */
+				struct lockdep_map lockdep_map =
+					timer->lockdep_map;
+#endif
+				/*
+				 * Couple the lock chain with the lock chain at
+				 * del_timer_sync() by acquiring the lock_map
+				 * around the fn() call here and in
+				 * del_timer_sync().
+				 */
+				lock_map_acquire(&lockdep_map);
+
 				fn(data);
+
+				lock_map_release(&lockdep_map);
+
 				if (preempt_count != preempt_count()) {
 					printk(KERN_ERR "huh, entered %p "
 					       "with preempt_count %08x, exited"
-- 
cgit v1.1


From cf40bd16fdad42c053040bcd3988f5fdedbb6c57 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 21 Jan 2009 08:12:39 +0100
Subject: lockdep: annotate reclaim context (__GFP_NOFS)

Here is another version, with the incremental patch rolled up, and
added reclaim context annotation to kswapd, and allocation tracing
to slab allocators (which may only ever reach the page allocator
in rare cases, so it is good to put annotations here too).

Haven't tested this version as such, but it should be getting closer
to merge worthy ;)

--
After noticing some code in mm/filemap.c accidentally perform a __GFP_FS
allocation when it should not have been, I thought it might be a good idea to
try to catch this kind of thing with lockdep.

I coded up a little idea that seems to work. Unfortunately the system has to
actually be in __GFP_FS page reclaim, then take the lock, before it will mark
it. But at least that might still be some orders of magnitude more common
(and more debuggable) than an actual deadlock condition, so we have some
improvement I hope (the concept is no less complete than discovery of a lock's
interrupt contexts).

I guess we could even do the same thing with __GFP_IO (normal reclaim), and
even GFP_NOIO locks too... but filesystems will have the most locks and fiddly
code paths, so let's start there and see how it goes.

It *seems* to work. I did a quick test.

=================================
[ INFO: inconsistent lock state ]
2.6.28-rc6-00007-ged31348-dirty #26
---------------------------------
inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage.
modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd]
{in-reclaim-W} state was registered at:
  [<ffffffff80267bdb>] __lock_acquire+0x75b/0x1a60
  [<ffffffff80268f71>] lock_acquire+0x91/0xc0
  [<ffffffff8070f0e1>] mutex_lock_nested+0xb1/0x310
  [<ffffffffa002002b>] brd_init+0x2b/0x216 [brd]
  [<ffffffff8020903b>] _stext+0x3b/0x170
  [<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0
  [<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b
  [<ffffffffffffffff>] 0xffffffffffffffff
irq event stamp: 3929
hardirqs last  enabled at (3929): [<ffffffff8070f2b5>] mutex_lock_nested+0x285/0x310
hardirqs last disabled at (3928): [<ffffffff8070f089>] mutex_lock_nested+0x59/0x310
softirqs last  enabled at (3732): [<ffffffff8061f623>] sk_filter+0x83/0xe0
softirqs last disabled at (3730): [<ffffffff8061f5b6>] sk_filter+0x16/0xe0

other info that might help us debug this:
1 lock held by modprobe/8526:
 #0:  (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd]

stack backtrace:
Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26
Call Trace:
 [<ffffffff80265483>] print_usage_bug+0x193/0x1d0
 [<ffffffff80266530>] mark_lock+0xaf0/0xca0
 [<ffffffff80266735>] mark_held_locks+0x55/0xc0
 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
 [<ffffffff802667ca>] trace_reclaim_fs+0x2a/0x60
 [<ffffffff80285005>] __alloc_pages_internal+0x475/0x580
 [<ffffffff8070f29e>] ? mutex_lock_nested+0x26e/0x310
 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
 [<ffffffffa002006a>] brd_init+0x6a/0x216 [brd]
 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
 [<ffffffff8020903b>] _stext+0x3b/0x170
 [<ffffffff8070f8b9>] ? mutex_unlock+0x9/0x10
 [<ffffffff8070f83d>] ? __mutex_unlock_slowpath+0x10d/0x180
 [<ffffffff802669ec>] ? trace_hardirqs_on_caller+0x12c/0x190
 [<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0
 [<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 229 ++++++++++++++++++++++++++++++++++++++++++---
 kernel/lockdep_internals.h |   3 +-
 kernel/lockdep_proc.c      |   6 +-
 3 files changed, 222 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c35..977f940 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -310,12 +310,14 @@ EXPORT_SYMBOL(lockdep_on);
 #if VERBOSE
 # define HARDIRQ_VERBOSE	1
 # define SOFTIRQ_VERBOSE	1
+# define RECLAIM_VERBOSE	1
 #else
 # define HARDIRQ_VERBOSE	0
 # define SOFTIRQ_VERBOSE	0
+# define RECLAIM_VERBOSE	0
 #endif
 
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
 /*
  * Quick filtering for interesting events:
  */
@@ -454,6 +456,10 @@ static const char *usage_str[] =
 	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R",
 	[LOCK_ENABLED_SOFTIRQS_READ] =	"softirq-on-R",
 	[LOCK_ENABLED_HARDIRQS_READ] =	"hardirq-on-R",
+	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
+	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
+	[LOCK_HELD_OVER_RECLAIM_FS] =	"ov-reclaim-W",
+	[LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R",
 };
 
 const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -462,9 +468,10 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
 }
 
 void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
+					char *c4, char *c5, char *c6)
 {
-	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
+	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.', *c5 = '.', *c6 = '.';
 
 	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
 		*c1 = '+';
@@ -493,14 +500,29 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
 			*c4 = '?';
 	}
+
+	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS)
+		*c5 = '+';
+	else
+		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS)
+			*c5 = '-';
+
+	if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+		*c6 = '-';
+	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) {
+		*c6 = '+';
+		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+			*c6 = '?';
+	}
+
 }
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[KSYM_NAME_LEN], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6;
 	const char *name;
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4);
+	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
 
 	name = class->name;
 	if (!name) {
@@ -513,7 +535,7 @@ static void print_lock_name(struct lock_class *class)
 		if (class->subclass)
 			printk("/%d", class->subclass);
 	}
-	printk("){%c%c%c%c}", c1, c2, c3, c4);
+	printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6);
 }
 
 static void print_lockdep_cache(struct lockdep_map *lock)
@@ -1306,6 +1328,26 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 					LOCK_ENABLED_SOFTIRQS, "soft"))
 		return 0;
 
+	/*
+	 * Prove that the new dependency does not connect a reclaim-fs-safe
+	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
+	 * the backwards-subgraph starting at <prev>, and the
+	 * forwards-subgraph starting at <next>:
+	 */
+	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS,
+					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+		return 0;
+
+	/*
+	 * Prove that the new dependency does not connect a reclaim-fs-safe-read
+	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
+	 * the backwards-subgraph starting at <prev>, and the
+	 * forwards-subgraph starting at <next>:
+	 */
+	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ,
+					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read"))
+		return 0;
+
 	return 1;
 }
 
@@ -1949,6 +1991,14 @@ static int softirq_verbose(struct lock_class *class)
 	return 0;
 }
 
+static int reclaim_verbose(struct lock_class *class)
+{
+#if RECLAIM_VERBOSE
+	return class_filter(class);
+#endif
+	return 0;
+}
+
 #define STRICT_READ_CHECKS	1
 
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
@@ -2007,6 +2057,31 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_USED_IN_RECLAIM_FS:
+		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+			return 0;
+		if (!valid_state(curr, this, new_bit,
+				 LOCK_HELD_OVER_RECLAIM_FS_READ))
+			return 0;
+		/*
+		 * just marked it reclaim-fs-safe, check that this lock
+		 * took no reclaim-fs-unsafe lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this,
+					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+#if STRICT_READ_CHECKS
+		/*
+		 * just marked it reclaim-fs-safe, check that this lock
+		 * took no reclaim-fs-unsafe-read lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this,
+				LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read"))
+			return 0;
+#endif
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	case LOCK_USED_IN_HARDIRQ_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
 			return 0;
@@ -2033,6 +2108,19 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_USED_IN_RECLAIM_FS_READ:
+		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+			return 0;
+		/*
+		 * just marked it reclaim-fs-read-safe, check that this lock
+		 * took no reclaim-fs-unsafe lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this,
+					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	case LOCK_ENABLED_HARDIRQS:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
@@ -2085,6 +2173,32 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_HELD_OVER_RECLAIM_FS:
+		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
+			return 0;
+		if (!valid_state(curr, this, new_bit,
+				 LOCK_USED_IN_RECLAIM_FS_READ))
+			return 0;
+		/*
+		 * just marked it reclaim-fs-unsafe, check that no reclaim-fs-safe
+		 * lock in the system ever took it in the past:
+		 */
+		if (!check_usage_backwards(curr, this,
+					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+#if STRICT_READ_CHECKS
+		/*
+		 * just marked it softirq-unsafe, check that no
+		 * softirq-safe-read lock in the system ever took
+		 * it in the past:
+		 */
+		if (!check_usage_backwards(curr, this,
+				   LOCK_USED_IN_RECLAIM_FS_READ, "reclaim-fs-read"))
+			return 0;
+#endif
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	case LOCK_ENABLED_HARDIRQS_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
@@ -2115,6 +2229,21 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
+	case LOCK_HELD_OVER_RECLAIM_FS_READ:
+		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
+			return 0;
+#if STRICT_READ_CHECKS
+		/*
+		 * just marked it reclaim-fs-read-unsafe, check that no
+		 * reclaim-fs-safe lock in the system ever took it in the past:
+		 */
+		if (!check_usage_backwards(curr, this,
+					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
+			return 0;
+#endif
+		if (reclaim_verbose(hlock_class(this)))
+			ret = 2;
+		break;
 	default:
 		WARN_ON(1);
 		break;
@@ -2123,11 +2252,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	return ret;
 }
 
+enum mark_type {
+	HARDIRQ,
+	SOFTIRQ,
+	RECLAIM_FS,
+};
+
 /*
  * Mark all held locks with a usage bit:
  */
 static int
-mark_held_locks(struct task_struct *curr, int hardirq)
+mark_held_locks(struct task_struct *curr, enum mark_type mark)
 {
 	enum lock_usage_bit usage_bit;
 	struct held_lock *hlock;
@@ -2136,17 +2271,32 @@ mark_held_locks(struct task_struct *curr, int hardirq)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		hlock = curr->held_locks + i;
 
-		if (hardirq) {
+		switch (mark) {
+		case HARDIRQ:
 			if (hlock->read)
 				usage_bit = LOCK_ENABLED_HARDIRQS_READ;
 			else
 				usage_bit = LOCK_ENABLED_HARDIRQS;
-		} else {
+			break;
+
+		case SOFTIRQ:
 			if (hlock->read)
 				usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
 			else
 				usage_bit = LOCK_ENABLED_SOFTIRQS;
+			break;
+
+		case RECLAIM_FS:
+			if (hlock->read)
+				usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ;
+			else
+				usage_bit = LOCK_HELD_OVER_RECLAIM_FS;
+			break;
+
+		default:
+			BUG();
 		}
+
 		if (!mark_lock(curr, hlock, usage_bit))
 			return 0;
 	}
@@ -2200,7 +2350,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	 * We are going to turn hardirqs on, so set the
 	 * usage bit for all held locks:
 	 */
-	if (!mark_held_locks(curr, 1))
+	if (!mark_held_locks(curr, HARDIRQ))
 		return;
 	/*
 	 * If we have softirqs enabled, then set the usage
@@ -2208,7 +2358,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	 * this bit from being set before)
 	 */
 	if (curr->softirqs_enabled)
-		if (!mark_held_locks(curr, 0))
+		if (!mark_held_locks(curr, SOFTIRQ))
 			return;
 
 	curr->hardirq_enable_ip = ip;
@@ -2288,7 +2438,7 @@ void trace_softirqs_on(unsigned long ip)
 	 * enabled too:
 	 */
 	if (curr->hardirqs_enabled)
-		mark_held_locks(curr, 0);
+		mark_held_locks(curr, SOFTIRQ);
 }
 
 /*
@@ -2317,6 +2467,31 @@ void trace_softirqs_off(unsigned long ip)
 		debug_atomic_inc(&redundant_softirqs_off);
 }
 
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+	struct task_struct *curr = current;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	/* no reclaim without waiting on it */
+	if (!(gfp_mask & __GFP_WAIT))
+		return;
+
+	/* this guy won't enter reclaim */
+	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+		return;
+
+	/* We're only interested __GFP_FS allocations for now */
+	if (!(gfp_mask & __GFP_FS))
+		return;
+
+	if (DEBUG_LOCKS_WARN_ON(irqs_disabled()))
+		return;
+
+	mark_held_locks(curr, RECLAIM_FS);
+}
+
 static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 {
 	/*
@@ -2362,6 +2537,22 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 		}
 	}
 
+	/*
+	 * We reuse the irq context infrastructure more broadly as a general
+	 * context checking code. This tests GFP_FS recursion (a lock taken
+	 * during reclaim for a GFP_FS allocation is held over a GFP_FS
+	 * allocation).
+	 */
+	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
+		if (hlock->read) {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
+					return 0;
+		}
+	}
+
 	return 1;
 }
 
@@ -2453,6 +2644,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	case LOCK_ENABLED_SOFTIRQS:
 	case LOCK_ENABLED_HARDIRQS_READ:
 	case LOCK_ENABLED_SOFTIRQS_READ:
+	case LOCK_USED_IN_RECLAIM_FS:
+	case LOCK_USED_IN_RECLAIM_FS_READ:
+	case LOCK_HELD_OVER_RECLAIM_FS:
+	case LOCK_HELD_OVER_RECLAIM_FS_READ:
 		ret = mark_lock_irq(curr, this, new_bit);
 		if (!ret)
 			return 0;
@@ -2966,6 +3161,16 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
+void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+{
+	current->lockdep_reclaim_gfp = gfp_mask;
+}
+
+void lockdep_clear_current_reclaim_state(void)
+{
+	current->lockdep_reclaim_gfp = 0;
+}
+
 #ifdef CONFIG_LOCK_STAT
 static int
 print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 56b1969..e887b78 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -32,7 +32,8 @@ extern struct list_head all_lock_classes;
 extern struct lock_chain lock_chains[];
 
 extern void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
+					char *c4, char *c5, char *c6);
 
 extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
 
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 13716b8..b84a1df 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
 {
 	struct lock_class *class = v;
 	struct lock_list *entry;
-	char c1, c2, c3, c4;
+	char c1, c2, c3, c4, c5, c6;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
 	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
 #endif
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4);
-	seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
+	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
+	seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6);
 
 	seq_printf(m, ": ");
 	print_name(m, class);
-- 
cgit v1.1


From 4fc95e867f1e75351b89db3c68212dfcce7ea563 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 13:10:52 +0100
Subject: lockdep: sanitize bit names

s/\(LOCKF\?_ENABLED_[^ ]*\)S\(_READ\)\?\>/\1\2/g

So that the USED_IN and ENABLED have the same names.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c      | 84 +++++++++++++++++++++++++--------------------------
 kernel/lockdep_proc.c | 12 ++++----
 2 files changed, 48 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 977f940..32f9447 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -450,12 +450,12 @@ static const char *usage_str[] =
 	[LOCK_USED] =			"initial-use ",
 	[LOCK_USED_IN_HARDIRQ] =	"in-hardirq-W",
 	[LOCK_USED_IN_SOFTIRQ] =	"in-softirq-W",
-	[LOCK_ENABLED_SOFTIRQS] =	"softirq-on-W",
-	[LOCK_ENABLED_HARDIRQS] =	"hardirq-on-W",
+	[LOCK_ENABLED_SOFTIRQ] =	"softirq-on-W",
+	[LOCK_ENABLED_HARDIRQ] =	"hardirq-on-W",
 	[LOCK_USED_IN_HARDIRQ_READ] =	"in-hardirq-R",
 	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R",
-	[LOCK_ENABLED_SOFTIRQS_READ] =	"softirq-on-R",
-	[LOCK_ENABLED_HARDIRQS_READ] =	"hardirq-on-R",
+	[LOCK_ENABLED_SOFTIRQ_READ] =	"softirq-on-R",
+	[LOCK_ENABLED_HARDIRQ_READ] =	"hardirq-on-R",
 	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
 	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
 	[LOCK_HELD_OVER_RECLAIM_FS] =	"ov-reclaim-W",
@@ -476,28 +476,28 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
 	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
 		*c1 = '+';
 	else
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
 			*c1 = '-';
 
 	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
 		*c2 = '+';
 	else
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
 			*c2 = '-';
 
-	if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+	if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
 		*c3 = '-';
 	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
 		*c3 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
 			*c3 = '?';
 	}
 
-	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
 		*c4 = '-';
 	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
 		*c4 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
 			*c4 = '?';
 	}
 
@@ -1296,7 +1296,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
-					LOCK_ENABLED_HARDIRQS, "hard"))
+					LOCK_ENABLED_HARDIRQ, "hard"))
 		return 0;
 
 	/*
@@ -1306,7 +1306,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
-					LOCK_ENABLED_HARDIRQS, "hard-read"))
+					LOCK_ENABLED_HARDIRQ, "hard-read"))
 		return 0;
 
 	/*
@@ -1316,7 +1316,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
-					LOCK_ENABLED_SOFTIRQS, "soft"))
+					LOCK_ENABLED_SOFTIRQ, "soft"))
 		return 0;
 	/*
 	 * Prove that the new dependency does not connect a softirq-safe-read
@@ -1325,7 +1325,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
-					LOCK_ENABLED_SOFTIRQS, "soft"))
+					LOCK_ENABLED_SOFTIRQ, "soft"))
 		return 0;
 
 	/*
@@ -2008,17 +2008,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_HARDIRQS_READ))
+				 LOCK_ENABLED_HARDIRQ_READ))
 			return 0;
 		/*
 		 * just marked it hardirq-safe, check that this lock
 		 * took no hardirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQS, "hard"))
+					  LOCK_ENABLED_HARDIRQ, "hard"))
 			return 0;
 #if STRICT_READ_CHECKS
 		/*
@@ -2026,24 +2026,24 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		 * took no hardirq-unsafe-read lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
+				LOCK_ENABLED_HARDIRQ_READ, "hard-read"))
 			return 0;
 #endif
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_SOFTIRQS_READ))
+				 LOCK_ENABLED_SOFTIRQ_READ))
 			return 0;
 		/*
 		 * just marked it softirq-safe, check that this lock
 		 * took no softirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQS, "soft"))
+					  LOCK_ENABLED_SOFTIRQ, "soft"))
 			return 0;
 #if STRICT_READ_CHECKS
 		/*
@@ -2051,7 +2051,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		 * took no softirq-unsafe-read lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
+				LOCK_ENABLED_SOFTIRQ_READ, "soft-read"))
 			return 0;
 #endif
 		if (softirq_verbose(hlock_class(this)))
@@ -2083,27 +2083,27 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 			ret = 2;
 		break;
 	case LOCK_USED_IN_HARDIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
 			return 0;
 		/*
 		 * just marked it hardirq-read-safe, check that this lock
 		 * took no hardirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQS, "hard"))
+					  LOCK_ENABLED_HARDIRQ, "hard"))
 			return 0;
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
 			return 0;
 		/*
 		 * just marked it softirq-read-safe, check that this lock
 		 * took no softirq-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQS, "soft"))
+					  LOCK_ENABLED_SOFTIRQ, "soft"))
 			return 0;
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
@@ -2121,7 +2121,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (reclaim_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_HARDIRQS:
+	case LOCK_ENABLED_HARDIRQ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
@@ -2147,7 +2147,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_SOFTIRQS:
+	case LOCK_ENABLED_SOFTIRQ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
@@ -2199,7 +2199,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (reclaim_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_HARDIRQS_READ:
+	case LOCK_ENABLED_HARDIRQ_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
 			return 0;
 #if STRICT_READ_CHECKS
@@ -2214,7 +2214,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_ENABLED_SOFTIRQS_READ:
+	case LOCK_ENABLED_SOFTIRQ_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
 			return 0;
 #if STRICT_READ_CHECKS
@@ -2274,16 +2274,16 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 		switch (mark) {
 		case HARDIRQ:
 			if (hlock->read)
-				usage_bit = LOCK_ENABLED_HARDIRQS_READ;
+				usage_bit = LOCK_ENABLED_HARDIRQ_READ;
 			else
-				usage_bit = LOCK_ENABLED_HARDIRQS;
+				usage_bit = LOCK_ENABLED_HARDIRQ;
 			break;
 
 		case SOFTIRQ:
 			if (hlock->read)
-				usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
+				usage_bit = LOCK_ENABLED_SOFTIRQ_READ;
 			else
-				usage_bit = LOCK_ENABLED_SOFTIRQS;
+				usage_bit = LOCK_ENABLED_SOFTIRQ;
 			break;
 
 		case RECLAIM_FS:
@@ -2520,19 +2520,19 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 	if (!hlock->hardirqs_off) {
 		if (hlock->read) {
 			if (!mark_lock(curr, hlock,
-					LOCK_ENABLED_HARDIRQS_READ))
+					LOCK_ENABLED_HARDIRQ_READ))
 				return 0;
 			if (curr->softirqs_enabled)
 				if (!mark_lock(curr, hlock,
-						LOCK_ENABLED_SOFTIRQS_READ))
+						LOCK_ENABLED_SOFTIRQ_READ))
 					return 0;
 		} else {
 			if (!mark_lock(curr, hlock,
-					LOCK_ENABLED_HARDIRQS))
+					LOCK_ENABLED_HARDIRQ))
 				return 0;
 			if (curr->softirqs_enabled)
 				if (!mark_lock(curr, hlock,
-						LOCK_ENABLED_SOFTIRQS))
+						LOCK_ENABLED_SOFTIRQ))
 					return 0;
 		}
 	}
@@ -2640,10 +2640,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_SOFTIRQ:
 	case LOCK_USED_IN_HARDIRQ_READ:
 	case LOCK_USED_IN_SOFTIRQ_READ:
-	case LOCK_ENABLED_HARDIRQS:
-	case LOCK_ENABLED_SOFTIRQS:
-	case LOCK_ENABLED_HARDIRQS_READ:
-	case LOCK_ENABLED_SOFTIRQS_READ:
+	case LOCK_ENABLED_HARDIRQ:
+	case LOCK_ENABLED_SOFTIRQ:
+	case LOCK_ENABLED_HARDIRQ_READ:
+	case LOCK_ENABLED_SOFTIRQ_READ:
 	case LOCK_USED_IN_RECLAIM_FS:
 	case LOCK_USED_IN_RECLAIM_FS_READ:
 	case LOCK_HELD_OVER_RECLAIM_FS:
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b84a1df..bd474fd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			nr_uncategorized++;
 		if (class->usage_mask & LOCKF_USED_IN_IRQ)
 			nr_irq_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_IRQS)
+		if (class->usage_mask & LOCKF_ENABLED_IRQ)
 			nr_irq_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
 			nr_softirq_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
 			nr_softirq_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
 			nr_hardirq_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
 			nr_hardirq_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
 			nr_irq_read_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
 			nr_irq_read_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
 			nr_softirq_read_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
 			nr_softirq_read_unsafe++;
 		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
 			nr_hardirq_read_safe++;
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
 			nr_hardirq_read_unsafe++;
 
 #ifdef CONFIG_PROVE_LOCKING
-- 
cgit v1.1


From a652d7081bc96b3094e85ca30e47f50185d2f717 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 13:13:11 +0100
Subject: lockdep: sanitize reclaim bit names

s/HELD_OVER/ENABLED/g

so that its similar to the hard and soft-irq names.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 32f9447..dd4716c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -458,8 +458,8 @@ static const char *usage_str[] =
 	[LOCK_ENABLED_HARDIRQ_READ] =	"hardirq-on-R",
 	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
 	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
-	[LOCK_HELD_OVER_RECLAIM_FS] =	"ov-reclaim-W",
-	[LOCK_HELD_OVER_RECLAIM_FS_READ] = "ov-reclaim-R",
+	[LOCK_ENABLED_RECLAIM_FS] =	"ov-reclaim-W",
+	[LOCK_ENABLED_RECLAIM_FS_READ] = "ov-reclaim-R",
 };
 
 const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -504,14 +504,14 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
 	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS)
 		*c5 = '+';
 	else
-		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS)
+		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS)
 			*c5 = '-';
 
-	if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+	if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
 		*c6 = '-';
 	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) {
 		*c6 = '+';
-		if (class->usage_mask & LOCKF_HELD_OVER_RECLAIM_FS_READ)
+		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
 			*c6 = '?';
 	}
 
@@ -1335,7 +1335,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS,
-					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
 		return 0;
 
 	/*
@@ -1345,7 +1345,7 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
 	 * forwards-subgraph starting at <next>:
 	 */
 	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ,
-					LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs-read"))
+					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs-read"))
 		return 0;
 
 	return 1;
@@ -2058,17 +2058,17 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 			ret = 2;
 		break;
 	case LOCK_USED_IN_RECLAIM_FS:
-		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
-				 LOCK_HELD_OVER_RECLAIM_FS_READ))
+				 LOCK_ENABLED_RECLAIM_FS_READ))
 			return 0;
 		/*
 		 * just marked it reclaim-fs-safe, check that this lock
 		 * took no reclaim-fs-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
 			return 0;
 #if STRICT_READ_CHECKS
 		/*
@@ -2076,7 +2076,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		 * took no reclaim-fs-unsafe-read lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-				LOCK_HELD_OVER_RECLAIM_FS_READ, "reclaim-fs-read"))
+				LOCK_ENABLED_RECLAIM_FS_READ, "reclaim-fs-read"))
 			return 0;
 #endif
 		if (reclaim_verbose(hlock_class(this)))
@@ -2109,14 +2109,14 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 			ret = 2;
 		break;
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_HELD_OVER_RECLAIM_FS))
+		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
 			return 0;
 		/*
 		 * just marked it reclaim-fs-read-safe, check that this lock
 		 * took no reclaim-fs-unsafe lock in the past:
 		 */
 		if (!check_usage_forwards(curr, this,
-					  LOCK_HELD_OVER_RECLAIM_FS, "reclaim-fs"))
+					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
 			return 0;
 		if (reclaim_verbose(hlock_class(this)))
 			ret = 2;
@@ -2173,7 +2173,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_HELD_OVER_RECLAIM_FS:
+	case LOCK_ENABLED_RECLAIM_FS:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
 			return 0;
 		if (!valid_state(curr, this, new_bit,
@@ -2229,7 +2229,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
-	case LOCK_HELD_OVER_RECLAIM_FS_READ:
+	case LOCK_ENABLED_RECLAIM_FS_READ:
 		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
 			return 0;
 #if STRICT_READ_CHECKS
@@ -2288,9 +2288,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 
 		case RECLAIM_FS:
 			if (hlock->read)
-				usage_bit = LOCK_HELD_OVER_RECLAIM_FS_READ;
+				usage_bit = LOCK_ENABLED_RECLAIM_FS_READ;
 			else
-				usage_bit = LOCK_HELD_OVER_RECLAIM_FS;
+				usage_bit = LOCK_ENABLED_RECLAIM_FS;
 			break;
 
 		default:
@@ -2646,8 +2646,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	case LOCK_ENABLED_SOFTIRQ_READ:
 	case LOCK_USED_IN_RECLAIM_FS:
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-	case LOCK_HELD_OVER_RECLAIM_FS:
-	case LOCK_HELD_OVER_RECLAIM_FS_READ:
+	case LOCK_ENABLED_RECLAIM_FS:
+	case LOCK_ENABLED_RECLAIM_FS_READ:
 		ret = mark_lock_irq(curr, this, new_bit);
 		if (!ret)
 			return 0;
-- 
cgit v1.1


From 9fe51abf7a1c787f918f66fa3cef9cd0cedb3791 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:09:46 +0100
Subject: lockdep: lockdep_states.h

Introduce a header file to generate all the states from.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_states.h | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 kernel/lockdep_states.h

(limited to 'kernel')

diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
new file mode 100644
index 0000000..937039e
--- /dev/null
+++ b/kernel/lockdep_states.h
@@ -0,0 +1,3 @@
+LOCKDEP_STATE(HARDIRQ)
+LOCKDEP_STATE(SOFTIRQ)
+LOCKDEP_STATE(RECLAIM_FS)
-- 
cgit v1.1


From 36bfb9bb03db2002a8574600c6aeb4cdd1ba01a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:12:41 +0100
Subject: lockdep: simplify mark_held_locks

remove the explicit state iteration

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 38 ++++++++++++++------------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dd4716c..18e0990 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2253,11 +2253,19 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 }
 
 enum mark_type {
-	HARDIRQ,
-	SOFTIRQ,
-	RECLAIM_FS,
+#define LOCKDEP_STATE(__STATE)	__STATE,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 };
 
+#define MARK_HELD_CASE(__STATE)						\
+	case __STATE:							\
+		if (hlock->read)					\
+			usage_bit = LOCK_ENABLED_##__STATE##_READ;	\
+		else							\
+			usage_bit = LOCK_ENABLED_##__STATE;		\
+		break;
+
 /*
  * Mark all held locks with a usage bit:
  */
@@ -2272,27 +2280,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 		hlock = curr->held_locks + i;
 
 		switch (mark) {
-		case HARDIRQ:
-			if (hlock->read)
-				usage_bit = LOCK_ENABLED_HARDIRQ_READ;
-			else
-				usage_bit = LOCK_ENABLED_HARDIRQ;
-			break;
-
-		case SOFTIRQ:
-			if (hlock->read)
-				usage_bit = LOCK_ENABLED_SOFTIRQ_READ;
-			else
-				usage_bit = LOCK_ENABLED_SOFTIRQ;
-			break;
-
-		case RECLAIM_FS:
-			if (hlock->read)
-				usage_bit = LOCK_ENABLED_RECLAIM_FS_READ;
-			else
-				usage_bit = LOCK_ENABLED_RECLAIM_FS;
-			break;
-
+#define LOCKDEP_STATE(__STATE) MARK_HELD_CASE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 		default:
 			BUG();
 		}
-- 
cgit v1.1


From 5346417e17daf5a7712e4cf030b45414e46607cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:15:53 +0100
Subject: lockdep: simplify mark_lock()

remove the state iteration

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 18e0990..e68bd7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2626,18 +2626,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		return 0;
 
 	switch (new_bit) {
-	case LOCK_USED_IN_HARDIRQ:
-	case LOCK_USED_IN_SOFTIRQ:
-	case LOCK_USED_IN_HARDIRQ_READ:
-	case LOCK_USED_IN_SOFTIRQ_READ:
-	case LOCK_ENABLED_HARDIRQ:
-	case LOCK_ENABLED_SOFTIRQ:
-	case LOCK_ENABLED_HARDIRQ_READ:
-	case LOCK_ENABLED_SOFTIRQ_READ:
-	case LOCK_USED_IN_RECLAIM_FS:
-	case LOCK_USED_IN_RECLAIM_FS_READ:
-	case LOCK_ENABLED_RECLAIM_FS:
-	case LOCK_ENABLED_RECLAIM_FS_READ:
+#define LOCKDEP_STATE(__STATE)			\
+	case LOCK_USED_IN_##__STATE:		\
+	case LOCK_USED_IN_##__STATE##_READ:	\
+	case LOCK_ENABLED_##__STATE:		\
+	case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 		ret = mark_lock_irq(curr, this, new_bit);
 		if (!ret)
 			return 0;
-- 
cgit v1.1


From 9851673bc32bc9fcafbbaeffc858ead434bd6d58 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:18:40 +0100
Subject: lockdep: move state bit definitions around

For convenience later.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_internals.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/lockdep_states.h    |  6 ++++++
 2 files changed, 52 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index e887b78..1352409 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -7,6 +7,52 @@
  */
 
 /*
+ * Lock-class usage-state bits:
+ */
+enum lock_usage_bit {
+	LOCK_USED = 0,
+	LOCK_USED_IN_HARDIRQ,
+	LOCK_USED_IN_SOFTIRQ,
+	LOCK_USED_IN_RECLAIM_FS,
+	LOCK_ENABLED_SOFTIRQ,
+	LOCK_ENABLED_HARDIRQ,
+	LOCK_ENABLED_RECLAIM_FS,
+	LOCK_USED_IN_HARDIRQ_READ,
+	LOCK_USED_IN_SOFTIRQ_READ,
+	LOCK_USED_IN_RECLAIM_FS_READ,
+	LOCK_ENABLED_SOFTIRQ_READ,
+	LOCK_ENABLED_HARDIRQ_READ,
+	LOCK_ENABLED_RECLAIM_FS_READ,
+	LOCK_USAGE_STATES
+};
+
+/*
+ * Usage-state bitmasks:
+ */
+#define LOCKF_USED			(1 << LOCK_USED)
+#define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
+#define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
+#define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
+#define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
+#define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
+#define LOCKF_ENABLED_RECLAIM_FS	(1 << LOCK_ENABLED_RECLAIM_FS)
+
+#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
+#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+
+#define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
+#define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
+#define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
+#define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
+#define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
+#define LOCKF_ENABLED_RECLAIM_FS_READ	(1 << LOCK_ENABLED_RECLAIM_FS_READ)
+
+#define LOCKF_ENABLED_IRQ_READ \
+		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
+#define LOCKF_USED_IN_IRQ_READ \
+		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+
+/*
  * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
  * we track.
  *
diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
index 937039e..995b0cc 100644
--- a/kernel/lockdep_states.h
+++ b/kernel/lockdep_states.h
@@ -1,3 +1,9 @@
+/*
+ * Lockdep states,
+ *
+ * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
+ * you add one, or come up with a nice dynamic solution.
+ */
 LOCKDEP_STATE(HARDIRQ)
 LOCKDEP_STATE(SOFTIRQ)
 LOCKDEP_STATE(RECLAIM_FS)
-- 
cgit v1.1


From d7b1b02134272840f4b655136e00c461e1cf1d53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:38:38 +0100
Subject: lockdep: generate the state bit definitions

Generate the state bit definitions from the lockdep_states.h file.

Also, move LOCK_USED to last, so that the

 USED_IN
 USED_IN_READ
 ENABLED
 ENABLED_READ

states are nicely bit aligned -- we're going to use that property

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_internals.h | 47 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 1352409..7e653e6 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -10,43 +10,36 @@
  * Lock-class usage-state bits:
  */
 enum lock_usage_bit {
-	LOCK_USED = 0,
-	LOCK_USED_IN_HARDIRQ,
-	LOCK_USED_IN_SOFTIRQ,
-	LOCK_USED_IN_RECLAIM_FS,
-	LOCK_ENABLED_SOFTIRQ,
-	LOCK_ENABLED_HARDIRQ,
-	LOCK_ENABLED_RECLAIM_FS,
-	LOCK_USED_IN_HARDIRQ_READ,
-	LOCK_USED_IN_SOFTIRQ_READ,
-	LOCK_USED_IN_RECLAIM_FS_READ,
-	LOCK_ENABLED_SOFTIRQ_READ,
-	LOCK_ENABLED_HARDIRQ_READ,
-	LOCK_ENABLED_RECLAIM_FS_READ,
+#define LOCKDEP_STATE(__STATE)		\
+	LOCK_USED_IN_##__STATE,		\
+	LOCK_USED_IN_##__STATE##_READ,	\
+	LOCK_ENABLED_##__STATE,		\
+	LOCK_ENABLED_##__STATE##_READ,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+	LOCK_USED,
 	LOCK_USAGE_STATES
 };
 
 /*
  * Usage-state bitmasks:
  */
-#define LOCKF_USED			(1 << LOCK_USED)
-#define LOCKF_USED_IN_HARDIRQ		(1 << LOCK_USED_IN_HARDIRQ)
-#define LOCKF_USED_IN_SOFTIRQ		(1 << LOCK_USED_IN_SOFTIRQ)
-#define LOCKF_USED_IN_RECLAIM_FS	(1 << LOCK_USED_IN_RECLAIM_FS)
-#define LOCKF_ENABLED_HARDIRQ		(1 << LOCK_ENABLED_HARDIRQ)
-#define LOCKF_ENABLED_SOFTIRQ		(1 << LOCK_ENABLED_SOFTIRQ)
-#define LOCKF_ENABLED_RECLAIM_FS	(1 << LOCK_ENABLED_RECLAIM_FS)
+#define __LOCKF(__STATE)	LOCKF_##__STATE = (1 << LOCK_##__STATE),
+
+enum {
+#define LOCKDEP_STATE(__STATE)						\
+	__LOCKF(USED_IN_##__STATE)					\
+	__LOCKF(USED_IN_##__STATE##_READ)				\
+	__LOCKF(ENABLED_##__STATE)					\
+	__LOCKF(ENABLED_##__STATE##_READ)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+	__LOCKF(USED)
+};
 
 #define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
 #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
 
-#define LOCKF_USED_IN_HARDIRQ_READ	(1 << LOCK_USED_IN_HARDIRQ_READ)
-#define LOCKF_USED_IN_SOFTIRQ_READ	(1 << LOCK_USED_IN_SOFTIRQ_READ)
-#define LOCKF_USED_IN_RECLAIM_FS_READ	(1 << LOCK_USED_IN_RECLAIM_FS_READ)
-#define LOCKF_ENABLED_HARDIRQ_READ	(1 << LOCK_ENABLED_HARDIRQ_READ)
-#define LOCKF_ENABLED_SOFTIRQ_READ	(1 << LOCK_ENABLED_SOFTIRQ_READ)
-#define LOCKF_ENABLED_RECLAIM_FS_READ	(1 << LOCK_ENABLED_RECLAIM_FS_READ)
-
 #define LOCKF_ENABLED_IRQ_READ \
 		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
 #define LOCKF_USED_IN_IRQ_READ \
-- 
cgit v1.1


From fabe9c42c6328de314d811887b4752eb3d202291 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 14:51:01 +0100
Subject: lockdep: generate usage strings

generate the usage strings

XXX capital invasion :-(

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e68bd7d..d31f7f8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -445,21 +445,21 @@ atomic_t nr_find_usage_backwards_recursions;
  * Locking printouts:
  */
 
+#define __STR(foo)	#foo
+#define STR(foo)	__STR(foo)
+
+#define __USAGE(__STATE)						\
+	[LOCK_USED_IN_##__STATE] = "IN-"STR(__STATE)"-W",		\
+	[LOCK_ENABLED_##__STATE] = STR(__STATE)"-ON-W",			\
+	[LOCK_USED_IN_##__STATE##_READ] = "IN-"STR(__STATE)"-R",	\
+	[LOCK_ENABLED_##__STATE##_READ] = STR(__STATE)"-ON-R",
+
 static const char *usage_str[] =
 {
-	[LOCK_USED] =			"initial-use ",
-	[LOCK_USED_IN_HARDIRQ] =	"in-hardirq-W",
-	[LOCK_USED_IN_SOFTIRQ] =	"in-softirq-W",
-	[LOCK_ENABLED_SOFTIRQ] =	"softirq-on-W",
-	[LOCK_ENABLED_HARDIRQ] =	"hardirq-on-W",
-	[LOCK_USED_IN_HARDIRQ_READ] =	"in-hardirq-R",
-	[LOCK_USED_IN_SOFTIRQ_READ] =	"in-softirq-R",
-	[LOCK_ENABLED_SOFTIRQ_READ] =	"softirq-on-R",
-	[LOCK_ENABLED_HARDIRQ_READ] =	"hardirq-on-R",
-	[LOCK_USED_IN_RECLAIM_FS] =	"in-reclaim-W",
-	[LOCK_USED_IN_RECLAIM_FS_READ] = "in-reclaim-R",
-	[LOCK_ENABLED_RECLAIM_FS] =	"ov-reclaim-W",
-	[LOCK_ENABLED_RECLAIM_FS_READ] = "ov-reclaim-R",
+#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+	[LOCK_USED] = "INITIAL USE",
 };
 
 const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
-- 
cgit v1.1


From 6a6904d3475cc5e4a506b5c3c3684444e22c4cc4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:07:44 +0100
Subject: lockdep: split up mark_lock_irq()

split mark_lock_irq() into 4 simple helper functions

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 372 ++++++++++++++++++++++---------------------------------
 1 file changed, 147 insertions(+), 225 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d31f7f8..0b863c8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2001,6 +2001,109 @@ static int reclaim_verbose(struct lock_class *class)
 
 #define STRICT_READ_CHECKS	1
 
+static int
+mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
+		      int new_bit, int excl_bit,
+		      const char *name, const char *rname,
+		      int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+	if (!valid_state(curr, this, new_bit, excl_bit + 1))
+		return 0;
+	/*
+	 * just marked it hardirq-safe, check that this lock
+	 * took no hardirq-unsafe lock in the past:
+	 */
+	if (!check_usage_forwards(curr, this, excl_bit, name))
+		return 0;
+#if STRICT_READ_CHECKS
+	/*
+	 * just marked it hardirq-safe, check that this lock
+	 * took no hardirq-unsafe-read lock in the past:
+	 */
+	if (!check_usage_forwards(curr, this, excl_bit + 1, rname))
+		return 0;
+#endif
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
+static int
+mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
+			   int new_bit, int excl_bit,
+			   const char *name, const char *rname,
+			   int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+	/*
+	 * just marked it hardirq-read-safe, check that this lock
+	 * took no hardirq-unsafe lock in the past:
+	 */
+	if (!check_usage_forwards(curr, this, excl_bit, name))
+		return 0;
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
+static int
+mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
+		      int new_bit, int excl_bit,
+		      const char *name, const char *rname,
+		      int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+	if (!valid_state(curr, this, new_bit, excl_bit + 1))
+		return 0;
+	/*
+	 * just marked it hardirq-unsafe, check that no hardirq-safe
+	 * lock in the system ever took it in the past:
+	 */
+	if (!check_usage_backwards(curr, this, excl_bit, name))
+		return 0;
+#if STRICT_READ_CHECKS
+	/*
+	 * just marked it hardirq-unsafe, check that no
+	 * hardirq-safe-read lock in the system ever took
+	 * it in the past:
+	 */
+	if (!check_usage_backwards(curr, this, excl_bit + 1, rname))
+		return 0;
+#endif
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
+static int
+mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
+			   int new_bit, int excl_bit,
+			   const char *name, const char *rname,
+			   int (*verbose)(struct lock_class *class))
+{
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+#if STRICT_READ_CHECKS
+	/*
+	 * just marked it hardirq-read-unsafe, check that no
+	 * hardirq-safe lock in the system ever took it in the past:
+	 */
+	if (!check_usage_backwards(curr, this, excl_bit, name))
+		return 0;
+#endif
+	if (verbose(hlock_class(this)))
+		return 2;
+
+	return 1;
+}
+
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
@@ -2008,242 +2111,61 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_HARDIRQ_READ))
-			return 0;
-		/*
-		 * just marked it hardirq-safe, check that this lock
-		 * took no hardirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQ, "hard"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it hardirq-safe, check that this lock
-		 * took no hardirq-unsafe-read lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_HARDIRQ_READ, "hard-read"))
-			return 0;
-#endif
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in(curr, this, new_bit,
+				LOCK_ENABLED_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_SOFTIRQ_READ))
-			return 0;
-		/*
-		 * just marked it softirq-safe, check that this lock
-		 * took no softirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQ, "soft"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-safe, check that this lock
-		 * took no softirq-unsafe-read lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_SOFTIRQ_READ, "soft-read"))
-			return 0;
-#endif
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in(curr, this, new_bit,
+				LOCK_ENABLED_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_ENABLED_RECLAIM_FS_READ))
-			return 0;
-		/*
-		 * just marked it reclaim-fs-safe, check that this lock
-		 * took no reclaim-fs-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it reclaim-fs-safe, check that this lock
-		 * took no reclaim-fs-unsafe-read lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-				LOCK_ENABLED_RECLAIM_FS_READ, "reclaim-fs-read"))
-			return 0;
-#endif
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in(curr, this, new_bit,
+				LOCK_ENABLED_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	case LOCK_USED_IN_HARDIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQ))
-			return 0;
-		/*
-		 * just marked it hardirq-read-safe, check that this lock
-		 * took no hardirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_HARDIRQ, "hard"))
-			return 0;
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in_read(curr, this, new_bit,
+				LOCK_ENABLED_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQ))
-			return 0;
-		/*
-		 * just marked it softirq-read-safe, check that this lock
-		 * took no softirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_SOFTIRQ, "soft"))
-			return 0;
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in_read(curr, this, new_bit,
+				LOCK_ENABLED_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_ENABLED_RECLAIM_FS))
-			return 0;
-		/*
-		 * just marked it reclaim-fs-read-safe, check that this lock
-		 * took no reclaim-fs-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this,
-					  LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_used_in_read(curr, this, new_bit,
+				LOCK_ENABLED_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	case LOCK_ENABLED_HARDIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_USED_IN_HARDIRQ_READ))
-			return 0;
-		/*
-		 * just marked it hardirq-unsafe, check that no hardirq-safe
-		 * lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_HARDIRQ, "hard"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it hardirq-unsafe, check that no
-		 * hardirq-safe-read lock in the system ever took
-		 * it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-				   LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
-			return 0;
-#endif
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled(curr, this, new_bit,
+				LOCK_USED_IN_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_USED_IN_SOFTIRQ_READ))
-			return 0;
-		/*
-		 * just marked it softirq-unsafe, check that no softirq-safe
-		 * lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_SOFTIRQ, "soft"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-unsafe, check that no
-		 * softirq-safe-read lock in the system ever took
-		 * it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-				   LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
-			return 0;
-#endif
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled(curr, this, new_bit,
+				LOCK_USED_IN_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
-			return 0;
-		if (!valid_state(curr, this, new_bit,
-				 LOCK_USED_IN_RECLAIM_FS_READ))
-			return 0;
-		/*
-		 * just marked it reclaim-fs-unsafe, check that no reclaim-fs-safe
-		 * lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-unsafe, check that no
-		 * softirq-safe-read lock in the system ever took
-		 * it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-				   LOCK_USED_IN_RECLAIM_FS_READ, "reclaim-fs-read"))
-			return 0;
-#endif
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled(curr, this, new_bit,
+				LOCK_USED_IN_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	case LOCK_ENABLED_HARDIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it hardirq-read-unsafe, check that no
-		 * hardirq-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_HARDIRQ, "hard"))
-			return 0;
-#endif
-		if (hardirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled_read(curr, this, new_bit,
+				LOCK_USED_IN_HARDIRQ,
+				"hard", "hard-read", hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it softirq-read-unsafe, check that no
-		 * softirq-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_SOFTIRQ, "soft"))
-			return 0;
-#endif
-		if (softirq_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled_read(curr, this, new_bit,
+				LOCK_USED_IN_SOFTIRQ,
+				"soft", "soft-read", softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
-		if (!valid_state(curr, this, new_bit, LOCK_USED_IN_RECLAIM_FS))
-			return 0;
-#if STRICT_READ_CHECKS
-		/*
-		 * just marked it reclaim-fs-read-unsafe, check that no
-		 * reclaim-fs-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this,
-					   LOCK_USED_IN_RECLAIM_FS, "reclaim-fs"))
-			return 0;
-#endif
-		if (reclaim_verbose(hlock_class(this)))
-			ret = 2;
-		break;
+		return mark_lock_irq_enabled_read(curr, this, new_bit,
+				LOCK_USED_IN_RECLAIM_FS,
+				"reclaim-fs", "reclaim-fs-read",
+				reclaim_verbose);
+
 	default:
 		WARN_ON(1);
 		break;
-- 
cgit v1.1


From 604de3b5b63ebc33a762c44d9c742f235b010346 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:24:44 +0100
Subject: lockdep: simplify the mark_lock_irq() helpers

In order to unify them, take some arguments away

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 60 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0b863c8..989a60b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2001,12 +2001,38 @@ static int reclaim_verbose(struct lock_class *class)
 
 #define STRICT_READ_CHECKS	1
 
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
+{
+	return state_names[bit >> 2];
+}
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_rname(enum lock_usage_bit bit)
+{
+	return state_rnames[bit >> 2];
+}
+
 static int
 mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 		      int new_bit, int excl_bit,
-		      const char *name, const char *rname,
 		      int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2034,9 +2060,11 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 static int
 mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 			   int new_bit, int excl_bit,
-			   const char *name, const char *rname,
 			   int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	/*
@@ -2054,9 +2082,11 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 static int
 mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 		      int new_bit, int excl_bit,
-		      const char *name, const char *rname,
 		      int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2085,9 +2115,11 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 static int
 mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
 			   int new_bit, int excl_bit,
-			   const char *name, const char *rname,
 			   int (*verbose)(struct lock_class *class))
 {
+	const char *name = state_name(new_bit);
+	const char *rname = state_rname(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 #if STRICT_READ_CHECKS
@@ -2113,57 +2145,53 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_HARDIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
 				LOCK_ENABLED_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
 				LOCK_ENABLED_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
 		return mark_lock_irq_used_in(curr, this, new_bit,
 				LOCK_ENABLED_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
 				LOCK_ENABLED_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
 				LOCK_ENABLED_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
 				LOCK_ENABLED_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
 				LOCK_USED_IN_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
 				LOCK_USED_IN_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
 		return mark_lock_irq_enabled(curr, this, new_bit,
 				LOCK_USED_IN_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
 				LOCK_USED_IN_HARDIRQ,
-				"hard", "hard-read", hardirq_verbose);
+				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
 				LOCK_USED_IN_SOFTIRQ,
-				"soft", "soft-read", softirq_verbose);
+				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
 				LOCK_USED_IN_RECLAIM_FS,
-				"reclaim-fs", "reclaim-fs-read",
 				reclaim_verbose);
 
 	default:
-- 
cgit v1.1


From f989209e2f604730888a6daa3b3ff30ed0c9d7c0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:09:59 +0100
Subject: lockdep: further simplify mark_lock_irq() helpers

take away another parameter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 989a60b..306d0b8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2025,14 +2025,35 @@ static inline const char *state_rname(enum lock_usage_bit bit)
 	return state_rnames[bit >> 2];
 }
 
+static int exclusive_bit(int new_bit)
+{
+	/*
+	 * USED_IN
+	 * USED_IN_READ
+	 * ENABLED
+	 * ENABLED_READ
+	 *
+	 * bit 0 - write/read
+	 * bit 1 - used_in/enabled
+	 * bit 2+  state
+	 */
+
+	int state = new_bit & ~3;
+	int dir = new_bit & 2;
+
+	return state | (dir ^ 2);
+}
+
 static int
 mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
-		      int new_bit, int excl_bit,
+		      int new_bit,
 		      int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2059,12 +2080,14 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit, int excl_bit,
+			   int new_bit,
 			   int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	/*
@@ -2081,12 +2104,14 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
-		      int new_bit, int excl_bit,
+		      int new_bit,
 		      int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
@@ -2114,12 +2139,14 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit, int excl_bit,
+			   int new_bit,
 			   int (*verbose)(struct lock_class *class))
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
+	int excl_bit = exclusive_bit(new_bit);
+
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 #if STRICT_READ_CHECKS
@@ -2144,54 +2171,42 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
-				LOCK_ENABLED_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
 		return mark_lock_irq_used_in(curr, this, new_bit,
-				LOCK_ENABLED_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
 		return mark_lock_irq_used_in(curr, this, new_bit,
-				LOCK_ENABLED_RECLAIM_FS,
 				reclaim_verbose);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				LOCK_ENABLED_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				LOCK_ENABLED_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
 		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				LOCK_ENABLED_RECLAIM_FS,
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
-				LOCK_USED_IN_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
 		return mark_lock_irq_enabled(curr, this, new_bit,
-				LOCK_USED_IN_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
 		return mark_lock_irq_enabled(curr, this, new_bit,
-				LOCK_USED_IN_RECLAIM_FS,
 				reclaim_verbose);
 
 	case LOCK_ENABLED_HARDIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				LOCK_USED_IN_HARDIRQ,
 				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				LOCK_USED_IN_SOFTIRQ,
 				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
 		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				LOCK_USED_IN_RECLAIM_FS,
 				reclaim_verbose);
 
 	default:
-- 
cgit v1.1


From cd95302d255264c5e6ebe1063320d80517bf2f83 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:38:21 +0100
Subject: lockdep: simplify mark_lock_irq() helpers #3

Kill another argument

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 65 +++++++++++++++++++++++---------------------------------
 1 file changed, 27 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 306d0b8..e0a027d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1975,7 +1975,7 @@ void print_irqtrace_events(struct task_struct *curr)
 	print_ip_sym(curr->softirq_disable_ip);
 }
 
-static int hardirq_verbose(struct lock_class *class)
+static int HARDIRQ_verbose(struct lock_class *class)
 {
 #if HARDIRQ_VERBOSE
 	return class_filter(class);
@@ -1983,7 +1983,7 @@ static int hardirq_verbose(struct lock_class *class)
 	return 0;
 }
 
-static int softirq_verbose(struct lock_class *class)
+static int SOFTIRQ_verbose(struct lock_class *class)
 {
 #if SOFTIRQ_VERBOSE
 	return class_filter(class);
@@ -1991,7 +1991,7 @@ static int softirq_verbose(struct lock_class *class)
 	return 0;
 }
 
-static int reclaim_verbose(struct lock_class *class)
+static int RECLAIM_FS_verbose(struct lock_class *class)
 {
 #if RECLAIM_VERBOSE
 	return class_filter(class);
@@ -2025,6 +2025,19 @@ static inline const char *state_rname(enum lock_usage_bit bit)
 	return state_rnames[bit >> 2];
 }
 
+static int (*state_verbose_f[])(struct lock_class *class) = {
+#define LOCKDEP_STATE(__STATE) \
+	__STATE##_verbose,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline int state_verbose(enum lock_usage_bit bit,
+				struct lock_class *class)
+{
+	return state_verbose_f[bit >> 2](class);
+}
+
 static int exclusive_bit(int new_bit)
 {
 	/*
@@ -2046,8 +2059,7 @@ static int exclusive_bit(int new_bit)
 
 static int
 mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
-		      int new_bit,
-		      int (*verbose)(struct lock_class *class))
+		      int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2072,7 +2084,7 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 	if (!check_usage_forwards(curr, this, excl_bit + 1, rname))
 		return 0;
 #endif
-	if (verbose(hlock_class(this)))
+	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
 	return 1;
@@ -2080,8 +2092,7 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit,
-			   int (*verbose)(struct lock_class *class))
+			   int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2096,7 +2107,7 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 	 */
 	if (!check_usage_forwards(curr, this, excl_bit, name))
 		return 0;
-	if (verbose(hlock_class(this)))
+	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
 	return 1;
@@ -2104,8 +2115,7 @@ mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
-		      int new_bit,
-		      int (*verbose)(struct lock_class *class))
+		      int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2131,7 +2141,7 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 	if (!check_usage_backwards(curr, this, excl_bit + 1, rname))
 		return 0;
 #endif
-	if (verbose(hlock_class(this)))
+	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
 	return 1;
@@ -2139,8 +2149,7 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 
 static int
 mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit,
-			   int (*verbose)(struct lock_class *class))
+			   int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
@@ -2170,44 +2179,24 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 
 	switch(new_bit) {
 	case LOCK_USED_IN_HARDIRQ:
-		return mark_lock_irq_used_in(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ:
-		return mark_lock_irq_used_in(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS:
-		return mark_lock_irq_used_in(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_used_in(curr, this, new_bit);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_USED_IN_SOFTIRQ_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_used_in_read(curr, this, new_bit);
 
 	case LOCK_ENABLED_HARDIRQ:
-		return mark_lock_irq_enabled(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ:
-		return mark_lock_irq_enabled(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS:
-		return mark_lock_irq_enabled(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_enabled(curr, this, new_bit);
 
 	case LOCK_ENABLED_HARDIRQ_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				hardirq_verbose);
 	case LOCK_ENABLED_SOFTIRQ_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				softirq_verbose);
 	case LOCK_ENABLED_RECLAIM_FS_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit,
-				reclaim_verbose);
+		return mark_lock_irq_enabled_read(curr, this, new_bit);
 
 	default:
 		WARN_ON(1);
-- 
cgit v1.1


From 780e820b2dfefdfead9243724c2d2b73f379fda6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:51:29 +0100
Subject: lockdep: merge the _READ mark_lock_irq() helpers

The _READ helpers show remarkable similarity, merge them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 61 +++++++++++++++++++++-----------------------------------
 1 file changed, 23 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e0a027d..2d95f9d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2091,22 +2091,34 @@ mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
 }
 
 static int
-mark_lock_irq_used_in_read(struct task_struct *curr, struct held_lock *this,
+mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
 			   int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
 	int excl_bit = exclusive_bit(new_bit);
+	int dir = new_bit & 2;
 
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
-	/*
-	 * just marked it hardirq-read-safe, check that this lock
-	 * took no hardirq-unsafe lock in the past:
-	 */
-	if (!check_usage_forwards(curr, this, excl_bit, name))
-		return 0;
+
+	if (!dir) {
+		/*
+		 * just marked it hardirq-read-safe, check that this lock
+		 * took no hardirq-unsafe lock in the past:
+		 */
+		if (!check_usage_forwards(curr, this, excl_bit, name))
+			return 0;
+	} else if (STRICT_READ_CHECKS) {
+		/*
+		 * just marked it hardirq-read-unsafe, check that no
+		 * hardirq-safe lock in the system ever took it in the past:
+		 */
+		if (!check_usage_backwards(curr, this, excl_bit, name))
+			return 0;
+	}
+
 	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
 
@@ -2147,31 +2159,6 @@ mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
 	return 1;
 }
 
-static int
-mark_lock_irq_enabled_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit)
-{
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
-	int excl_bit = exclusive_bit(new_bit);
-
-	if (!valid_state(curr, this, new_bit, excl_bit))
-		return 0;
-#if STRICT_READ_CHECKS
-	/*
-	 * just marked it hardirq-read-unsafe, check that no
-	 * hardirq-safe lock in the system ever took it in the past:
-	 */
-	if (!check_usage_backwards(curr, this, excl_bit, name))
-		return 0;
-#endif
-	if (verbose(hlock_class(this)))
-		return 2;
-
-	return 1;
-}
-
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
@@ -2186,18 +2173,16 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_HARDIRQ_READ:
 	case LOCK_USED_IN_SOFTIRQ_READ:
 	case LOCK_USED_IN_RECLAIM_FS_READ:
-		return mark_lock_irq_used_in_read(curr, this, new_bit);
+	case LOCK_ENABLED_HARDIRQ_READ:
+	case LOCK_ENABLED_SOFTIRQ_READ:
+	case LOCK_ENABLED_RECLAIM_FS_READ:
+		return mark_lock_irq_read(curr, this, new_bit);
 
 	case LOCK_ENABLED_HARDIRQ:
 	case LOCK_ENABLED_SOFTIRQ:
 	case LOCK_ENABLED_RECLAIM_FS:
 		return mark_lock_irq_enabled(curr, this, new_bit);
 
-	case LOCK_ENABLED_HARDIRQ_READ:
-	case LOCK_ENABLED_SOFTIRQ_READ:
-	case LOCK_ENABLED_RECLAIM_FS_READ:
-		return mark_lock_irq_enabled_read(curr, this, new_bit);
-
 	default:
 		WARN_ON(1);
 		break;
-- 
cgit v1.1


From 42c50d544e009cd9b1a8e74d7c5ce8d03ca917ad Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 16:58:16 +0100
Subject: lockdep: merge the !_READ mark_lock_irq() helpers

These two are also remakably similar

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 58 +++++++++++++++-----------------------------------------
 1 file changed, 15 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2d95f9d..1ba52e6 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2057,31 +2057,39 @@ static int exclusive_bit(int new_bit)
 	return state | (dir ^ 2);
 }
 
+typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
+			     enum lock_usage_bit bit, const char *name);
+
 static int
-mark_lock_irq_used_in(struct task_struct *curr, struct held_lock *this,
+mark_lock_irq_write(struct task_struct *curr, struct held_lock *this,
 		      int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
 	int excl_bit = exclusive_bit(new_bit);
+	int dir = new_bit & 2;
+
+	check_usage_f usage = dir ?
+		check_usage_backwards : check_usage_forwards;
 
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 	if (!valid_state(curr, this, new_bit, excl_bit + 1))
 		return 0;
+
 	/*
 	 * just marked it hardirq-safe, check that this lock
 	 * took no hardirq-unsafe lock in the past:
 	 */
-	if (!check_usage_forwards(curr, this, excl_bit, name))
+	if (!usage(curr, this, excl_bit, name))
 		return 0;
 #if STRICT_READ_CHECKS
 	/*
 	 * just marked it hardirq-safe, check that this lock
 	 * took no hardirq-unsafe-read lock in the past:
 	 */
-	if (!check_usage_forwards(curr, this, excl_bit + 1, rname))
+	if (!usage(curr, this, excl_bit + 1, rname))
 		return 0;
 #endif
 	if (state_verbose(new_bit, hlock_class(this)))
@@ -2125,40 +2133,6 @@ mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
 	return 1;
 }
 
-static int
-mark_lock_irq_enabled(struct task_struct *curr, struct held_lock *this,
-		      int new_bit)
-{
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
-	int excl_bit = exclusive_bit(new_bit);
-
-	if (!valid_state(curr, this, new_bit, excl_bit))
-		return 0;
-	if (!valid_state(curr, this, new_bit, excl_bit + 1))
-		return 0;
-	/*
-	 * just marked it hardirq-unsafe, check that no hardirq-safe
-	 * lock in the system ever took it in the past:
-	 */
-	if (!check_usage_backwards(curr, this, excl_bit, name))
-		return 0;
-#if STRICT_READ_CHECKS
-	/*
-	 * just marked it hardirq-unsafe, check that no
-	 * hardirq-safe-read lock in the system ever took
-	 * it in the past:
-	 */
-	if (!check_usage_backwards(curr, this, excl_bit + 1, rname))
-		return 0;
-#endif
-	if (state_verbose(new_bit, hlock_class(this)))
-		return 2;
-
-	return 1;
-}
-
 static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
@@ -2168,7 +2142,10 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_USED_IN_HARDIRQ:
 	case LOCK_USED_IN_SOFTIRQ:
 	case LOCK_USED_IN_RECLAIM_FS:
-		return mark_lock_irq_used_in(curr, this, new_bit);
+	case LOCK_ENABLED_HARDIRQ:
+	case LOCK_ENABLED_SOFTIRQ:
+	case LOCK_ENABLED_RECLAIM_FS:
+		return mark_lock_irq_write(curr, this, new_bit);
 
 	case LOCK_USED_IN_HARDIRQ_READ:
 	case LOCK_USED_IN_SOFTIRQ_READ:
@@ -2178,11 +2155,6 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 	case LOCK_ENABLED_RECLAIM_FS_READ:
 		return mark_lock_irq_read(curr, this, new_bit);
 
-	case LOCK_ENABLED_HARDIRQ:
-	case LOCK_ENABLED_SOFTIRQ:
-	case LOCK_ENABLED_RECLAIM_FS:
-		return mark_lock_irq_enabled(curr, this, new_bit);
-
 	default:
 		WARN_ON(1);
 		break;
-- 
cgit v1.1


From 9d3651a23dc1f7ed7d207f9118459d3a73d485a7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 17:18:32 +0100
Subject: lockdep: fully reduce mark_lock_irq()

Now what its only two functions, they again look rather similar.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 86 ++++++--------------------------------------------------
 1 file changed, 8 insertions(+), 78 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1ba52e6..000d53a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2061,13 +2061,13 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
 			     enum lock_usage_bit bit, const char *name);
 
 static int
-mark_lock_irq_write(struct task_struct *curr, struct held_lock *this,
-		      int new_bit)
+mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 {
 	const char *name = state_name(new_bit);
 	const char *rname = state_rname(new_bit);
 
 	int excl_bit = exclusive_bit(new_bit);
+	int read = new_bit & 1;
 	int dir = new_bit & 2;
 
 	check_usage_f usage = dir ?
@@ -2075,57 +2075,17 @@ mark_lock_irq_write(struct task_struct *curr, struct held_lock *this,
 
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
-	if (!valid_state(curr, this, new_bit, excl_bit + 1))
-		return 0;
 
-	/*
-	 * just marked it hardirq-safe, check that this lock
-	 * took no hardirq-unsafe lock in the past:
-	 */
-	if (!usage(curr, this, excl_bit, name))
-		return 0;
-#if STRICT_READ_CHECKS
-	/*
-	 * just marked it hardirq-safe, check that this lock
-	 * took no hardirq-unsafe-read lock in the past:
-	 */
-	if (!usage(curr, this, excl_bit + 1, rname))
+	if (!read && !valid_state(curr, this, new_bit, excl_bit + 1))
 		return 0;
-#endif
-	if (state_verbose(new_bit, hlock_class(this)))
-		return 2;
-
-	return 1;
-}
-
-static int
-mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
-			   int new_bit)
-{
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
-	int excl_bit = exclusive_bit(new_bit);
-	int dir = new_bit & 2;
 
-	if (!valid_state(curr, this, new_bit, excl_bit))
+	if ((!read || (!dir || STRICT_READ_CHECKS)) &&
+			!usage(curr, this, excl_bit, name))
 		return 0;
 
-	if (!dir) {
-		/*
-		 * just marked it hardirq-read-safe, check that this lock
-		 * took no hardirq-unsafe lock in the past:
-		 */
-		if (!check_usage_forwards(curr, this, excl_bit, name))
-			return 0;
-	} else if (STRICT_READ_CHECKS) {
-		/*
-		 * just marked it hardirq-read-unsafe, check that no
-		 * hardirq-safe lock in the system ever took it in the past:
-		 */
-		if (!check_usage_backwards(curr, this, excl_bit, name))
-			return 0;
-	}
+	if ((!read && STRICT_READ_CHECKS) &&
+			!usage(curr, this, excl_bit + 1, rname))
+		return 0;
 
 	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
@@ -2133,36 +2093,6 @@ mark_lock_irq_read(struct task_struct *curr, struct held_lock *this,
 	return 1;
 }
 
-static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
-		enum lock_usage_bit new_bit)
-{
-	int ret = 1;
-
-	switch(new_bit) {
-	case LOCK_USED_IN_HARDIRQ:
-	case LOCK_USED_IN_SOFTIRQ:
-	case LOCK_USED_IN_RECLAIM_FS:
-	case LOCK_ENABLED_HARDIRQ:
-	case LOCK_ENABLED_SOFTIRQ:
-	case LOCK_ENABLED_RECLAIM_FS:
-		return mark_lock_irq_write(curr, this, new_bit);
-
-	case LOCK_USED_IN_HARDIRQ_READ:
-	case LOCK_USED_IN_SOFTIRQ_READ:
-	case LOCK_USED_IN_RECLAIM_FS_READ:
-	case LOCK_ENABLED_HARDIRQ_READ:
-	case LOCK_ENABLED_SOFTIRQ_READ:
-	case LOCK_ENABLED_RECLAIM_FS_READ:
-		return mark_lock_irq_read(curr, this, new_bit);
-
-	default:
-		WARN_ON(1);
-		break;
-	}
-
-	return ret;
-}
-
 enum mark_type {
 #define LOCKDEP_STATE(__STATE)	__STATE,
 #include "lockdep_states.h"
-- 
cgit v1.1


From cf2ad4d13c4ac6366c730fcf6c6be00db12fb75f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 27 Jan 2009 13:58:08 +0100
Subject: lockdep: remove macro usage from mark_held_locks()

Now that we have nice numerical relations for the states, remove the macro
magics.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 000d53a..f40d916 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2099,14 +2099,6 @@ enum mark_type {
 #undef LOCKDEP_STATE
 };
 
-#define MARK_HELD_CASE(__STATE)						\
-	case __STATE:							\
-		if (hlock->read)					\
-			usage_bit = LOCK_ENABLED_##__STATE##_READ;	\
-		else							\
-			usage_bit = LOCK_ENABLED_##__STATE;		\
-		break;
-
 /*
  * Mark all held locks with a usage bit:
  */
@@ -2120,13 +2112,11 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		hlock = curr->held_locks + i;
 
-		switch (mark) {
-#define LOCKDEP_STATE(__STATE) MARK_HELD_CASE(__STATE)
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-		default:
-			BUG();
-		}
+		usage_bit = 2 + (mark << 2); /* ENABLED */
+		if (hlock->read)
+			usage_bit += 1; /* READ */
+
+		BUG_ON(usage_bit >= LOCK_USAGE_STATES);
 
 		if (!mark_lock(curr, hlock, usage_bit))
 			return 0;
-- 
cgit v1.1


From 38aa2714382d886f77f2565277fce293122808b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 27 Jan 2009 14:53:50 +0100
Subject: lockdep: add comments to mark_lock_irq()

re-add some of the comments that got lost in the refactoring.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f40d916..02e6e06 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2054,6 +2054,9 @@ static int exclusive_bit(int new_bit)
 	int state = new_bit & ~3;
 	int dir = new_bit & 2;
 
+	/*
+	 * keep state, bit flip the direction and strip read.
+	 */
 	return state | (dir ^ 2);
 }
 
@@ -2070,22 +2073,42 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 	int read = new_bit & 1;
 	int dir = new_bit & 2;
 
+	/*
+	 * mark USED_IN has to look forwards -- to ensure no dependency
+	 * has ENABLED state, which would allow recursion deadlocks.
+	 *
+	 * mark ENABLED has to look backwards -- to ensure no dependee
+	 * has USED_IN state, which, again, would allow  recursion deadlocks.
+	 */
 	check_usage_f usage = dir ?
 		check_usage_backwards : check_usage_forwards;
 
+	/*
+	 * Validate that this particular lock does not have conflicting
+	 * usage states.
+	 */
 	if (!valid_state(curr, this, new_bit, excl_bit))
 		return 0;
 
-	if (!read && !valid_state(curr, this, new_bit, excl_bit + 1))
-		return 0;
-
-	if ((!read || (!dir || STRICT_READ_CHECKS)) &&
+	/*
+	 * Validate that the lock dependencies don't have conflicting usage
+	 * states.
+	 */
+	if ((!read || !dir || STRICT_READ_CHECKS) &&
 			!usage(curr, this, excl_bit, name))
 		return 0;
 
-	if ((!read && STRICT_READ_CHECKS) &&
-			!usage(curr, this, excl_bit + 1, rname))
-		return 0;
+	/*
+	 * Check for read in write conflicts
+	 */
+	if (!read) {
+		if (!valid_state(curr, this, new_bit, excl_bit + 1))
+			return 0;
+
+		if (STRICT_READ_CHECKS &&
+				!usage(curr, this, excl_bit + 1, rname))
+			return 0;
+	}
 
 	if (state_verbose(new_bit, hlock_class(this)))
 		return 2;
-- 
cgit v1.1


From 3ff176ca47911630d1555f150d36daa2d0819ea9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 17:40:42 +0100
Subject: lockdep: simplify get_user_chars()

there's too much repetition of code..

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 65 +++++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 02e6e06..1b4ee3c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -467,54 +467,37 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
 	return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
 }
 
-void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
-					char *c4, char *c5, char *c6)
+static inline unsigned long lock_flag(enum lock_usage_bit bit)
 {
-	*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.', *c5 = '.', *c6 = '.';
-
-	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
-		*c1 = '+';
-	else
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
-			*c1 = '-';
-
-	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
-		*c2 = '+';
-	else
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
-			*c2 = '-';
+	return 1UL << bit;
+}
 
-	if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
-		*c3 = '-';
-	if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
-		*c3 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
-			*c3 = '?';
-	}
+static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
+{
+	char c = '.';
 
-	if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
-		*c4 = '-';
-	if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
-		*c4 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
-			*c4 = '?';
+	if (class->usage_mask & lock_flag(bit + 2))
+		c = '+';
+	if (class->usage_mask & lock_flag(bit)) {
+		c = '-';
+		if (class->usage_mask & lock_flag(bit + 2))
+			c = '?';
 	}
 
-	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS)
-		*c5 = '+';
-	else
-		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS)
-			*c5 = '-';
+	return c;
+}
 
-	if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
-		*c6 = '-';
-	if (class->usage_mask & LOCKF_USED_IN_RECLAIM_FS_READ) {
-		*c6 = '+';
-		if (class->usage_mask & LOCKF_ENABLED_RECLAIM_FS_READ)
-			*c6 = '?';
-	}
+void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
+					char *c4, char *c5, char *c6)
+{
+	*c1 = get_usage_char(class, LOCK_USED_IN_HARDIRQ);
+	*c2 = get_usage_char(class, LOCK_USED_IN_SOFTITQ);
+	*c3 = get_usage_char(class, LOCK_USED_IN_HARDIRQ_READ);
+	*c4 = get_usage_char(class, LOCK_USED_IN_SOFTITQ_READ);
 
+	*c5 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS);
+	*c6 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS_READ);
 }
 
 static void print_lock_name(struct lock_class *class)
-- 
cgit v1.1


From f510b233cfc7bfd57b6007071c52aa42e3d16b06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 17:53:47 +0100
Subject: lockdep: get_user_chars() redo

Generic, states independent, get_user_chars().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 24 ++++++++++++------------
 kernel/lockdep_internals.h |  7 ++++---
 kernel/lockdep_proc.c      |  6 +++---
 3 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1b4ee3c..22ced8d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -487,25 +487,25 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
 	return c;
 }
 
-void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
-					char *c4, char *c5, char *c6)
+void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
 {
-	*c1 = get_usage_char(class, LOCK_USED_IN_HARDIRQ);
-	*c2 = get_usage_char(class, LOCK_USED_IN_SOFTITQ);
-	*c3 = get_usage_char(class, LOCK_USED_IN_HARDIRQ_READ);
-	*c4 = get_usage_char(class, LOCK_USED_IN_SOFTITQ_READ);
+	int i = 0;
 
-	*c5 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS);
-	*c6 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS_READ);
+#define LOCKDEP_STATE(__STATE) 						\
+	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE);	\
+	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+	usage[i] = '\0';
 }
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6;
+	char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
 	const char *name;
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
+	get_usage_chars(class, usage);
 
 	name = class->name;
 	if (!name) {
@@ -518,7 +518,7 @@ static void print_lock_name(struct lock_class *class)
 		if (class->subclass)
 			printk("/%d", class->subclass);
 	}
-	printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6);
+	printk("){%s}", usage);
 }
 
 static void print_lockdep_cache(struct lockdep_map *lock)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 7e653e6..a2cc7e9 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -70,9 +70,10 @@ enum {
 extern struct list_head all_lock_classes;
 extern struct lock_chain lock_chains[];
 
-extern void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3,
-					char *c4, char *c5, char *c6);
+#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+
+extern void get_usage_chars(struct lock_class *class,
+			    char usage[LOCK_USAGE_CHARS]);
 
 extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
 
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index bd474fd..b51064c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
 {
 	struct lock_class *class = v;
 	struct lock_list *entry;
-	char c1, c2, c3, c4, c5, c6;
+	char usage[LOCK_USAGE_CHARS];
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
 	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
 #endif
 
-	get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6);
-	seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6);
+	get_usage_chars(class, usage);
+	seq_printf(m, " %s", usage);
 
 	seq_printf(m, ": ");
 	print_name(m, class);
-- 
cgit v1.1


From 4f367d8adca947bed4385740a13d1efb1a06fba1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Jan 2009 18:10:42 +0100
Subject: lockdep: simplify check_prev_add_irq()

Remove the manual state iteration thingy.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 154 ++++++++++++++++++++++---------------------------------
 1 file changed, 61 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 22ced8d..42dfc28 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1268,68 +1268,84 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 			bit_backwards, bit_forwards, irqclass);
 }
 
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
-		struct held_lock *next)
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+	STR(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
 {
-	/*
-	 * Prove that the new dependency does not connect a hardirq-safe
-	 * lock with a hardirq-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
-	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
-					LOCK_ENABLED_HARDIRQ, "hard"))
-		return 0;
+	return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+}
 
+static int exclusive_bit(int new_bit)
+{
 	/*
-	 * Prove that the new dependency does not connect a hardirq-safe-read
-	 * lock with a hardirq-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
+	 * USED_IN
+	 * USED_IN_READ
+	 * ENABLED
+	 * ENABLED_READ
+	 *
+	 * bit 0 - write/read
+	 * bit 1 - used_in/enabled
+	 * bit 2+  state
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
-					LOCK_ENABLED_HARDIRQ, "hard-read"))
-		return 0;
+
+	int state = new_bit & ~3;
+	int dir = new_bit & 2;
 
 	/*
-	 * Prove that the new dependency does not connect a softirq-safe
-	 * lock with a softirq-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
+	 * keep state, bit flip the direction and strip read.
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
-					LOCK_ENABLED_SOFTIRQ, "soft"))
-		return 0;
+	return state | (dir ^ 2);
+}
+
+static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
+			   struct held_lock *next, enum lock_usage_bit bit)
+{
 	/*
-	 * Prove that the new dependency does not connect a softirq-safe-read
-	 * lock with a softirq-unsafe lock - to achieve this we search
+	 * Prove that the new dependency does not connect a hardirq-safe
+	 * lock with a hardirq-unsafe lock - to achieve this we search
 	 * the backwards-subgraph starting at <prev>, and the
 	 * forwards-subgraph starting at <next>:
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
-					LOCK_ENABLED_SOFTIRQ, "soft"))
+	if (!check_usage(curr, prev, next, bit,
+			   exclusive_bit(bit), state_name(bit)))
 		return 0;
 
+	bit++; /* _READ */
+
 	/*
-	 * Prove that the new dependency does not connect a reclaim-fs-safe
-	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
+	 * Prove that the new dependency does not connect a hardirq-safe-read
+	 * lock with a hardirq-unsafe lock - to achieve this we search
 	 * the backwards-subgraph starting at <prev>, and the
 	 * forwards-subgraph starting at <next>:
 	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS,
-					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs"))
+	if (!check_usage(curr, prev, next, bit,
+			   exclusive_bit(bit), state_name(bit)))
 		return 0;
 
-	/*
-	 * Prove that the new dependency does not connect a reclaim-fs-safe-read
-	 * lock with a reclaim-fs-unsafe lock - to achieve this we search
-	 * the backwards-subgraph starting at <prev>, and the
-	 * forwards-subgraph starting at <next>:
-	 */
-	if (!check_usage(curr, prev, next, LOCK_USED_IN_RECLAIM_FS_READ,
-					LOCK_ENABLED_RECLAIM_FS, "reclaim-fs-read"))
+	return 1;
+}
+
+static int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+		struct held_lock *next)
+{
+#define LOCKDEP_STATE(__STATE)						\
+	if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE))	\
 		return 0;
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
 
 	return 1;
 }
@@ -1984,30 +2000,6 @@ static int RECLAIM_FS_verbose(struct lock_class *class)
 
 #define STRICT_READ_CHECKS	1
 
-static const char *state_names[] = {
-#define LOCKDEP_STATE(__STATE) \
-	STR(__STATE),
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
-static inline const char *state_name(enum lock_usage_bit bit)
-{
-	return state_names[bit >> 2];
-}
-
-static const char *state_rnames[] = {
-#define LOCKDEP_STATE(__STATE) \
-	STR(__STATE)"-READ",
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
-static inline const char *state_rname(enum lock_usage_bit bit)
-{
-	return state_rnames[bit >> 2];
-}
-
 static int (*state_verbose_f[])(struct lock_class *class) = {
 #define LOCKDEP_STATE(__STATE) \
 	__STATE##_verbose,
@@ -2021,37 +2013,12 @@ static inline int state_verbose(enum lock_usage_bit bit,
 	return state_verbose_f[bit >> 2](class);
 }
 
-static int exclusive_bit(int new_bit)
-{
-	/*
-	 * USED_IN
-	 * USED_IN_READ
-	 * ENABLED
-	 * ENABLED_READ
-	 *
-	 * bit 0 - write/read
-	 * bit 1 - used_in/enabled
-	 * bit 2+  state
-	 */
-
-	int state = new_bit & ~3;
-	int dir = new_bit & 2;
-
-	/*
-	 * keep state, bit flip the direction and strip read.
-	 */
-	return state | (dir ^ 2);
-}
-
 typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
 			     enum lock_usage_bit bit, const char *name);
 
 static int
 mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 {
-	const char *name = state_name(new_bit);
-	const char *rname = state_rname(new_bit);
-
 	int excl_bit = exclusive_bit(new_bit);
 	int read = new_bit & 1;
 	int dir = new_bit & 2;
@@ -2078,7 +2045,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 	 * states.
 	 */
 	if ((!read || !dir || STRICT_READ_CHECKS) &&
-			!usage(curr, this, excl_bit, name))
+			!usage(curr, this, excl_bit, state_name(new_bit)))
 		return 0;
 
 	/*
@@ -2089,7 +2056,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 			return 0;
 
 		if (STRICT_READ_CHECKS &&
-				!usage(curr, this, excl_bit + 1, rname))
+			!usage(curr, this, excl_bit + 1,
+				state_name(new_bit + 1)))
 			return 0;
 	}
 
-- 
cgit v1.1


From b4b136f44b3b7adb9265fd5566d0ea9b99b1cd5f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 29 Jan 2009 14:50:36 +0100
Subject: lockdep: use stringify.h

Arnd pointed out we have the stringify macro magic already in-kernel.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42dfc28..fc84a30 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -41,6 +41,7 @@
 #include <linux/utsname.h>
 #include <linux/hash.h>
 #include <linux/ftrace.h>
+#include <linux/stringify.h>
 
 #include <asm/sections.h>
 
@@ -445,14 +446,11 @@ atomic_t nr_find_usage_backwards_recursions;
  * Locking printouts:
  */
 
-#define __STR(foo)	#foo
-#define STR(foo)	__STR(foo)
-
 #define __USAGE(__STATE)						\
-	[LOCK_USED_IN_##__STATE] = "IN-"STR(__STATE)"-W",		\
-	[LOCK_ENABLED_##__STATE] = STR(__STATE)"-ON-W",			\
-	[LOCK_USED_IN_##__STATE##_READ] = "IN-"STR(__STATE)"-R",	\
-	[LOCK_ENABLED_##__STATE##_READ] = STR(__STATE)"-ON-R",
+	[LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W",	\
+	[LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W",		\
+	[LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
+	[LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
 
 static const char *usage_str[] =
 {
@@ -1270,14 +1268,14 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 
 static const char *state_names[] = {
 #define LOCKDEP_STATE(__STATE) \
-	STR(__STATE),
+	__stringify(__STATE),
 #include "lockdep_states.h"
 #undef LOCKDEP_STATE
 };
 
 static const char *state_rnames[] = {
 #define LOCKDEP_STATE(__STATE) \
-	STR(__STATE)"-READ",
+	__stringify(__STATE)"-READ",
 #include "lockdep_states.h"
 #undef LOCKDEP_STATE
 };
-- 
cgit v1.1


From 9833f8cb952b9aa3f98a71e7bef8820cee3261a0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 14 Feb 2009 16:59:04 +0100
Subject: lockstat: warn about disabled lock debugging

Avoid confusion and clearly state lock debugging got disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b51064c..d7135aa 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 static void seq_header(struct seq_file *m)
 {
 	seq_printf(m, "lock_stat version 0.3\n");
+
+	if (unlikely(!debug_locks))
+		seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
+
 	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
 	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
 			"%14s %14s\n",
-- 
cgit v1.1


From 868a23a8043f2a3042dae60105c89bd4680187ba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 15 Feb 2009 00:25:21 +0100
Subject: lockdep: build fix for !PROVE_LOCKING

The __GFP_FS annotations fail to build with CONFIG_LOCKDEP=y,
CONFIG_PROVE_LOCKING=n, ammend that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index fc84a30..022d2ed 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2396,6 +2396,10 @@ static inline int separate_irq_context(struct task_struct *curr,
 	return 0;
 }
 
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+}
+
 #endif
 
 /*
-- 
cgit v1.1


From a234aa9ecdf47a5461573a21dc0b154278df5ba8 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Sat, 14 Feb 2009 09:36:00 +0600
Subject: tracing: fix section mismatch in trace_hw_branches.c

The function bts_trace_init() references a variable
bts_hotcpu_notifier which is marked
as __cpuinitdata. Thus causes section mismatch. This patch fixes it.

   LD      kernel/trace/built-in.o
 WARNING: kernel/trace/built-in.o(.text+0xc90c): Section mismatch in
 reference from the function bts_trace_init() to the variable
 .cpuinit.data:bts_hotcpu_notifier
 The function bts_trace_init() references
 the variable __cpuinitdata bts_hotcpu_notifier.
 This is often because bts_trace_init lacks a __cpuinitdata
 annotation or the annotation of bts_hotcpu_notifier is wrong.

 WARNING: kernel/trace/built-in.o(.text+0xc92a): Section mismatch in
 reference from the function bts_trace_reset() to the variable
 .cpuinit.data:bts_hotcpu_notifier
 The function bts_trace_reset() references
 the variable __cpuinitdata bts_hotcpu_notifier.
 This is often because bts_trace_reset lacks a __cpuinitdata
 annotation or the annotation of bts_hotcpu_notifier is wrong.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: markus.t.metzger@gmail.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 0794dd3..3561aac 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -127,7 +127,7 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int bts_trace_init(struct trace_array *tr)
+static int __cpuinit bts_trace_init(struct trace_array *tr)
 {
 	hw_branch_trace = tr;
 
@@ -137,7 +137,7 @@ static int bts_trace_init(struct trace_array *tr)
 	return 0;
 }
 
-static void bts_trace_reset(struct trace_array *tr)
+static void __cpuinit bts_trace_reset(struct trace_array *tr)
 {
 	bts_trace_stop(tr);
 	unregister_hotcpu_notifier(&bts_hotcpu_notifier);
-- 
cgit v1.1


From 0c75a3ed633419d75d823d5dcb05d42924c6ae61 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 11:21:52 -0500
Subject: ftrace: state that all functions are enabled in set_ftrace_filter

Impact: clean up, make set_ftrace_filter less confusing

The set_ftrace_filter shows only the functions that will be traced.
But when it is empty, it will trace all functions. This can be a bit
confusing.

This patch makes set_ftrace_filter show:

  #### all functions enabled ####

When all functions will be traced, and we do not filter only a select
few.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1796e01..369fb78 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -773,6 +773,7 @@ enum {
 	FTRACE_ITER_CONT	= (1 << 1),
 	FTRACE_ITER_NOTRACE	= (1 << 2),
 	FTRACE_ITER_FAILURES	= (1 << 3),
+	FTRACE_ITER_PRINTALL	= (1 << 4),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -794,6 +795,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 	(*pos)++;
 
+	if (iter->flags & FTRACE_ITER_PRINTALL)
+		return NULL;
+
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
  retry:
@@ -834,6 +838,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
 
+	/*
+	 * For set_ftrace_filter reading, if we have the filter
+	 * off, we can short cut and just print out that all
+	 * functions are enabled.
+	 */
+	if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
+		if (*pos > 0)
+			return NULL;
+		iter->flags |= FTRACE_ITER_PRINTALL;
+		(*pos)++;
+		return iter;
+	}
+
 	if (*pos > 0) {
 		if (iter->idx < 0)
 			return p;
@@ -852,9 +869,15 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
+	struct ftrace_iterator *iter = m->private;
 	struct dyn_ftrace *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
+	if (iter->flags & FTRACE_ITER_PRINTALL) {
+		seq_printf(m, "#### all functions enabled ####\n");
+		return 0;
+	}
+
 	if (!rec)
 		return 0;
 
-- 
cgit v1.1


From 265c831cb03d533cbe159af45798ac9fef534260 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 12:43:56 -0500
Subject: ftrace: add do_for_each_ftrace_rec and while_for_each_ftrace_rec

Impact: clean up

To iterate over all the functions that dynamic trace knows about
it requires two for loops. One to iterate over the pages and the
other to iterate over the records within the page.

There are several duplications of these loops in ftrace.c. This
patch creates the macros do_for_each_ftrace_rec and
while_for_each_ftrace_rec to handle this logic, and removes the
duplicate code.

While making this change, I also discovered and fixed a small
bug that one of the iterations should exit the loop after it found the
record it was searching for. This used a break when it should have
used a goto, since there were two loops it needed to break out
from.  No real harm was done by this bug since it would only continue
to search the other records, and the code was in a slow path anyway.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 208 ++++++++++++++++++++++++--------------------------
 1 file changed, 101 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 369fb78..fed1ebc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -297,6 +297,19 @@ static struct ftrace_page	*ftrace_pages;
 
 static struct dyn_ftrace *ftrace_free_records;
 
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec)					\
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
+		int _____i;						\
+		for (_____i = 0; _____i < pg->index; _____i++) {	\
+			rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec()		\
+		}				\
+	}
 
 #ifdef CONFIG_KPROBES
 
@@ -341,7 +354,6 @@ void ftrace_release(void *start, unsigned long size)
 	struct ftrace_page *pg;
 	unsigned long s = (unsigned long)start;
 	unsigned long e = s + size;
-	int i;
 
 	if (ftrace_disabled || !start)
 		return;
@@ -349,14 +361,11 @@ void ftrace_release(void *start, unsigned long size)
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
+	do_for_each_ftrace_rec(pg, rec) {
+		if ((rec->ip >= s) && (rec->ip < e))
+			ftrace_free_rec(rec);
+	} while_for_each_ftrace_rec();
 
-			if ((rec->ip >= s) && (rec->ip < e))
-				ftrace_free_rec(rec);
-		}
-	}
 	spin_unlock(&ftrace_lock);
 }
 
@@ -523,41 +532,37 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 static void ftrace_replace_code(int enable)
 {
-	int i, failed;
+	int failed;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
-
-			/*
-			 * Skip over free records and records that have
-			 * failed.
-			 */
-			if (rec->flags & FTRACE_FL_FREE ||
-			    rec->flags & FTRACE_FL_FAILED)
-				continue;
-
-			/* ignore updates to this record's mcount site */
-			if (get_kprobe((void *)rec->ip)) {
-				freeze_record(rec);
-				continue;
-			} else {
-				unfreeze_record(rec);
-			}
+	do_for_each_ftrace_rec(pg, rec) {
+		/*
+		 * Skip over free records and records that have
+		 * failed.
+		 */
+		if (rec->flags & FTRACE_FL_FREE ||
+		    rec->flags & FTRACE_FL_FAILED)
+			continue;
 
-			failed = __ftrace_replace_code(rec, enable);
-			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
-				rec->flags |= FTRACE_FL_FAILED;
-				if ((system_state == SYSTEM_BOOTING) ||
-				    !core_kernel_text(rec->ip)) {
-					ftrace_free_rec(rec);
-				} else
-					ftrace_bug(failed, rec->ip);
-			}
+		/* ignore updates to this record's mcount site */
+		if (get_kprobe((void *)rec->ip)) {
+			freeze_record(rec);
+			continue;
+		} else {
+			unfreeze_record(rec);
 		}
-	}
+
+		failed = __ftrace_replace_code(rec, enable);
+		if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+			rec->flags |= FTRACE_FL_FAILED;
+			if ((system_state == SYSTEM_BOOTING) ||
+			    !core_kernel_text(rec->ip)) {
+				ftrace_free_rec(rec);
+			} else
+				ftrace_bug(failed, rec->ip);
+		}
+	} while_for_each_ftrace_rec();
 }
 
 static int
@@ -956,22 +961,17 @@ static void ftrace_filter_reset(int enable)
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned i;
 
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 0;
-	pg = ftrace_pages_start;
-	while (pg) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
-			if (rec->flags & FTRACE_FL_FAILED)
-				continue;
-			rec->flags &= ~type;
-		}
-		pg = pg->next;
-	}
+	do_for_each_ftrace_rec(pg, rec) {
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+		rec->flags &= ~type;
+	} while_for_each_ftrace_rec();
+
 	spin_unlock(&ftrace_lock);
 }
 
@@ -1094,44 +1094,39 @@ ftrace_match(unsigned char *buff, int len, int enable)
 	spin_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 1;
-	pg = ftrace_pages_start;
-	while (pg) {
-		for (i = 0; i < pg->index; i++) {
-			int matched = 0;
-			char *ptr;
-
-			rec = &pg->records[i];
-			if (rec->flags & FTRACE_FL_FAILED)
-				continue;
-			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-			switch (type) {
-			case MATCH_FULL:
-				if (strcmp(str, buff) == 0)
-					matched = 1;
-				break;
-			case MATCH_FRONT_ONLY:
-				if (memcmp(str, buff, match) == 0)
-					matched = 1;
-				break;
-			case MATCH_MIDDLE_ONLY:
-				if (strstr(str, search))
-					matched = 1;
-				break;
-			case MATCH_END_ONLY:
-				ptr = strstr(str, search);
-				if (ptr && (ptr[search_len] == 0))
-					matched = 1;
-				break;
-			}
-			if (matched) {
-				if (not)
-					rec->flags &= ~flag;
-				else
-					rec->flags |= flag;
-			}
+	do_for_each_ftrace_rec(pg, rec) {
+		int matched = 0;
+		char *ptr;
+
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+		switch (type) {
+		case MATCH_FULL:
+			if (strcmp(str, buff) == 0)
+				matched = 1;
+			break;
+		case MATCH_FRONT_ONLY:
+			if (memcmp(str, buff, match) == 0)
+				matched = 1;
+			break;
+		case MATCH_MIDDLE_ONLY:
+			if (strstr(str, search))
+				matched = 1;
+			break;
+		case MATCH_END_ONLY:
+			ptr = strstr(str, search);
+			if (ptr && (ptr[search_len] == 0))
+				matched = 1;
+			break;
 		}
-		pg = pg->next;
-	}
+		if (matched) {
+			if (not)
+				rec->flags &= ~flag;
+			else
+				rec->flags |= flag;
+		}
+	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
 }
 
@@ -1452,7 +1447,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 	int found = 0;
-	int i, j;
+	int j;
 
 	if (ftrace_disabled)
 		return -ENODEV;
@@ -1460,27 +1455,26 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
-
-			if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
-				continue;
-
-			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-			if (strcmp(str, buffer) == 0) {
-				found = 1;
-				for (j = 0; j < idx; j++)
-					if (array[j] == rec->ip) {
-						found = 0;
-						break;
-					}
-				if (found)
-					array[idx] = rec->ip;
-				break;
-			}
+	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+			continue;
+
+		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+		if (strcmp(str, buffer) == 0) {
+			/* Return 1 if we add it to the array */
+			found = 1;
+			for (j = 0; j < idx; j++)
+				if (array[j] == rec->ip) {
+					found = 0;
+					break;
+				}
+			if (found)
+				array[idx] = rec->ip;
+			goto out;
 		}
-	}
+	} while_for_each_ftrace_rec();
+ out:
 	spin_unlock(&ftrace_lock);
 
 	return found ? 0 : -EINVAL;
-- 
cgit v1.1


From 7f24b31b01a271b62346d9df084b029e48612163 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 14:37:33 -0500
Subject: ftrace: rename ftrace_match to ftrace_match_records

Impact: clean up

ftrace_match is too generic of a name. What it really does is
search all records and matches the records with the given string,
and either sets or unsets the functions to be traced depending
on if the parameter 'enable' is set or not.

This allows us to make another function called ftrace_match that
can be used to test a single record.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fed1ebc..f397d7a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1054,7 +1054,7 @@ enum {
 };
 
 static void
-ftrace_match(unsigned char *buff, int len, int enable)
+ftrace_match_records(unsigned char *buff, int len, int enable)
 {
 	char str[KSYM_SYMBOL_LEN];
 	char *search = NULL;
@@ -1197,7 +1197,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	if (isspace(ch)) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
 		iter->buffer_idx = 0;
 	} else
 		iter->flags |= FTRACE_ITER_CONT;
@@ -1236,7 +1236,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
 	if (reset)
 		ftrace_filter_reset(enable);
 	if (buf)
-		ftrace_match(buf, len, enable);
+		ftrace_match_records(buf, len, enable);
 	mutex_unlock(&ftrace_regex_lock);
 }
 
@@ -1286,7 +1286,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 	if (iter->buffer_idx) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
 	}
 
 	mutex_lock(&ftrace_sysctl_lock);
-- 
cgit v1.1


From 9f4801e30ad291e27284e873696da1ead92d68fa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 15:56:43 -0500
Subject: ftrace: break up ftrace_match_records into smaller components

Impact: clean up

ftrace_match_records does a lot of things that other features
can use. This patch breaks up ftrace_match_records and pulls
out ftrace_setup_glob and ftrace_match_record.

ftrace_setup_glob prepares a simple glob expression for use with
ftrace_match_record. ftrace_match_record compares a single record
with a glob type.

Breaking this up will allow for more features to run on individual
records.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 115 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 75 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f397d7a..fcec313 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1053,79 +1053,114 @@ enum {
 	MATCH_END_ONLY,
 };
 
-static void
-ftrace_match_records(unsigned char *buff, int len, int enable)
+/*
+ * (static function - no need for kernel doc)
+ *
+ * Pass in a buffer containing a glob and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ *  search returns the pointer to use for comparison.
+ *  not returns 1 if buff started with a '!'
+ *     0 otherwise.
+ */
+static int
+ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not)
 {
-	char str[KSYM_SYMBOL_LEN];
-	char *search = NULL;
-	struct ftrace_page *pg;
-	struct dyn_ftrace *rec;
 	int type = MATCH_FULL;
-	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned i, match = 0, search_len = 0;
-	int not = 0;
+	int i;
 
 	if (buff[0] == '!') {
-		not = 1;
+		*not = 1;
 		buff++;
 		len--;
-	}
+	} else
+		*not = 0;
+
+	*search = buff;
 
 	for (i = 0; i < len; i++) {
 		if (buff[i] == '*') {
 			if (!i) {
-				search = buff + i + 1;
+				*search = buff + 1;
 				type = MATCH_END_ONLY;
-				search_len = len - (i + 1);
 			} else {
-				if (type == MATCH_END_ONLY) {
+				if (type == MATCH_END_ONLY)
 					type = MATCH_MIDDLE_ONLY;
-				} else {
-					match = i;
+				else
 					type = MATCH_FRONT_ONLY;
-				}
 				buff[i] = 0;
 				break;
 			}
 		}
 	}
 
+	return type;
+}
+
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+{
+	char str[KSYM_SYMBOL_LEN];
+	int matched = 0;
+	char *ptr;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	switch (type) {
+	case MATCH_FULL:
+		if (strcmp(str, regex) == 0)
+			matched = 1;
+		break;
+	case MATCH_FRONT_ONLY:
+		if (strncmp(str, regex, len) == 0)
+			matched = 1;
+		break;
+	case MATCH_MIDDLE_ONLY:
+		if (strstr(str, regex))
+			matched = 1;
+		break;
+	case MATCH_END_ONLY:
+		ptr = strstr(str, regex);
+		if (ptr && (ptr[len] == 0))
+			matched = 1;
+		break;
+	}
+
+	return matched;
+}
+
+static void ftrace_match_records(char *buff, int len, int enable)
+{
+	char *search;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned search_len;
+	int not;
+
+	type = ftrace_setup_glob(buff, len, &search, &not);
+
+	search_len = strlen(search);
+
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 1;
 	do_for_each_ftrace_rec(pg, rec) {
-		int matched = 0;
-		char *ptr;
 
 		if (rec->flags & FTRACE_FL_FAILED)
 			continue;
-		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-		switch (type) {
-		case MATCH_FULL:
-			if (strcmp(str, buff) == 0)
-				matched = 1;
-			break;
-		case MATCH_FRONT_ONLY:
-			if (memcmp(str, buff, match) == 0)
-				matched = 1;
-			break;
-		case MATCH_MIDDLE_ONLY:
-			if (strstr(str, search))
-				matched = 1;
-			break;
-		case MATCH_END_ONLY:
-			ptr = strstr(str, search);
-			if (ptr && (ptr[search_len] == 0))
-				matched = 1;
-			break;
-		}
-		if (matched) {
+
+		if (ftrace_match_record(rec, search, search_len, type)) {
 			if (not)
 				rec->flags &= ~flag;
 			else
 				rec->flags |= flag;
 		}
+
 	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
 }
-- 
cgit v1.1


From 64e7c440618998fd69eee6ab490b042d12248021 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 17:08:48 -0500
Subject: ftrace: add module command function filter selection

This patch adds a "command" syntax to the function filtering files:

  /debugfs/tracing/set_ftrace_filter
  /debugfs/tracing/set_ftrace_notrace

Of the format:  <function>:<command>:<parameter>

The command is optional, and dependent on the command, so are
the parameters.

 echo do_fork > set_ftrace_filter

Will only trace 'do_fork'.

 echo 'sched_*' > set_ftrace_filter

Will only trace functions starting with the letters 'sched_'.

 echo '*:mod:ext3' > set_ftrace_filter

Will trace only the ext3 module functions.

 echo '*write*:mod:ext3' > set_ftrace_notrace

Will prevent the ext3 functions with the letters 'write' in
the name from being traced.

 echo '!*_allocate:mod:ext3' > set_ftrace_filter

Will remove the functions in ext3 that end with the letters
'_allocate' from the ftrace filter.

Although this patch implements the 'command' format, only the
'mod' command is supported. More commands to follow.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 109 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fcec313..9e60ae4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1067,7 +1067,7 @@ enum {
  *     0 otherwise.
  */
 static int
-ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not)
+ftrace_setup_glob(char *buff, int len, char **search, int *not)
 {
 	int type = MATCH_FULL;
 	int i;
@@ -1100,14 +1100,11 @@ ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not)
 	return type;
 }
 
-static int
-ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+static int ftrace_match(char *str, char *regex, int len, int type)
 {
-	char str[KSYM_SYMBOL_LEN];
 	int matched = 0;
 	char *ptr;
 
-	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 	switch (type) {
 	case MATCH_FULL:
 		if (strcmp(str, regex) == 0)
@@ -1131,6 +1128,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
 	return matched;
 }
 
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+{
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	return ftrace_match(str, regex, len, type);
+}
+
 static void ftrace_match_records(char *buff, int len, int enable)
 {
 	char *search;
@@ -1165,6 +1171,100 @@ static void ftrace_match_records(char *buff, int len, int enable)
 	spin_unlock(&ftrace_lock);
 }
 
+static int
+ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
+			   char *regex, int len, int type)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+
+	if (!modname || strcmp(modname, mod))
+		return 0;
+
+	/* blank search means to match all funcs in the mod */
+	if (len)
+		return ftrace_match(str, regex, len, type);
+	else
+		return 1;
+}
+
+static void ftrace_match_module_records(char *buff, char *mod, int enable)
+{
+	char *search = buff;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type = MATCH_FULL;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned search_len = 0;
+	int not = 0;
+
+	/* blank or '*' mean the same */
+	if (strcmp(buff, "*") == 0)
+		buff[0] = 0;
+
+	/* handle the case of 'dont filter this module' */
+	if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
+		buff[0] = 0;
+		not = 1;
+	}
+
+	if (strlen(buff)) {
+		type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+		search_len = strlen(search);
+	}
+
+	/* should not be called from interrupt context */
+	spin_lock(&ftrace_lock);
+	if (enable)
+		ftrace_filtered = 1;
+
+	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+
+		if (ftrace_match_module_record(rec, mod,
+					       search, search_len, type)) {
+			if (not)
+				rec->flags &= ~flag;
+			else
+				rec->flags |= flag;
+		}
+
+	} while_for_each_ftrace_rec();
+	spin_unlock(&ftrace_lock);
+}
+
+static int ftrace_process_regex(char *buff, int len, int enable)
+{
+	char *func, *mod, *command, *next = buff;
+
+	func = strsep(&next, ":");
+
+	if (!next) {
+		ftrace_match_records(func, len, enable);
+		return 0;
+	}
+
+	/* command fonud */
+
+	command = strsep(&next, ":");
+
+	if (strcmp(command, "mod") == 0) {
+		/* only match modules */
+		if (!next)
+			return -EINVAL;
+
+		mod = strsep(&next, ":");
+		ftrace_match_module_records(func, mod, enable);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static ssize_t
 ftrace_regex_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos, int enable)
@@ -1232,7 +1332,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	if (isspace(ch)) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
+		ret = ftrace_process_regex(iter->buffer,
+					   iter->buffer_idx, enable);
+		if (ret)
+			goto out;
 		iter->buffer_idx = 0;
 	} else
 		iter->flags |= FTRACE_ITER_CONT;
-- 
cgit v1.1


From e68746a271eb3393a2183840be9e903caddf765b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 20:53:42 -0500
Subject: ftrace: enable filtering only when a function is filtered on

Impact: fix to prevent empty set_ftrace_filter and no ftrace output

The function filter is used to only trace a given set of functions.
The filter is enabled when a function name is echoed into the
set_ftrace_filter file. But if the name has a typo and the function
is not found, the filter is enabled, but no function is listed.

This makes a confusing situation where set_ftrace_filter is empty
but no functions ever get enabled for tracing.

For example:

 # cat /debug/tracing/set_ftrace_filter

  #### all functions enabled ####

 # echo bad_name > set_ftrace_filter
 # cat /debug/tracing/set_ftrace_filter

 # echo function > current_tracer
 # cat trace

  # tracer: nop
  #
  #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
  #              | |       |          |         |

This patch changes that to only enable filtering if a function
is set to be filtered on. Now, the filter is not enabled if
a bad name is echoed into set_ftrace_filter.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e60ae4..340f88b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1153,8 +1153,6 @@ static void ftrace_match_records(char *buff, int len, int enable)
 
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
-	if (enable)
-		ftrace_filtered = 1;
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1166,7 +1164,12 @@ static void ftrace_match_records(char *buff, int len, int enable)
 			else
 				rec->flags |= flag;
 		}
-
+		/*
+		 * Only enable filtering if we have a function that
+		 * is filtered on.
+		 */
+		if (enable && (rec->flags & FTRACE_FL_FILTER))
+			ftrace_filtered = 1;
 	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
 }
@@ -1217,9 +1220,6 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
-	if (enable)
-		ftrace_filtered = 1;
-
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1232,6 +1232,8 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 			else
 				rec->flags |= flag;
 		}
+		if (enable && (rec->flags & FTRACE_FL_FILTER))
+			ftrace_filtered = 1;
 
 	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
-- 
cgit v1.1


From f6180773d90595650e11de0118bb112018290915 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 00:40:25 -0500
Subject: ftrace: add command interface for function selection

Allow for other tracers to add their own commands for function
selection. This interface gives a trace the ability to name a
command for function selection. Right now it is pretty limited
in what it offers, but this is a building step for more features.

The :mod: command is converted to this interface and also serves
as a template for other implementations.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 106 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 95 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 340f88b..45a44c4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1239,9 +1239,93 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 	spin_unlock(&ftrace_lock);
 }
 
+/*
+ * We register the module command as a template to show others how
+ * to register the a command as well.
+ */
+
+static int
+ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
+{
+	char *mod;
+
+	/*
+	 * cmd == 'mod' because we only registered this func
+	 * for the 'mod' ftrace_func_command.
+	 * But if you register one func with multiple commands,
+	 * you can tell which command was used by the cmd
+	 * parameter.
+	 */
+
+	/* we must have a module name */
+	if (!param)
+		return -EINVAL;
+
+	mod = strsep(&param, ":");
+	if (!strlen(mod))
+		return -EINVAL;
+
+	ftrace_match_module_records(func, mod, enable);
+	return 0;
+}
+
+static struct ftrace_func_command ftrace_mod_cmd = {
+	.name			= "mod",
+	.func			= ftrace_mod_callback,
+};
+
+static int __init ftrace_mod_cmd_init(void)
+{
+	return register_ftrace_command(&ftrace_mod_cmd);
+}
+device_initcall(ftrace_mod_cmd_init);
+
+static LIST_HEAD(ftrace_commands);
+static DEFINE_MUTEX(ftrace_cmd_mutex);
+
+int register_ftrace_command(struct ftrace_func_command *cmd)
+{
+	struct ftrace_func_command *p;
+	int ret = 0;
+
+	mutex_lock(&ftrace_cmd_mutex);
+	list_for_each_entry(p, &ftrace_commands, list) {
+		if (strcmp(cmd->name, p->name) == 0) {
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+	}
+	list_add(&cmd->list, &ftrace_commands);
+ out_unlock:
+	mutex_unlock(&ftrace_cmd_mutex);
+
+	return ret;
+}
+
+int unregister_ftrace_command(struct ftrace_func_command *cmd)
+{
+	struct ftrace_func_command *p, *n;
+	int ret = -ENODEV;
+
+	mutex_lock(&ftrace_cmd_mutex);
+	list_for_each_entry_safe(p, n, &ftrace_commands, list) {
+		if (strcmp(cmd->name, p->name) == 0) {
+			ret = 0;
+			list_del_init(&p->list);
+			goto out_unlock;
+		}
+	}
+ out_unlock:
+	mutex_unlock(&ftrace_cmd_mutex);
+
+	return ret;
+}
+
 static int ftrace_process_regex(char *buff, int len, int enable)
 {
-	char *func, *mod, *command, *next = buff;
+	struct ftrace_func_command *p;
+	char *func, *command, *next = buff;
+	int ret = -EINVAL;
 
 	func = strsep(&next, ":");
 
@@ -1250,21 +1334,21 @@ static int ftrace_process_regex(char *buff, int len, int enable)
 		return 0;
 	}
 
-	/* command fonud */
+	/* command found */
 
 	command = strsep(&next, ":");
 
-	if (strcmp(command, "mod") == 0) {
-		/* only match modules */
-		if (!next)
-			return -EINVAL;
-
-		mod = strsep(&next, ":");
-		ftrace_match_module_records(func, mod, enable);
-		return 0;
+	mutex_lock(&ftrace_cmd_mutex);
+	list_for_each_entry(p, &ftrace_commands, list) {
+		if (strcmp(p->name, command) == 0) {
+			ret = p->func(func, command, next, enable);
+			goto out_unlock;
+		}
 	}
+ out_unlock:
+	mutex_unlock(&ftrace_cmd_mutex);
 
-	return -EINVAL;
+	return ret;
 }
 
 static ssize_t
-- 
cgit v1.1


From 52baf11922db7377b580dd5448a07f71c6a35611 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 01:15:39 -0500
Subject: ftrace: convert ftrace_lock from a spinlock to mutex

Impact: clean up

The older versions of ftrace required doing the ftrace list
search under atomic context. Now all the calls are in non-atomic
context. There is no reason to keep the ftrace_lock as a spinlock.

This patch converts it to a mutex.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 51 +++++++++++++++++++--------------------------------
 1 file changed, 19 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 45a44c4..4771732 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -61,7 +61,7 @@ int function_trace_stop;
  */
 static int ftrace_disabled __read_mostly;
 
-static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
 static DEFINE_MUTEX(ftrace_start_lock);
 
@@ -134,8 +134,7 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 
 	ops->next = ftrace_list;
 	/*
@@ -172,7 +171,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 #endif
 	}
 
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return 0;
 }
@@ -182,8 +181,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	struct ftrace_ops **p;
 	int ret = 0;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 
 	/*
 	 * If we are removing the last function, then simply point
@@ -224,7 +222,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	}
 
  out:
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return ret;
 }
@@ -233,8 +231,7 @@ static void ftrace_update_pid_func(void)
 {
 	ftrace_func_t func;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 
 	if (ftrace_trace_function == ftrace_stub)
 		goto out;
@@ -256,7 +253,7 @@ static void ftrace_update_pid_func(void)
 #endif
 
  out:
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -358,15 +355,12 @@ void ftrace_release(void *start, unsigned long size)
 	if (ftrace_disabled || !start)
 		return;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
-
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e))
 			ftrace_free_rec(rec);
 	} while_for_each_ftrace_rec();
-
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -803,8 +797,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	if (iter->flags & FTRACE_ITER_PRINTALL)
 		return NULL;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
  retry:
 	if (iter->idx >= iter->pg->index) {
 		if (iter->pg->next) {
@@ -833,7 +826,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 			goto retry;
 		}
 	}
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return rec;
 }
@@ -962,8 +955,7 @@ static void ftrace_filter_reset(int enable)
 	struct dyn_ftrace *rec;
 	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 0;
 	do_for_each_ftrace_rec(pg, rec) {
@@ -971,8 +963,7 @@ static void ftrace_filter_reset(int enable)
 			continue;
 		rec->flags &= ~type;
 	} while_for_each_ftrace_rec();
-
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 static int
@@ -1151,8 +1142,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
 
 	search_len = strlen(search);
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1171,7 +1161,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
 		if (enable && (rec->flags & FTRACE_FL_FILTER))
 			ftrace_filtered = 1;
 	} while_for_each_ftrace_rec();
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 static int
@@ -1218,8 +1208,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 		search_len = strlen(search);
 	}
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1236,7 +1225,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 			ftrace_filtered = 1;
 
 	} while_for_each_ftrace_rec();
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 /*
@@ -1676,9 +1665,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	if (ftrace_disabled)
 		return -ENODEV;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
-
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
@@ -1699,7 +1686,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 		}
 	} while_for_each_ftrace_rec();
  out:
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return found ? 0 : -EINVAL;
 }
-- 
cgit v1.1


From e6ea44e9b4c12325337cd1c06103cd515a1c02b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 01:42:44 -0500
Subject: ftrace: consolidate mutexes

Impact: clean up

Now that ftrace_lock is a mutex, there is no reason to have three
different mutexes protecting similar data. All the mutex paths
are not in hot paths, so having a mutex to cover more data is
not a problem.

This patch removes the ftrace_sysctl_lock and ftrace_start_lock
and uses the ftrace_lock to protect the locations that were protected
by these locks. By doing so, this change also removes some of
the lock nesting that was taking place.

There are still more mutexes in ftrace.c that can probably be
consolidated, but they can be dealt with later. We need to be careful
about the way the locks are nested, and by consolidating, we can cause
a recursive deadlock.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 68 ++++++++++++++++-----------------------------------
 1 file changed, 21 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4771732..157d4f6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@ int function_trace_stop;
 static int ftrace_disabled __read_mostly;
 
 static DEFINE_MUTEX(ftrace_lock);
-static DEFINE_MUTEX(ftrace_sysctl_lock);
-static DEFINE_MUTEX(ftrace_start_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
@@ -134,8 +132,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-	mutex_lock(&ftrace_lock);
-
 	ops->next = ftrace_list;
 	/*
 	 * We are entering ops into the ftrace_list but another
@@ -171,17 +167,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 #endif
 	}
 
-	mutex_unlock(&ftrace_lock);
-
 	return 0;
 }
 
 static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	struct ftrace_ops **p;
-	int ret = 0;
-
-	mutex_lock(&ftrace_lock);
 
 	/*
 	 * If we are removing the last function, then simply point
@@ -190,17 +181,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
 		ftrace_trace_function = ftrace_stub;
 		ftrace_list = &ftrace_list_end;
-		goto out;
+		return 0;
 	}
 
 	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
 		if (*p == ops)
 			break;
 
-	if (*p != ops) {
-		ret = -1;
-		goto out;
-	}
+	if (*p != ops)
+		return -1;
 
 	*p = (*p)->next;
 
@@ -221,10 +210,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 		}
 	}
 
- out:
-	mutex_unlock(&ftrace_lock);
-
-	return ret;
+	return 0;
 }
 
 static void ftrace_update_pid_func(void)
@@ -622,13 +608,10 @@ static void ftrace_startup(int command)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	ftrace_start_up++;
 	command |= FTRACE_ENABLE_CALLS;
 
 	ftrace_startup_enable(command);
-
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static void ftrace_shutdown(int command)
@@ -636,7 +619,6 @@ static void ftrace_shutdown(int command)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	ftrace_start_up--;
 	if (!ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
@@ -647,11 +629,9 @@ static void ftrace_shutdown(int command)
 	}
 
 	if (!command || !ftrace_enabled)
-		goto out;
+		return;
 
 	ftrace_run_update_code(command);
- out:
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static void ftrace_startup_sysctl(void)
@@ -661,7 +641,6 @@ static void ftrace_startup_sysctl(void)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	/* Force update next time */
 	saved_ftrace_func = NULL;
 	/* ftrace_start_up is true if we want ftrace running */
@@ -669,7 +648,6 @@ static void ftrace_startup_sysctl(void)
 		command |= FTRACE_ENABLE_CALLS;
 
 	ftrace_run_update_code(command);
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static void ftrace_shutdown_sysctl(void)
@@ -679,13 +657,11 @@ static void ftrace_shutdown_sysctl(void)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	/* ftrace_start_up is true if ftrace is running */
 	if (ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
 	ftrace_run_update_code(command);
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static cycle_t		ftrace_update_time;
@@ -1502,12 +1478,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
 	}
 
-	mutex_lock(&ftrace_sysctl_lock);
-	mutex_lock(&ftrace_start_lock);
+	mutex_lock(&ftrace_lock);
 	if (ftrace_start_up && ftrace_enabled)
 		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
-	mutex_unlock(&ftrace_start_lock);
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 
 	kfree(iter);
 	mutex_unlock(&ftrace_regex_lock);
@@ -1824,7 +1798,7 @@ static int ftrace_convert_nops(struct module *mod,
 	unsigned long addr;
 	unsigned long flags;
 
-	mutex_lock(&ftrace_start_lock);
+	mutex_lock(&ftrace_lock);
 	p = start;
 	while (p < end) {
 		addr = ftrace_call_adjust(*p++);
@@ -1843,7 +1817,7 @@ static int ftrace_convert_nops(struct module *mod,
 	local_irq_save(flags);
 	ftrace_update_code(mod);
 	local_irq_restore(flags);
-	mutex_unlock(&ftrace_start_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return 0;
 }
@@ -2016,7 +1990,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	mutex_lock(&ftrace_start_lock);
+	mutex_lock(&ftrace_lock);
 	if (val < 0) {
 		/* disable pid tracing */
 		if (!ftrace_pid_trace)
@@ -2055,7 +2029,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	ftrace_startup_enable(0);
 
  out:
-	mutex_unlock(&ftrace_start_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return cnt;
 }
@@ -2118,12 +2092,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	if (unlikely(ftrace_disabled))
 		return -1;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	ret = __register_ftrace_function(ops);
 	ftrace_startup(0);
 
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 	return ret;
 }
 
@@ -2137,10 +2111,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 	ret = __unregister_ftrace_function(ops);
 	ftrace_shutdown(0);
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return ret;
 }
@@ -2155,7 +2129,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
 
@@ -2184,7 +2158,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	}
 
  out:
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 	return ret;
 }
 
@@ -2296,7 +2270,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 {
 	int ret = 0;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
@@ -2314,13 +2288,13 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_startup(FTRACE_START_FUNC_RET);
 
 out:
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 	return ret;
 }
 
 void unregister_ftrace_graph(void)
 {
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	atomic_dec(&ftrace_graph_active);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
@@ -2328,7 +2302,7 @@ void unregister_ftrace_graph(void)
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 /* Allocate a return stack for newly created task */
-- 
cgit v1.1


From 59df055f1991c9fc0c71a9230663c39188f6972f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 15:29:06 -0500
Subject: ftrace: trace different functions with a different tracer

Impact: new feature

Currently, the function tracer only gives you an ability to hook
a tracer to all functions being traced. The dynamic function trace
allows you to pick and choose which of those functions will be
traced, but all functions being traced will call all tracers that
registered with the function tracer.

This patch adds a new feature that allows a tracer to hook to specific
functions, even when all functions are being traced. It allows for
different functions to call different tracer hooks.

The way this is accomplished is by a special function that will hook
to the function tracer and will set up a hash table knowing which
tracer hook to call with which function. This is the most general
and easiest method to accomplish this. Later, an arch may choose
to supply their own method in changing the mcount call of a function
to call a different tracer. But that will be an exercise for the
future.

To register a function:

 struct ftrace_hook_ops {
	void			(*func)(unsigned long ip,
					unsigned long parent_ip,
					void **data);
	int			(*callback)(unsigned long ip, void **data);
	void			(*free)(void **data);
 };

 int register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
				  void *data);

glob is a simple glob to search for the functions to hook.
ops is a pointer to the operations (listed below)
data is the default data to be passed to the hook functions when traced

ops:
 func is the hook function to call when the functions are traced
 callback is a callback function that is called when setting up the hash.
   That is, if the tracer needs to do something special for each
   function, that is being traced, and wants to give each function
   its own data. The address of the entry data is passed to this
   callback, so that the callback may wish to update the entry to
   whatever it would like.
 free is a callback for when the entry is freed. In case the tracer
   allocated any data, it is give the chance to free it.

To unregister we have three functions:

  void
  unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
				void *data)

This will unregister all hooks that match glob, point to ops, and
have its data matching data. (note, if glob is NULL, blank or '*',
all functions will be tested).

  void
  unregister_ftrace_function_hook_func(char *glob,
				 struct ftrace_hook_ops *ops)

This will unregister all functions matching glob that has an entry
pointing to ops.

  void unregister_ftrace_function_hook_all(char *glob)

This simply unregisters all funcs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 247 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 157d4f6..0b80e32 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
+#include <linux/hash.h>
 
 #include <asm/ftrace.h>
 
@@ -1245,6 +1246,252 @@ static int __init ftrace_mod_cmd_init(void)
 }
 device_initcall(ftrace_mod_cmd_init);
 
+#define FTRACE_HASH_BITS 7
+#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_hook {
+	struct hlist_node	node;
+	struct ftrace_hook_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+static void
+function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_func_hook *entry;
+	struct hlist_head *hhd;
+	struct hlist_node *n;
+	unsigned long key;
+	int resched;
+
+	key = hash_long(ip, FTRACE_HASH_BITS);
+
+	hhd = &ftrace_func_hash[key];
+
+	if (hlist_empty(hhd))
+		return;
+
+	/*
+	 * Disable preemption for these calls to prevent a RCU grace
+	 * period. This syncs the hash iteration and freeing of items
+	 * on the hash. rcu_read_lock is too dangerous here.
+	 */
+	resched = ftrace_preempt_disable();
+	hlist_for_each_entry_rcu(entry, n, hhd, node) {
+		if (entry->ip == ip)
+			entry->ops->func(ip, parent_ip, &entry->data);
+	}
+	ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_hook_ops __read_mostly =
+{
+	.func = function_trace_hook_call,
+};
+
+static int ftrace_hook_registered;
+
+static void __enable_ftrace_function_hook(void)
+{
+	int i;
+
+	if (ftrace_hook_registered)
+		return;
+
+	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+		struct hlist_head *hhd = &ftrace_func_hash[i];
+		if (hhd->first)
+			break;
+	}
+	/* Nothing registered? */
+	if (i == FTRACE_FUNC_HASHSIZE)
+		return;
+
+	__register_ftrace_function(&trace_hook_ops);
+	ftrace_startup(0);
+	ftrace_hook_registered = 1;
+}
+
+static void __disable_ftrace_function_hook(void)
+{
+	int i;
+
+	if (!ftrace_hook_registered)
+		return;
+
+	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+		struct hlist_head *hhd = &ftrace_func_hash[i];
+		if (hhd->first)
+			return;
+	}
+
+	/* no more funcs left */
+	__unregister_ftrace_function(&trace_hook_ops);
+	ftrace_shutdown(0);
+	ftrace_hook_registered = 0;
+}
+
+
+static void ftrace_free_entry_rcu(struct rcu_head *rhp)
+{
+	struct ftrace_func_hook *entry =
+		container_of(rhp, struct ftrace_func_hook, rcu);
+
+	if (entry->ops->free)
+		entry->ops->free(&entry->data);
+	kfree(entry);
+}
+
+
+int
+register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+			      void *data)
+{
+	struct ftrace_func_hook *entry;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	unsigned long key;
+	int type, len, not;
+	int count = 0;
+	char *search;
+
+	type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+	len = strlen(search);
+
+	/* we do not support '!' for function hooks */
+	if (WARN_ON(not))
+		return -EINVAL;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+
+		if (!ftrace_match_record(rec, search, len, type))
+			continue;
+
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			/* If we did not hook to any, then return error */
+			if (!count)
+				count = -ENOMEM;
+			goto out_unlock;
+		}
+
+		count++;
+
+		entry->data = data;
+
+		/*
+		 * The caller might want to do something special
+		 * for each function we find. We call the callback
+		 * to give the caller an opportunity to do so.
+		 */
+		if (ops->callback) {
+			if (ops->callback(rec->ip, &entry->data) < 0) {
+				/* caller does not like this func */
+				kfree(entry);
+				continue;
+			}
+		}
+
+		entry->ops = ops;
+		entry->ip = rec->ip;
+
+		key = hash_long(entry->ip, FTRACE_HASH_BITS);
+		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
+
+	} while_for_each_ftrace_rec();
+	__enable_ftrace_function_hook();
+
+ out_unlock:
+	mutex_unlock(&ftrace_lock);
+
+	return count;
+}
+
+enum {
+	HOOK_TEST_FUNC		= 1,
+	HOOK_TEST_DATA		= 2
+};
+
+static void
+__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+				  void *data, int flags)
+{
+	struct ftrace_func_hook *entry;
+	struct hlist_node *n, *tmp;
+	char str[KSYM_SYMBOL_LEN];
+	int type = MATCH_FULL;
+	int i, len = 0;
+	char *search;
+
+	if (glob && (strcmp(glob, "*") || !strlen(glob)))
+		glob = NULL;
+	else {
+		int not;
+
+		type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+		len = strlen(search);
+
+		/* we do not support '!' for function hooks */
+		if (WARN_ON(not))
+			return;
+	}
+
+	mutex_lock(&ftrace_lock);
+	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+		struct hlist_head *hhd = &ftrace_func_hash[i];
+
+		hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
+
+			/* break up if statements for readability */
+			if ((flags & HOOK_TEST_FUNC) && entry->ops != ops)
+				continue;
+
+			if ((flags & HOOK_TEST_DATA) && entry->data != data)
+				continue;
+
+			/* do this last, since it is the most expensive */
+			if (glob) {
+				kallsyms_lookup(entry->ip, NULL, NULL,
+						NULL, str);
+				if (!ftrace_match(str, glob, len, type))
+					continue;
+			}
+
+			hlist_del(&entry->node);
+			call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+		}
+	}
+	__disable_ftrace_function_hook();
+	mutex_unlock(&ftrace_lock);
+}
+
+void
+unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+				void *data)
+{
+	__unregister_ftrace_function_hook(glob, ops, data,
+					  HOOK_TEST_FUNC | HOOK_TEST_DATA);
+}
+
+void
+unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops)
+{
+	__unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC);
+}
+
+void unregister_ftrace_function_hook_all(char *glob)
+{
+	__unregister_ftrace_function_hook(glob, NULL, NULL, 0);
+}
+
 static LIST_HEAD(ftrace_commands);
 static DEFINE_MUTEX(ftrace_cmd_mutex);
 
-- 
cgit v1.1


From 988ae9d6b2bc3ebdc1a488490250a6812f85e9d4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 19:17:02 -0500
Subject: ring-buffer: add tracing_is_on to test if ring buffer is enabled

This patch adds the tracing_is_on() interface to tell if the ring
buffer is turned on or not.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2b4626c..8f19f1a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -98,6 +98,15 @@ void tracing_off_permanent(void)
 	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
 }
 
+/**
+ * tracing_is_on - show state of ring buffers enabled
+ */
+int tracing_is_on(void)
+{
+	return ring_buffer_flags == RB_BUFFERS_ON;
+}
+EXPORT_SYMBOL_GPL(tracing_is_on);
+
 #include "trace.h"
 
 /* Up this if you want to test the TIME_EXTENTS and normalization */
-- 
cgit v1.1


From 23b4ff3aa479c9e3bb23cb6b2d0a97878399784a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 19:04:24 -0500
Subject: ftrace: add traceon traceoff commands to enable/disable the buffers

This patch adds the new function selection commands traceon and
traceoff. traceon sets the function to enable the ring buffers
while traceoff disables the ring buffers.  You can pass in the
number of times you want the command to be executed when the function
is hit. It will only execute if the state of the buffers are not
already in that state.

Example:

 # echo do_fork:traceon:4

Will enable the ring buffers if they are disabled every time it
hits do_fork, up to 4 times.

 # echo sys_close:traceoff

This will disable the ring buffers every time (unlimited) when
sys_close is called.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions.c | 135 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 36bf956..5c95708 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,6 +9,7 @@
  *  Copyright (C) 2004-2006 Ingo Molnar
  *  Copyright (C) 2004 William Lee Irwin III
  */
+#include <linux/ring_buffer.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
@@ -231,9 +232,143 @@ static struct tracer function_trace __read_mostly =
 #endif
 };
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	long *count = (long *)data;
+
+	if (tracing_is_on())
+		return;
+
+	if (!*count)
+		return;
+
+	if (*count != -1)
+		(*count)--;
+
+	tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	long *count = (long *)data;
+
+	if (!tracing_is_on())
+		return;
+
+	if (!*count)
+		return;
+
+	if (*count != -1)
+		(*count)--;
+
+	tracing_off();
+}
+
+static struct ftrace_hook_ops traceon_hook_ops = {
+	.func			= ftrace_traceon,
+};
+
+static struct ftrace_hook_ops traceoff_hook_ops = {
+	.func			= ftrace_traceoff,
+};
+
+static int
+ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
+{
+	struct ftrace_hook_ops *ops;
+
+	/* we register both traceon and traceoff to this callback */
+	if (strcmp(cmd, "traceon") == 0)
+		ops = &traceon_hook_ops;
+	else
+		ops = &traceoff_hook_ops;
+
+	unregister_ftrace_function_hook_func(glob, ops);
+
+	return 0;
+}
+
+static int
+ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
+{
+	struct ftrace_hook_ops *ops;
+	void *count = (void *)-1;
+	char *number;
+	int ret;
+
+	/* hash funcs only work with set_ftrace_filter */
+	if (!enable)
+		return -EINVAL;
+
+	if (glob[0] == '!')
+		return ftrace_trace_onoff_unreg(glob+1, cmd, param);
+
+	/* we register both traceon and traceoff to this callback */
+	if (strcmp(cmd, "traceon") == 0)
+		ops = &traceon_hook_ops;
+	else
+		ops = &traceoff_hook_ops;
+
+	if (!param)
+		goto out_reg;
+
+	number = strsep(&param, ":");
+
+	if (!strlen(number))
+		goto out_reg;
+
+	/*
+	 * We use the callback data field (which is a pointer)
+	 * as our counter.
+	 */
+	ret = strict_strtoul(number, 0, (unsigned long *)&count);
+	if (ret)
+		return ret;
+
+ out_reg:
+	ret = register_ftrace_function_hook(glob, ops, count);
+
+	return ret;
+}
+
+static struct ftrace_func_command ftrace_traceon_cmd = {
+	.name			= "traceon",
+	.func			= ftrace_trace_onoff_callback,
+};
+
+static struct ftrace_func_command ftrace_traceoff_cmd = {
+	.name			= "traceoff",
+	.func			= ftrace_trace_onoff_callback,
+};
+
+static int __init init_func_cmd_traceon(void)
+{
+	int ret;
+
+	ret = register_ftrace_command(&ftrace_traceoff_cmd);
+	if (ret)
+		return ret;
+
+	ret = register_ftrace_command(&ftrace_traceon_cmd);
+	if (ret)
+		unregister_ftrace_command(&ftrace_traceoff_cmd);
+	return ret;
+}
+#else
+static inline int init_func_cmd_traceon(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 static __init int init_function_trace(void)
 {
+	init_func_cmd_traceon();
 	return register_tracer(&function_trace);
 }
 
 device_initcall(init_function_trace);
+
-- 
cgit v1.1


From 8fc0c701c5b6c0c3e242758c3acef6f9047940a9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 15:28:00 -0500
Subject: ftrace: show selected functions in set_ftrace_filter

This patch adds output to show what functions have tracer hooks
attached to them.

  # echo 'sys_open:traceon:4' > /debug/tracing/set_ftrace_filter
  # cat set_ftrace_filter

 #### all functions enabled ####
 sys_open:ftrace_traceon:0000000000000004

  # echo 'do_fork:traceoff:' > set_ftrace_filter
  # cat set_ftrace_filter

 #### all functions enabled ####
 sys_open:ftrace_traceon:0000000000000002
 do_fork:ftrace_traceoff:ffffffffffffffff

Note the 4 changed to a 2. This is because The code was executed twice
since the traceoff was added. If a cat is done again:

 #### all functions enabled ####
 sys_open:ftrace_traceon
 do_fork:ftrace_traceoff:ffffffffffffffff

The number disappears. That is because it will not print a NULL.

Callbacks to allow the tracer to pretty print will be implemented soon.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 123 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 103 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0b80e32..1e05884 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,14 +45,14 @@
 			ftrace_kill();		\
 	} while (0)
 
+/* hash bits for specific function selection */
+#define FTRACE_HASH_BITS 7
+#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+
 /* ftrace_enabled is a method to turn ftrace on or off */
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
@@ -248,6 +248,21 @@ static void ftrace_update_pid_func(void)
 # error Dynamic ftrace depends on MCOUNT_RECORD
 #endif
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_hook {
+	struct hlist_node	node;
+	struct ftrace_hook_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -750,12 +765,14 @@ enum {
 	FTRACE_ITER_NOTRACE	= (1 << 2),
 	FTRACE_ITER_FAILURES	= (1 << 3),
 	FTRACE_ITER_PRINTALL	= (1 << 4),
+	FTRACE_ITER_HASH	= (1 << 5),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
 	struct ftrace_page	*pg;
+	int			hidx;
 	int			idx;
 	unsigned		flags;
 	unsigned char		buffer[FTRACE_BUFF_MAX+1];
@@ -764,17 +781,86 @@ struct ftrace_iterator {
 };
 
 static void *
+t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct hlist_node *hnd = v;
+	struct hlist_head *hhd;
+
+	WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
+
+	(*pos)++;
+
+ retry:
+	if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
+		return NULL;
+
+	hhd = &ftrace_func_hash[iter->hidx];
+
+	if (hlist_empty(hhd)) {
+		iter->hidx++;
+		hnd = NULL;
+		goto retry;
+	}
+
+	if (!hnd)
+		hnd = hhd->first;
+	else {
+		hnd = hnd->next;
+		if (!hnd) {
+			iter->hidx++;
+			goto retry;
+		}
+	}
+
+	return hnd;
+}
+
+static void *t_hash_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+
+	iter->flags |= FTRACE_ITER_HASH;
+
+	return t_hash_next(m, p, pos);
+}
+
+static int t_hash_show(struct seq_file *m, void *v)
+{
+	struct ftrace_func_hook *rec;
+	struct hlist_node *hnd = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	rec = hlist_entry(hnd, struct ftrace_func_hook, node);
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	seq_printf(m, "%s:", str);
+
+	kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
+	seq_printf(m, "%s", str);
+
+	if (rec->data)
+		seq_printf(m, ":%p", rec->data);
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
 	struct dyn_ftrace *rec = NULL;
 
+	if (iter->flags & FTRACE_ITER_HASH)
+		return t_hash_next(m, v, pos);
+
 	(*pos)++;
 
 	if (iter->flags & FTRACE_ITER_PRINTALL)
 		return NULL;
 
-	mutex_lock(&ftrace_lock);
  retry:
 	if (iter->idx >= iter->pg->index) {
 		if (iter->pg->next) {
@@ -803,7 +889,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 			goto retry;
 		}
 	}
-	mutex_unlock(&ftrace_lock);
 
 	return rec;
 }
@@ -813,6 +898,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
 
+	mutex_lock(&ftrace_lock);
 	/*
 	 * For set_ftrace_filter reading, if we have the filter
 	 * off, we can short cut and just print out that all
@@ -820,12 +906,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	 */
 	if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
 		if (*pos > 0)
-			return NULL;
+			return t_hash_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;
 		(*pos)++;
 		return iter;
 	}
 
+	if (iter->flags & FTRACE_ITER_HASH)
+		return t_hash_start(m, pos);
+
 	if (*pos > 0) {
 		if (iter->idx < 0)
 			return p;
@@ -835,11 +924,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 
 	p = t_next(m, p, pos);
 
+	if (!p)
+		return t_hash_start(m, pos);
+
 	return p;
 }
 
 static void t_stop(struct seq_file *m, void *p)
 {
+	mutex_unlock(&ftrace_lock);
 }
 
 static int t_show(struct seq_file *m, void *v)
@@ -848,6 +941,9 @@ static int t_show(struct seq_file *m, void *v)
 	struct dyn_ftrace *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
+	if (iter->flags & FTRACE_ITER_HASH)
+		return t_hash_show(m, v);
+
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		seq_printf(m, "#### all functions enabled ####\n");
 		return 0;
@@ -1246,19 +1342,6 @@ static int __init ftrace_mod_cmd_init(void)
 }
 device_initcall(ftrace_mod_cmd_init);
 
-#define FTRACE_HASH_BITS 7
-#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
-static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
-
-struct ftrace_func_hook {
-	struct hlist_node	node;
-	struct ftrace_hook_ops	*ops;
-	unsigned long		flags;
-	unsigned long		ip;
-	void			*data;
-	struct rcu_head		rcu;
-};
-
 static void
 function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
 {
-- 
cgit v1.1


From 809dcf29ce4e1723709910878e050bd187617e0e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 23:06:01 -0500
Subject: ftrace: add pretty print to selected fuction traces

This patch adds a call back for the tracers that have hooks to
selected functions. This allows the tracer to show better output
in the set_ftrace_filter file.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e05884..6533c1d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -834,6 +834,9 @@ static int t_hash_show(struct seq_file *m, void *v)
 
 	rec = hlist_entry(hnd, struct ftrace_func_hook, node);
 
+	if (rec->ops->print)
+		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
+
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 	seq_printf(m, "%s:", str);
 
-- 
cgit v1.1


From e110e3d1eaa0f9628918be67ddd32e8ad65a2871 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 23:38:13 -0500
Subject: ftrace: add pretty print function for traceon and traceoff hooks

This patch adds a pretty print version of traceon and traceoff
output for set_ftrace_filter.

  # echo 'sys_open:traceon:4' > set_ftrace_filter
  # cat set_ftrace_filter

 #### all functions enabled ####
 sys_open:traceon:count=4

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5c95708..f520aa4 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -267,15 +267,43 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
 	tracing_off();
 }
 
+static int
+ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
+			 struct ftrace_hook_ops *ops, void *data);
+
 static struct ftrace_hook_ops traceon_hook_ops = {
 	.func			= ftrace_traceon,
+	.print			= ftrace_trace_onoff_print,
 };
 
 static struct ftrace_hook_ops traceoff_hook_ops = {
 	.func			= ftrace_traceoff,
+	.print			= ftrace_trace_onoff_print,
 };
 
 static int
+ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
+			 struct ftrace_hook_ops *ops, void *data)
+{
+	char str[KSYM_SYMBOL_LEN];
+	long count = (long)data;
+
+	kallsyms_lookup(ip, NULL, NULL, NULL, str);
+	seq_printf(m, "%s:", str);
+
+	if (ops == &traceon_hook_ops)
+		seq_printf(m, "traceon");
+	else
+		seq_printf(m, "traceoff");
+
+	if (count != -1)
+		seq_printf(m, ":count=%ld", count);
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static int
 ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 {
 	struct ftrace_hook_ops *ops;
-- 
cgit v1.1


From 73d3fd96e77745742f3750b7b19ee42204adc210 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Feb 2009 11:48:18 +0100
Subject: ftrace: fix !CONFIG_DYNAMIC_FTRACE ftrace_swapper_pid definition

Impact: build fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6533c1d..4e6c87e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -243,14 +243,16 @@ static void ftrace_update_pid_func(void)
 	mutex_unlock(&ftrace_lock);
 }
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
 #ifdef CONFIG_DYNAMIC_FTRACE
+
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
 # error Dynamic ftrace depends on MCOUNT_RECORD
 #endif
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
 
 struct ftrace_func_hook {
-- 
cgit v1.1


From 6a24a244cd3a02d5b290293c32fcf2c6e92b4235 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 11:20:26 -0500
Subject: ftrace: clean up coding style

Ingo Molnar pointed out some coding style issues with the recent ftrace
updates. This patch cleans them up.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c          | 24 +++++++++++++-----------
 kernel/trace/trace_functions.c |  1 -
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4e6c87e..af9d95c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -460,8 +460,8 @@ static void ftrace_bug(int failed, unsigned long ip)
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
-	unsigned long ip, fl;
 	unsigned long ftrace_addr;
+	unsigned long ip, fl;
 
 	ftrace_addr = (unsigned long)FTRACE_ADDR;
 
@@ -530,9 +530,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 static void ftrace_replace_code(int enable)
 {
-	int failed;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
+	int failed;
 
 	do_for_each_ftrace_rec(pg, rec) {
 		/*
@@ -1208,14 +1208,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
 
 static void ftrace_match_records(char *buff, int len, int enable)
 {
-	char *search;
+	unsigned int search_len;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
+	unsigned long flag;
+	char *search;
 	int type;
-	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned search_len;
 	int not;
 
+	flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 	type = ftrace_setup_glob(buff, len, &search, &not);
 
 	search_len = strlen(search);
@@ -1263,14 +1264,16 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
 
 static void ftrace_match_module_records(char *buff, char *mod, int enable)
 {
-	char *search = buff;
+	unsigned search_len = 0;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	int type = MATCH_FULL;
-	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned search_len = 0;
+	char *search = buff;
+	unsigned long flag;
 	int not = 0;
 
+	flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+
 	/* blank or '*' mean the same */
 	if (strcmp(buff, "*") == 0)
 		buff[0] = 0;
@@ -1442,8 +1445,8 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 	struct ftrace_func_hook *entry;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
-	unsigned long key;
 	int type, len, not;
+	unsigned long key;
 	int count = 0;
 	char *search;
 
@@ -1623,8 +1626,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
 
 static int ftrace_process_regex(char *buff, int len, int enable)
 {
-	struct ftrace_func_command *p;
 	char *func, *command, *next = buff;
+	struct ftrace_func_command *p;
 	int ret = -EINVAL;
 
 	func = strsep(&next, ":");
@@ -2392,7 +2395,6 @@ static __init int ftrace_init_debugfs(void)
 			   "'set_ftrace_pid' entry\n");
 	return 0;
 }
-
 fs_initcall(ftrace_init_debugfs);
 
 /**
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index f520aa4..021a574 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -397,6 +397,5 @@ static __init int init_function_trace(void)
 	init_func_cmd_traceon();
 	return register_tracer(&function_trace);
 }
-
 device_initcall(init_function_trace);
 
-- 
cgit v1.1


From b6887d7916e44c1d8913084fb6aa5004d9473f1a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 12:32:04 -0500
Subject: ftrace: rename _hook to _probe

Impact: clean up

Ingo Molnar did not like the _hook naming convention used by the
select function tracer. Luis Claudio R. Goncalves suggested using
the "_probe" extension. This patch implements the change of
calling the functions and variables "_hook" and replacing them
with "_probe".

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c          | 78 +++++++++++++++++++++---------------------
 kernel/trace/trace_functions.c | 26 +++++++-------
 2 files changed, 52 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index af9d95c..330a059 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -255,9 +255,9 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
 
-struct ftrace_func_hook {
+struct ftrace_func_probe {
 	struct hlist_node	node;
-	struct ftrace_hook_ops	*ops;
+	struct ftrace_probe_ops	*ops;
 	unsigned long		flags;
 	unsigned long		ip;
 	void			*data;
@@ -830,11 +830,11 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 
 static int t_hash_show(struct seq_file *m, void *v)
 {
-	struct ftrace_func_hook *rec;
+	struct ftrace_func_probe *rec;
 	struct hlist_node *hnd = v;
 	char str[KSYM_SYMBOL_LEN];
 
-	rec = hlist_entry(hnd, struct ftrace_func_hook, node);
+	rec = hlist_entry(hnd, struct ftrace_func_probe, node);
 
 	if (rec->ops->print)
 		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1351,9 +1351,9 @@ static int __init ftrace_mod_cmd_init(void)
 device_initcall(ftrace_mod_cmd_init);
 
 static void
-function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
+function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 {
-	struct ftrace_func_hook *entry;
+	struct ftrace_func_probe *entry;
 	struct hlist_head *hhd;
 	struct hlist_node *n;
 	unsigned long key;
@@ -1379,18 +1379,18 @@ function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
 	ftrace_preempt_enable(resched);
 }
 
-static struct ftrace_ops trace_hook_ops __read_mostly =
+static struct ftrace_ops trace_probe_ops __read_mostly =
 {
-	.func = function_trace_hook_call,
+	.func = function_trace_probe_call,
 };
 
-static int ftrace_hook_registered;
+static int ftrace_probe_registered;
 
-static void __enable_ftrace_function_hook(void)
+static void __enable_ftrace_function_probe(void)
 {
 	int i;
 
-	if (ftrace_hook_registered)
+	if (ftrace_probe_registered)
 		return;
 
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
@@ -1402,16 +1402,16 @@ static void __enable_ftrace_function_hook(void)
 	if (i == FTRACE_FUNC_HASHSIZE)
 		return;
 
-	__register_ftrace_function(&trace_hook_ops);
+	__register_ftrace_function(&trace_probe_ops);
 	ftrace_startup(0);
-	ftrace_hook_registered = 1;
+	ftrace_probe_registered = 1;
 }
 
-static void __disable_ftrace_function_hook(void)
+static void __disable_ftrace_function_probe(void)
 {
 	int i;
 
-	if (!ftrace_hook_registered)
+	if (!ftrace_probe_registered)
 		return;
 
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
@@ -1421,16 +1421,16 @@ static void __disable_ftrace_function_hook(void)
 	}
 
 	/* no more funcs left */
-	__unregister_ftrace_function(&trace_hook_ops);
+	__unregister_ftrace_function(&trace_probe_ops);
 	ftrace_shutdown(0);
-	ftrace_hook_registered = 0;
+	ftrace_probe_registered = 0;
 }
 
 
 static void ftrace_free_entry_rcu(struct rcu_head *rhp)
 {
-	struct ftrace_func_hook *entry =
-		container_of(rhp, struct ftrace_func_hook, rcu);
+	struct ftrace_func_probe *entry =
+		container_of(rhp, struct ftrace_func_probe, rcu);
 
 	if (entry->ops->free)
 		entry->ops->free(&entry->data);
@@ -1439,10 +1439,10 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp)
 
 
 int
-register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			      void *data)
 {
-	struct ftrace_func_hook *entry;
+	struct ftrace_func_probe *entry;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	int type, len, not;
@@ -1453,7 +1453,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 	type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
 	len = strlen(search);
 
-	/* we do not support '!' for function hooks */
+	/* we do not support '!' for function probes */
 	if (WARN_ON(not))
 		return -EINVAL;
 
@@ -1468,7 +1468,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 
 		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 		if (!entry) {
-			/* If we did not hook to any, then return error */
+			/* If we did not process any, then return error */
 			if (!count)
 				count = -ENOMEM;
 			goto out_unlock;
@@ -1498,7 +1498,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
 
 	} while_for_each_ftrace_rec();
-	__enable_ftrace_function_hook();
+	__enable_ftrace_function_probe();
 
  out_unlock:
 	mutex_unlock(&ftrace_lock);
@@ -1507,15 +1507,15 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 }
 
 enum {
-	HOOK_TEST_FUNC		= 1,
-	HOOK_TEST_DATA		= 2
+	PROBE_TEST_FUNC		= 1,
+	PROBE_TEST_DATA		= 2
 };
 
 static void
-__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				  void *data, int flags)
 {
-	struct ftrace_func_hook *entry;
+	struct ftrace_func_probe *entry;
 	struct hlist_node *n, *tmp;
 	char str[KSYM_SYMBOL_LEN];
 	int type = MATCH_FULL;
@@ -1530,7 +1530,7 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 		type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
 		len = strlen(search);
 
-		/* we do not support '!' for function hooks */
+		/* we do not support '!' for function probes */
 		if (WARN_ON(not))
 			return;
 	}
@@ -1542,10 +1542,10 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 		hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
 
 			/* break up if statements for readability */
-			if ((flags & HOOK_TEST_FUNC) && entry->ops != ops)
+			if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
 				continue;
 
-			if ((flags & HOOK_TEST_DATA) && entry->data != data)
+			if ((flags & PROBE_TEST_DATA) && entry->data != data)
 				continue;
 
 			/* do this last, since it is the most expensive */
@@ -1560,27 +1560,27 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 			call_rcu(&entry->rcu, ftrace_free_entry_rcu);
 		}
 	}
-	__disable_ftrace_function_hook();
+	__disable_ftrace_function_probe();
 	mutex_unlock(&ftrace_lock);
 }
 
 void
-unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				void *data)
 {
-	__unregister_ftrace_function_hook(glob, ops, data,
-					  HOOK_TEST_FUNC | HOOK_TEST_DATA);
+	__unregister_ftrace_function_probe(glob, ops, data,
+					  PROBE_TEST_FUNC | PROBE_TEST_DATA);
 }
 
 void
-unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops)
+unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
 {
-	__unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC);
+	__unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
 }
 
-void unregister_ftrace_function_hook_all(char *glob)
+void unregister_ftrace_function_probe_all(char *glob)
 {
-	__unregister_ftrace_function_hook(glob, NULL, NULL, 0);
+	__unregister_ftrace_function_probe(glob, NULL, NULL, 0);
 }
 
 static LIST_HEAD(ftrace_commands);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 021a574..6ea73ed 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -269,21 +269,21 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
 
 static int
 ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
-			 struct ftrace_hook_ops *ops, void *data);
+			 struct ftrace_probe_ops *ops, void *data);
 
-static struct ftrace_hook_ops traceon_hook_ops = {
+static struct ftrace_probe_ops traceon_probe_ops = {
 	.func			= ftrace_traceon,
 	.print			= ftrace_trace_onoff_print,
 };
 
-static struct ftrace_hook_ops traceoff_hook_ops = {
+static struct ftrace_probe_ops traceoff_probe_ops = {
 	.func			= ftrace_traceoff,
 	.print			= ftrace_trace_onoff_print,
 };
 
 static int
 ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
-			 struct ftrace_hook_ops *ops, void *data)
+			 struct ftrace_probe_ops *ops, void *data)
 {
 	char str[KSYM_SYMBOL_LEN];
 	long count = (long)data;
@@ -291,7 +291,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 	kallsyms_lookup(ip, NULL, NULL, NULL, str);
 	seq_printf(m, "%s:", str);
 
-	if (ops == &traceon_hook_ops)
+	if (ops == &traceon_probe_ops)
 		seq_printf(m, "traceon");
 	else
 		seq_printf(m, "traceoff");
@@ -306,15 +306,15 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 static int
 ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 {
-	struct ftrace_hook_ops *ops;
+	struct ftrace_probe_ops *ops;
 
 	/* we register both traceon and traceoff to this callback */
 	if (strcmp(cmd, "traceon") == 0)
-		ops = &traceon_hook_ops;
+		ops = &traceon_probe_ops;
 	else
-		ops = &traceoff_hook_ops;
+		ops = &traceoff_probe_ops;
 
-	unregister_ftrace_function_hook_func(glob, ops);
+	unregister_ftrace_function_probe_func(glob, ops);
 
 	return 0;
 }
@@ -322,7 +322,7 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 static int
 ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
 {
-	struct ftrace_hook_ops *ops;
+	struct ftrace_probe_ops *ops;
 	void *count = (void *)-1;
 	char *number;
 	int ret;
@@ -336,9 +336,9 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
 
 	/* we register both traceon and traceoff to this callback */
 	if (strcmp(cmd, "traceon") == 0)
-		ops = &traceon_hook_ops;
+		ops = &traceon_probe_ops;
 	else
-		ops = &traceoff_hook_ops;
+		ops = &traceoff_probe_ops;
 
 	if (!param)
 		goto out_reg;
@@ -357,7 +357,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
 		return ret;
 
  out_reg:
-	ret = register_ftrace_function_hook(glob, ops, count);
+	ret = register_ftrace_function_probe(glob, ops, count);
 
 	return ret;
 }
-- 
cgit v1.1


From af513098452b8887d7c0e15a39d7cb74479501bd Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 17 Feb 2009 01:07:28 -0500
Subject: tracing: use the more proper parameter

Pass tsk to tracing_record_cmdline instead of current.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 95f99a7..dc61e82 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -336,7 +336,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	data->rt_priority = tsk->rt_priority;
 
 	/* record this tasks comm */
-	tracing_record_cmdline(current);
+	tracing_record_cmdline(tsk);
 }
 
 static void
-- 
cgit v1.1


From d2ef7c2f0f9ab48c25eafc0ebad0df5f7930420b Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 17 Feb 2009 01:09:47 -0500
Subject: tracing: fix the return value of trace selftest

This patch is to fix the return value of trace_selftest_startup_sysprof
and trace_selftest_startup_branch on failure.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_selftest.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0c9aa14..c72e749 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -622,7 +622,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
-		return 0;
+		return ret;
 	}
 
 	/* Sleep for a 1/10 of a second */
@@ -634,6 +634,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	trace->reset(tr);
 	tracing_start();
 
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
 	return ret;
 }
 #endif /* CONFIG_SYSPROF_TRACER */
@@ -661,6 +666,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	trace->reset(tr);
 	tracing_start();
 
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
-- 
cgit v1.1


From 73d8b8bc4f24a97a406d09c8268ac019f4ac661e Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 17 Feb 2009 01:10:02 -0500
Subject: tracing: fix typing mistake in hint message and comments

Impact: cleanup

Fix incorrect hint message in code and typos in comments.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_irqsoff.c      | 2 +-
 kernel/trace/trace_sched_switch.c | 2 +-
 kernel/trace/trace_sched_wakeup.c | 2 +-
 kernel/trace/trace_selftest.c     | 4 ++--
 kernel/trace/trace_stat.c         | 2 +-
 kernel/trace/trace_sysprof.c      | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c6b442d..9e5ebd8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,5 +1,5 @@
 /*
- * trace irqs off criticall timings
+ * trace irqs off critical timings
  *
  * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
  * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 30e14fe..82fbb5a 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -93,7 +93,7 @@ static int tracing_sched_register(void)
 	ret = register_trace_sched_switch(probe_sched_switch);
 	if (ret) {
 		pr_info("sched trace: Couldn't activate tracepoint"
-			" probe to kernel_sched_schedule\n");
+			" probe to kernel_sched_switch\n");
 		goto fail_deprobe_wake_new;
 	}
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 96d7164..276c51a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -284,7 +284,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	ret = register_trace_sched_switch(probe_wakeup_sched_switch);
 	if (ret) {
 		pr_info("sched trace: Couldn't activate tracepoint"
-			" probe to kernel_sched_schedule\n");
+			" probe to kernel_sched_switch\n");
 		goto fail_deprobe_wake_new;
 	}
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c72e749..01415f4 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -107,9 +107,9 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	func();
 
 	/*
-	 * Some archs *cough*PowerPC*cough* add charachters to the
+	 * Some archs *cough*PowerPC*cough* add characters to the
 	 * start of the function names. We simply put a '*' to
-	 * accomodate them.
+	 * accommodate them.
 	 */
 	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
 
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index eae9cef..39310e3 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -30,7 +30,7 @@ struct tracer_stat_session {
 	struct dentry		*file;
 };
 
-/* All of the sessions currently in use. Each stat file embeed one session */
+/* All of the sessions currently in use. Each stat file embed one session */
 static LIST_HEAD(all_stat_sessions);
 static DEFINE_MUTEX(all_stat_sessions_mutex);
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 7c9a2d8..c771af4 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -327,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
 			d_tracer, NULL, &sysprof_sample_fops);
 	if (entry)
 		return;
-	pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
+	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
 }
-- 
cgit v1.1


From 35ebf1caa4854ad5ba25f3a72967acc064147994 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 13:12:12 -0500
Subject: ftrace: show unlimited when traceon or traceoff has no counter

Impact: clean up

The traceon and traceoff function probes are confusing to developers
to what happens when a counter is not specified. This should help
clear things up.

 # echo "*:traceoff" > set_ftrace_filter
 # cat /debug/tracing/set_ftrace_filter

  #### all functions enabled ####
  do_fork:traceoff:unlimited

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 6ea73ed..4c113a8 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -296,7 +296,9 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 	else
 		seq_printf(m, "traceoff");
 
-	if (count != -1)
+	if (count == -1)
+		seq_printf(m, ":unlimited\n");
+	else
 		seq_printf(m, ":count=%ld", count);
 	seq_putc(m, '\n');
 
-- 
cgit v1.1


From 6eaaa5d57e76c454479833fc8594cd7c3b75c789 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 11 Feb 2009 02:25:00 +0100
Subject: tracing/core: use appropriate waiting on trace_pipe

Impact: api and pipe waiting change

Currently, the waiting used in tracing_read_pipe() is done through a
100 msecs schedule_timeout() loop which periodically check if there
are traces on the buffer.

This can cause small latencies for programs which are reading the incoming
events.

This patch makes the reader waiting for the trace_wait waitqueue except
for few tracers such as the sched and functions tracers which might be
already hold the runqueue lock while waking up the reader.

This is performed through a new callback wait_pipe() on struct tracer.
If none is implemented on a specific tracer, the default waiting for
trace_wait queue is attached.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 | 62 ++++++++++++++++++++++++------------
 kernel/trace/trace.h                 | 25 +++++++++++++--
 kernel/trace/trace_functions.c       |  1 +
 kernel/trace/trace_functions_graph.c |  1 +
 kernel/trace/trace_sched_switch.c    |  1 +
 kernel/trace/trace_sched_wakeup.c    |  1 +
 6 files changed, 67 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc61e82..881a944 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -499,6 +499,9 @@ __acquires(kernel_lock)
 	else
 		if (!type->flags->opts)
 			type->flags->opts = dummy_tracer_opt;
+	if (!type->wait_pipe)
+		type->wait_pipe = default_wait_pipe;
+
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	if (type->selftest && !tracing_selftest_disabled) {
@@ -1064,7 +1067,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_prio		= wakee->prio;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
-	trace_buffer_unlock_commit(tr, event, flags, pc);
+
+	ring_buffer_unlock_commit(tr->buffer, event);
+	ftrace_trace_stack(tr, flags, 6, pc);
+	ftrace_trace_userstack(tr, flags, pc);
 }
 
 void
@@ -2392,6 +2398,38 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	}
 }
 
+
+void default_wait_pipe(struct trace_iterator *iter)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
+
+	if (trace_empty(iter))
+		schedule();
+
+	finish_wait(&trace_wait, &wait);
+}
+
+/*
+ * This is a make-shift waitqueue.
+ * A tracer might use this callback on some rare cases:
+ *
+ *  1) the current tracer might hold the runqueue lock when it wakes up
+ *     a reader, hence a deadlock (sched, function, and function graph tracers)
+ *  2) the function tracers, trace all functions, we don't want
+ *     the overhead of calling wake_up and friends
+ *     (and tracing them too)
+ *
+ *     Anyway, this is really very primitive wakeup.
+ */
+void poll_wait_pipe(struct trace_iterator *iter)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	/* sleep for 100 msecs, and try again. */
+	schedule_timeout(HZ / 10);
+}
+
 /* Must be called with trace_types_lock mutex held. */
 static int tracing_wait_pipe(struct file *filp)
 {
@@ -2403,30 +2441,14 @@ static int tracing_wait_pipe(struct file *filp)
 			return -EAGAIN;
 		}
 
-		/*
-		 * This is a make-shift waitqueue. The reason we don't use
-		 * an actual wait queue is because:
-		 *  1) we only ever have one waiter
-		 *  2) the tracing, traces all functions, we don't want
-		 *     the overhead of calling wake_up and friends
-		 *     (and tracing them too)
-		 *     Anyway, this is really very primitive wakeup.
-		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-		iter->tr->waiter = current;
-
 		mutex_unlock(&trace_types_lock);
 
-		/* sleep for 100 msecs, and try again. */
-		schedule_timeout(HZ/10);
+		iter->trace->wait_pipe(iter);
 
 		mutex_lock(&trace_types_lock);
 
-		iter->tr->waiter = NULL;
-
-		if (signal_pending(current)) {
+		if (signal_pending(current))
 			return -EINTR;
-		}
 
 		if (iter->trace != current_trace)
 			return 0;
@@ -2442,8 +2464,6 @@ static int tracing_wait_pipe(struct file *filp)
 		 */
 		if (!tracer_enabled && iter->pos)
 			break;
-
-		continue;
 	}
 
 	return 1;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dbff020..eed732c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -337,18 +337,34 @@ struct tracer_flags {
 #define TRACER_OPT(s, b)	.name = #s, .bit = b
 
 
-/*
- * A specific tracer, represented by methods that operate on a trace array:
+/**
+ * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * @name: the name chosen to select it on the available_tracers file
+ * @init: called when one switches to this tracer (echo name > current_tracer)
+ * @reset: called when one switches to another tracer
+ * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
+ * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @open: called when the trace file is opened
+ * @pipe_open: called when the trace_pipe file is opened
+ * @wait_pipe: override how the user waits for traces on trace_pipe
+ * @close: called when the trace file is released
+ * @read: override the default read callback on trace_pipe
+ * @splice_read: override the default splice_read callback on trace_pipe
+ * @selftest: selftest to run on boot (see trace_selftest.c)
+ * @print_headers: override the first lines that describe your columns
+ * @print_line: callback that prints a trace
+ * @set_flag: signals one of your private flags changed (trace_options file)
+ * @flags: your private flags
  */
 struct tracer {
 	const char		*name;
-	/* Your tracer should raise a warning if init fails */
 	int			(*init)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
 	void			(*start)(struct trace_array *tr);
 	void			(*stop)(struct trace_array *tr);
 	void			(*open)(struct trace_iterator *iter);
 	void			(*pipe_open)(struct trace_iterator *iter);
+	void			(*wait_pipe)(struct trace_iterator *iter);
 	void			(*close)(struct trace_iterator *iter);
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
@@ -432,6 +448,9 @@ void tracing_generic_entry_update(struct trace_entry *entry,
 				  unsigned long flags,
 				  int pc);
 
+void default_wait_pipe(struct trace_iterator *iter);
+void poll_wait_pipe(struct trace_iterator *iter);
+
 void ftrace(struct trace_array *tr,
 			    struct trace_array_cpu *data,
 			    unsigned long ip,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 4c113a8..c9a0b7d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -225,6 +225,7 @@ static struct tracer function_trace __read_mostly =
 	.init		= function_trace_init,
 	.reset		= function_trace_reset,
 	.start		= function_trace_start,
+	.wait_pipe	= poll_wait_pipe,
 	.flags		= &func_flags,
 	.set_flag	= func_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 519a0ca..0ff5cb6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -757,6 +757,7 @@ static struct tracer graph_trace __read_mostly = {
 	.name	     	= "function_graph",
 	.open		= graph_trace_open,
 	.close		= graph_trace_close,
+	.wait_pipe	= poll_wait_pipe,
 	.init	     	= graph_trace_init,
 	.reset	     	= graph_trace_reset,
 	.print_line	= print_graph_function,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 82fbb5a..77132c2 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -221,6 +221,7 @@ static struct tracer sched_switch_trace __read_mostly =
 	.reset		= sched_switch_trace_reset,
 	.start		= sched_switch_trace_start,
 	.stop		= sched_switch_trace_stop,
+	.wait_pipe	= poll_wait_pipe,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_sched_switch,
 #endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 276c51a..db55f7a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -380,6 +380,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 	.reset		= wakeup_tracer_reset,
 	.start		= wakeup_tracer_start,
 	.stop		= wakeup_tracer_stop,
+	.wait_pipe	= poll_wait_pipe,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
-- 
cgit v1.1


From fa7c7f6e11f70d62505074a8b30a776236850dec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 11 Feb 2009 02:51:30 +0100
Subject: tracing/core: remove unused parameter in tracing_fill_pipe_page()

Impact: cleanup

The struct page *pages parameter is unused.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 881a944..e1f3b99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2571,8 +2571,7 @@ static struct pipe_buf_operations tracing_pipe_buf_ops = {
 };
 
 static size_t
-tracing_fill_pipe_page(struct page *pages, size_t rem,
-			struct trace_iterator *iter)
+tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
 {
 	size_t count;
 	int ret;
@@ -2649,7 +2648,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		if (!pages[i])
 			break;
 
-		rem = tracing_fill_pipe_page(pages[i], rem, iter);
+		rem = tracing_fill_pipe_page(rem, iter);
 
 		/* Copy the data into the page, so we can start over. */
 		ret = trace_seq_to_buffer(&iter->seq,
-- 
cgit v1.1


From 712406a6bf59ebf4a00358bb59a4a2a1b2953d90 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Feb 2009 10:54:03 -0800
Subject: tracing/function-graph-tracer: make arch generic push pop functions

There is nothing really arch specific of the push and pop functions
used by the function graph tracer. This patch moves them to generic
code.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 75 ++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e..dce71a5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -42,6 +42,81 @@ static struct tracer_flags tracer_flags = {
 /* pid on the last trace processed */
 static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
 
+/* Add a function return address to the trace stack on thread info.*/
+int
+ftrace_push_return_trace(unsigned long ret, unsigned long long time,
+			 unsigned long func, int *depth)
+{
+	int index;
+
+	if (!current->ret_stack)
+		return -EBUSY;
+
+	/* The return trace stack is full */
+	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+		atomic_inc(&current->trace_overrun);
+		return -EBUSY;
+	}
+
+	index = ++current->curr_ret_stack;
+	barrier();
+	current->ret_stack[index].ret = ret;
+	current->ret_stack[index].func = func;
+	current->ret_stack[index].calltime = time;
+	*depth = index;
+
+	return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+	int index;
+
+	index = current->curr_ret_stack;
+
+	if (unlikely(index < 0)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic, otherwise we have no where to go */
+		*ret = (unsigned long)panic;
+		return;
+	}
+
+	*ret = current->ret_stack[index].ret;
+	trace->func = current->ret_stack[index].func;
+	trace->calltime = current->ret_stack[index].calltime;
+	trace->overrun = atomic_read(&current->trace_overrun);
+	trace->depth = index;
+	barrier();
+	current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+	struct ftrace_graph_ret trace;
+	unsigned long ret;
+
+	ftrace_pop_return_trace(&trace, &ret);
+	trace.rettime = cpu_clock(raw_smp_processor_id());
+	ftrace_graph_return(&trace);
+
+	if (unlikely(!ret)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic. What else to do? */
+		ret = (unsigned long)panic;
+	}
+
+	return ret;
+}
+
 static int graph_trace_init(struct trace_array *tr)
 {
 	int cpu, ret;
-- 
cgit v1.1


From d1f9cbd78841f1a797c77e9117e4882f932c2ef6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 18 Feb 2009 04:25:25 +0100
Subject: tracing/function-graph-tracer: fix traces weirdness while absolute
 time printing

Impact: trace output cleanup/reordering

When an interrupt occurs and and the abstime option is selected:

  echo funcgraph-abstime > /debug/tracing/trace_options

then we observe broken traces:

30581.025422 |   0)   Xorg-4291    |   0.503 us    |      idle_cpu();
30581.025424 |   0)   Xorg-4291    |   2.576 us    |    }
30581.025424 |   0)   Xorg-4291    | + 75.771 us   |  }
 0)   Xorg-4291    |   <========== |
30581.025425 |   0)   Xorg-4291    |               |  schedule() {
30581.025426 |   0)   Xorg-4291    |               |    __schedule() {
30581.025426 |   0)   Xorg-4291    |   0.705 us    |      _spin_lock_irq();

With this patch, the interrupts output better adapts
to absolute time printing:

  414.856543 |   1)   Xorg-4279    |   8.816 us    |                        }
  414.856544 |   1)   Xorg-4279    |   0.525 us    |                        rcu_irq_exit();
  414.856545 |   1)   Xorg-4279    |   0.526 us    |                        idle_cpu();
  414.856546 |   1)   Xorg-4279    | + 12.157 us   |                      }
  414.856549 |   1)   Xorg-4279    | ! 104.114 us  |                    }
  414.856549 |   1)   Xorg-4279    |   <========== |
  414.856549 |   1)   Xorg-4279    | ! 107.944 us  |                  }
  414.856550 |   1)   Xorg-4279    | ! 137.010 us  |                }
  414.856551 |   1)   Xorg-4279    |   0.624 us    |                _read_unlock();
  414.856552 |   1)   Xorg-4279    | ! 140.930 us  |              }
  414.856552 |   1)   Xorg-4279    | ! 166.159 us  |            }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 50 +++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6c7738e..8f4004a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -351,16 +351,35 @@ print_graph_overhead(unsigned long long duration, struct trace_seq *s)
 	return trace_seq_printf(s, "  ");
 }
 
+static int print_graph_abs_time(u64 t, struct trace_seq *s)
+{
+	unsigned long usecs_rem;
+
+	usecs_rem = do_div(t, NSEC_PER_SEC);
+	usecs_rem /= 1000;
+
+	return trace_seq_printf(s, "%5lu.%06lu |  ",
+			(unsigned long)t, usecs_rem);
+}
+
 static enum print_line_t
-print_graph_irq(struct trace_seq *s, unsigned long addr,
+print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 		enum trace_type type, int cpu, pid_t pid)
 {
 	int ret;
+	struct trace_seq *s = &iter->seq;
 
 	if (addr < (unsigned long)__irqentry_text_start ||
 		addr >= (unsigned long)__irqentry_text_end)
 		return TRACE_TYPE_UNHANDLED;
 
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
@@ -446,17 +465,6 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 }
 
-static int print_graph_abs_time(u64 t, struct trace_seq *s)
-{
-	unsigned long usecs_rem;
-
-	usecs_rem = do_div(t, 1000000000);
-	usecs_rem /= 1000;
-
-	return trace_seq_printf(s, "%5lu.%06lu |  ",
-			(unsigned long)t, usecs_rem);
-}
-
 /* Case of a leaf function on its call entry */
 static enum print_line_t
 print_graph_entry_leaf(struct trace_iterator *iter,
@@ -561,7 +569,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Interrupt */
-	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
+	ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -581,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 	/* Proc */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, ent->pid);
+		ret = print_graph_proc(s, pid);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 
@@ -605,11 +613,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int i;
 	int ret;
 	int cpu = iter->cpu;
-	pid_t *last_pid = iter->private;
+	pid_t *last_pid = iter->private, pid = ent->pid;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Absolute time */
@@ -668,7 +676,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
+	ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -684,6 +692,10 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	int cpu = iter->cpu;
 	pid_t *last_pid = iter->private;
 
+	/* Pid */
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	/* Absolute time */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
 		ret = print_graph_abs_time(iter->ts, s);
@@ -691,10 +703,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
-- 
cgit v1.1


From 00a8bf859331e349713274825e6fbf20bf2ac15a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 19 Feb 2009 13:01:37 +0100
Subject: tracing/function-graph-tracer: fix merge

Merge artifact: pid got changed to ent->pid meanwhile.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8f4004a..c009553 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -589,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 	/* Proc */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, pid);
+		ret = print_graph_proc(s, ent->pid);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.1


From f9349a8f978929a0c71d2c42ae299f7d462c239d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 19 Feb 2009 21:13:12 +0100
Subject: tracing/function-graph-tracer: make set_graph_function file support
 ftrace regex

Impact: trace only functions matching a pattern

The set_graph_function file let one to trace only one or several
chosen functions and follow all their code flow.

Currently, only a constant function name is allowed so this patch
allows the ftrace_regex functions:

- matches all functions that end with "name":
  echo *name > set_graph_function

- matches all functions that begin with "name":
  echo name* > set_graph_function

- matches all functions that contains "name":
  echo *name* > set_graph_function

Example:

echo mutex* > set_graph_function

 0)               |  mutex_lock_nested() {
 0)   0.563 us    |    __might_sleep();
 0)   2.072 us    |  }
 0)               |  mutex_unlock() {
 0)   1.036 us    |    __mutex_unlock_slowpath();
 0)   2.433 us    |  }
 0)               |  mutex_unlock() {
 0)   0.691 us    |    __mutex_unlock_slowpath();
 0)   1.787 us    |  }
 0)               |  mutex_lock_interruptible_nested() {
 0)   0.548 us    |    __might_sleep();
 0)   1.945 us    |  }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 56 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7dd5a2b..cf59f4c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1895,6 +1895,10 @@ static void *g_start(struct seq_file *m, loff_t *pos)
 
 	mutex_lock(&graph_lock);
 
+	/* Nothing, tell g_show to print all functions are enabled */
+	if (!ftrace_graph_count && !*pos)
+		return (void *)1;
+
 	p = g_next(m, p, pos);
 
 	return p;
@@ -1913,6 +1917,11 @@ static int g_show(struct seq_file *m, void *v)
 	if (!ptr)
 		return 0;
 
+	if (ptr == (unsigned long *)1) {
+		seq_printf(m, "#### all functions enabled ####\n");
+		return 0;
+	}
+
 	kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
 
 	seq_printf(m, "%s\n", str);
@@ -1966,38 +1975,51 @@ ftrace_graph_read(struct file *file, char __user *ubuf,
 }
 
 static int
-ftrace_set_func(unsigned long *array, int idx, char *buffer)
+ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 {
-	char str[KSYM_SYMBOL_LEN];
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
+	int search_len;
 	int found = 0;
-	int j;
+	int type, not;
+	char *search;
+	bool exists;
+	int i;
 
 	if (ftrace_disabled)
 		return -ENODEV;
 
+	/* decode regex */
+	type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
+	if (not)
+		return -EINVAL;
+
+	search_len = strlen(search);
+
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
+		if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+			break;
+
 		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
 			continue;
 
-		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-		if (strcmp(str, buffer) == 0) {
-			/* Return 1 if we add it to the array */
-			found = 1;
-			for (j = 0; j < idx; j++)
-				if (array[j] == rec->ip) {
-					found = 0;
+		if (ftrace_match_record(rec, search, search_len, type)) {
+			/* ensure it is not already in the array */
+			exists = false;
+			for (i = 0; i < *idx; i++)
+				if (array[i] == rec->ip) {
+					exists = true;
 					break;
 				}
-			if (found)
-				array[idx] = rec->ip;
-			goto out;
+			if (!exists) {
+				array[(*idx)++] = rec->ip;
+				found = 1;
+			}
 		}
 	} while_for_each_ftrace_rec();
- out:
+
 	mutex_unlock(&ftrace_lock);
 
 	return found ? 0 : -EINVAL;
@@ -2066,13 +2088,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
 	}
 	buffer[index] = 0;
 
-	/* we allow only one at a time */
-	ret = ftrace_set_func(array, ftrace_graph_count, buffer);
+	/* we allow only one expression at a time */
+	ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
 	if (ret)
 		goto out;
 
-	ftrace_graph_count++;
-
 	file->f_pos += read;
 
 	ret = read;
-- 
cgit v1.1


From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 13:35:06 -0500
Subject: ftrace: allow archs to preform pre and post process for code
 modification

This patch creates the weak functions: ftrace_arch_code_modify_prepare
and ftrace_arch_code_modify_post_process that are called before and
after the stop machine is called to modify the kernel text.

If the arch needs to do pre or post processing, it only needs to define
these functions.

[ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_*
          over using ftrace_arch_modify_* ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913d..72316d9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 	return 1;
 }
 
+/*
+ * archs can override this function if they must do something
+ * before the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_prepare(void)
+{
+	return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * after the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_post_process(void)
+{
+	return 0;
+}
+
 static int __ftrace_modify_code(void *data)
 {
 	int *command = data;
@@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data)
 
 static void ftrace_run_update_code(int command)
 {
+	int ret;
+
+	ret = ftrace_arch_code_modify_prepare();
+	FTRACE_WARN_ON(ret);
+	if (ret)
+		return;
+
 	stop_machine(__ftrace_modify_code, &command, NULL);
+
+	ret = ftrace_arch_code_modify_post_process();
+	FTRACE_WARN_ON(ret);
 }
 
 static ftrace_func_t saved_ftrace_func;
-- 
cgit v1.1


From 4377245aa93b65b6597e4b7bb460fb9abc48b56b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Feb 2009 13:41:27 -0500
Subject: ftrace: break out modify loop immediately on detection of error

Impact: added precaution on failure detection

Break out of the modifying loop as soon as a failure is detected.
This is just an added precaution found by code review and was not
found by any bug chasing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72316d9..11ad796 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -561,8 +561,11 @@ static void ftrace_replace_code(int enable)
 				if ((system_state == SYSTEM_BOOTING) ||
 				    !core_kernel_text(rec->ip)) {
 					ftrace_free_rec(rec);
-				} else
+				} else {
 					ftrace_bug(failed, rec->ip);
+					/* Stop processing */
+					return;
+				}
 			}
 		}
 	}
-- 
cgit v1.1


From 5e01cb695d29619dd551bac7d6aa4ef1dc8ebc95 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 24 Feb 2009 13:55:18 +0100
Subject: x86, ftrace: fix section mismatch in hw-branch-tracer

Fix an invalid memory reference problem when cpu hotplug support is
disabled and the hw-branch-tracer is set as current tracer.

Initializing the tracer calls bts_trace_init() which has already
been freed at this time.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 3561aac..3335e80 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -127,20 +127,18 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int __cpuinit bts_trace_init(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
 {
 	hw_branch_trace = tr;
 
-	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	bts_trace_start(tr);
 
 	return 0;
 }
 
-static void __cpuinit bts_trace_reset(struct trace_array *tr)
+static void bts_trace_reset(struct trace_array *tr)
 {
 	bts_trace_stop(tr);
-	unregister_hotcpu_notifier(&bts_hotcpu_notifier);
 }
 
 static void bts_trace_print_header(struct seq_file *m)
@@ -299,6 +297,7 @@ struct tracer bts_tracer __read_mostly =
 
 __init static int init_bts_trace(void)
 {
+	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	return register_tracer(&bts_tracer);
 }
 device_initcall(init_bts_trace);
-- 
cgit v1.1


From b77e38aa240c3bd9c55c98b9f7c81541e042eae5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Feb 2009 10:21:36 -0500
Subject: tracing: add event trace infrastructure

This patch creates the event tracing infrastructure of ftrace.
It will create the files:

 /debug/tracing/available_events
 /debug/tracing/set_event

The available_events will list the trace points that have been
registered with the event tracer.

set_events will allow the user to enable or disable an event hook.

example:

 # echo sched_wakeup > /debug/tracing/set_event

Will enable the sched_wakeup event (if it is registered).

 # echo "!sched_wakeup" >> /debug/tracing/set_event

Will disable the sched_wakeup event (and only that event).

 # echo > /debug/tracing/set_event

Will disable all events (notice the '>')

 # cat /debug/tracing/available_events > /debug/tracing/set_event

Will enable all registered event hooks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Kconfig        |   9 ++
 kernel/trace/Makefile       |   1 +
 kernel/trace/trace_events.c | 280 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.h |  52 ++++++++
 4 files changed, 342 insertions(+)
 create mode 100644 kernel/trace/trace_events.c
 create mode 100644 kernel/trace/trace_events.h

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 07877f4..999c6a2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -159,6 +159,15 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
+config EVENT_TRACER
+	bool "Trace various events in the kernel"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer hooks to various trace points in the kernel
+	  allowing the user to pick and choose which trace point they
+	  want to trace.
+
 config BOOT_TRACER
 	bool "Trace boot initcalls"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 627090b..c736356 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -38,5 +38,6 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
+obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 0000000..05bc80e
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,280 @@
+/*
+ * event tracer
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+
+#include "trace_events.h"
+
+void event_trace_printk(unsigned long ip, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	tracing_record_cmdline(current);
+	trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	va_end(ap);
+}
+
+static void ftrace_clear_events(void)
+{
+	struct ftrace_event_call *call = (void *)__start_ftrace_events;
+
+
+	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+
+		if (call->enabled) {
+			call->enabled = 0;
+			call->unregfunc();
+		}
+		call++;
+	}
+}
+
+static int ftrace_set_clr_event(char *buf, int set)
+{
+	struct ftrace_event_call *call = (void *)__start_ftrace_events;
+
+
+	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+
+		if (strcmp(buf, call->name) != 0) {
+			call++;
+			continue;
+		}
+
+		if (set) {
+			/* Already set? */
+			if (call->enabled)
+				return 0;
+			call->enabled = 1;
+			call->regfunc();
+		} else {
+			/* Already cleared? */
+			if (!call->enabled)
+				return 0;
+			call->enabled = 0;
+			call->unregfunc();
+		}
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/* 128 should be much more than enough */
+#define EVENT_BUF_SIZE		127
+
+static ssize_t
+ftrace_event_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	size_t read = 0;
+	int i, set = 1;
+	ssize_t ret;
+	char *buf;
+	char ch;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		return ret;
+	read++;
+	cnt--;
+
+	/* skip white space */
+	while (cnt && isspace(ch)) {
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			return ret;
+		read++;
+		cnt--;
+	}
+
+	/* Only white space found? */
+	if (isspace(ch)) {
+		file->f_pos += read;
+		ret = read;
+		return ret;
+	}
+
+	buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (cnt > EVENT_BUF_SIZE)
+		cnt = EVENT_BUF_SIZE;
+
+	i = 0;
+	while (cnt && !isspace(ch)) {
+		if (!i && ch == '!')
+			set = 0;
+		else
+			buf[i++] = ch;
+
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out_free;
+		read++;
+		cnt--;
+	}
+	buf[i] = 0;
+
+	file->f_pos += read;
+
+	ret = ftrace_set_clr_event(buf, set);
+	if (ret)
+		goto out_free;
+
+	ret = read;
+
+ out_free:
+	kfree(buf);
+
+	return ret;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_event_call *call = m->private;
+	struct ftrace_event_call *next = call;
+
+	(*pos)++;
+
+	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		return NULL;
+
+	m->private = ++next;
+
+	return call;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	return t_next(m, NULL, pos);
+}
+
+static void *
+s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_event_call *call = m->private;
+	struct ftrace_event_call *next;
+
+	(*pos)++;
+
+ retry:
+	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		return NULL;
+
+	if (!call->enabled) {
+		call++;
+		goto retry;
+	}
+
+	next = call;
+	m->private = ++next;
+
+	return call;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	return s_next(m, NULL, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_event_call *call = v;
+
+	seq_printf(m, "%s\n", call->name);
+
+	return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int
+ftrace_event_seq_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	const struct seq_operations *seq_ops;
+
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND))
+		ftrace_clear_events();
+
+	seq_ops = inode->i_private;
+	ret = seq_open(file, seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = __start_ftrace_events;
+	}
+	return ret;
+}
+
+static const struct seq_operations show_event_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static const struct seq_operations show_set_event_seq_ops = {
+	.start = s_start,
+	.next = s_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static const struct file_operations ftrace_avail_fops = {
+	.open = ftrace_event_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static const struct file_operations ftrace_set_event_fops = {
+	.open = ftrace_event_seq_open,
+	.read = seq_read,
+	.write = ftrace_event_write,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static __init int event_trace_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	entry = debugfs_create_file("available_events", 0444, d_tracer,
+				    (void *)&show_event_seq_ops,
+				    &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_events' entry\n");
+
+	entry = debugfs_create_file("set_event", 0644, d_tracer,
+				    (void *)&show_set_event_seq_ops,
+				    &ftrace_set_event_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_event' entry\n");
+
+	return 0;
+}
+fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
new file mode 100644
index 0000000..39342f8
--- /dev/null
+++ b/kernel/trace/trace_events.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KERNEL_TRACE_EVENTS_H
+#define _LINUX_KERNEL_TRACE_EVENTS_H
+
+#include <linux/ftrace.h>
+#include "trace.h"
+
+struct ftrace_event_call {
+	char		*name;
+	int		enabled;
+	int		(*regfunc)(void);
+	void		(*unregfunc)(void);
+};
+
+
+#undef TPFMT
+#define TPFMT(fmt, args...)	fmt "\n", ##args
+
+#undef DEFINE_TRACE_FMT
+#define DEFINE_TRACE_FMT(call, proto, args, fmt)			\
+static void ftrace_event_##call(proto)					\
+{									\
+	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
+}									\
+									\
+static int ftrace_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_event_##call);		\
+	if (!ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call);				\
+	return ret;							\
+}									\
+									\
+static void ftrace_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_event_##call);			\
+}									\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+}
+
+void event_trace_printk(unsigned long ip, const char *fmt, ...);
+extern unsigned long __start_ftrace_events[];
+extern unsigned long __stop_ftrace_events[];
+
+#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */
-- 
cgit v1.1


From f3fe8e4a38fd19dbb3f8ffb1826aa840ae304a65 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Feb 2009 10:22:57 -0500
Subject: tracing: add schedule events to event trace

This patch changes the trace/sched.h to use the DECLARE_TRACE_FMT
such that they are automatically registered with the event tracer.

And it also adds the tracing sched headers to kernel/trace/events.c

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Makefile |  1 +
 kernel/trace/events.c | 13 +++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 kernel/trace/events.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c736356..664b6c0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -39,5 +39,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
+obj-$(CONFIG_EVENT_TRACER) += events.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
new file mode 100644
index 0000000..38c89ee
--- /dev/null
+++ b/kernel/trace/events.c
@@ -0,0 +1,13 @@
+/*
+ * This is the place to register all trace points as events.
+ * Include the trace/<type>.h at the top.
+ * Include the trace/<type>_event_types.h at the bottom.
+ */
+
+/* trace/<type>.h here */
+#include <trace/sched.h>
+
+#include "trace_events.h"
+
+/* trace/<type>_event_types.h here */
+#include <trace/sched_event_types.h>
-- 
cgit v1.1


From 1473e4417c79f12d91ef91a469699bfa911f510f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Feb 2009 14:15:08 -0500
Subject: tracing: make event directory structure

This patch adds the directory /debug/tracing/events/ that will contain
all the registered trace points.

 # ls /debug/tracing/events/
sched_kthread_stop      sched_process_fork  sched_switch
sched_kthread_stop_ret  sched_process_free  sched_wait_task
sched_migrate_task      sched_process_wait  sched_wakeup
sched_process_exit      sched_signal_send   sched_wakeup_new

 # ls /debug/tracing/events/sched_switch/
enable

 # cat /debug/tracing/events/sched_switch/enable
1

 # cat /debug/tracing/set_event
sched_switch

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 137 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_events.h |   7 ++-
 2 files changed, 137 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05bc80e..3bcb9df 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -12,6 +12,11 @@
 
 #include "trace_events.h"
 
+#define events_for_each(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...)
 {
 	va_list ap;
@@ -39,15 +44,16 @@ static void ftrace_clear_events(void)
 
 static int ftrace_set_clr_event(char *buf, int set)
 {
-	struct ftrace_event_call *call = (void *)__start_ftrace_events;
+	struct ftrace_event_call *call = __start_ftrace_events;
 
 
-	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+	events_for_each(call) {
 
-		if (strcmp(buf, call->name) != 0) {
-			call++;
+		if (!call->name)
+			continue;
+
+		if (strcmp(buf, call->name) != 0)
 			continue;
-		}
 
 		if (set) {
 			/* Already set? */
@@ -223,6 +229,67 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static ssize_t
+event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char *buf;
+
+	if (call->enabled)
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[64];
+	unsigned long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+		if (!call->enabled)
+			break;
+
+		call->enabled = 0;
+		call->unregfunc();
+		break;
+	case 1:
+		if (call->enabled)
+			break;
+
+		call->enabled = 1;
+		call->regfunc();
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -252,10 +319,59 @@ static const struct file_operations ftrace_set_event_fops = {
 	.release = seq_release,
 };
 
+static const struct file_operations ftrace_enable_fops = {
+	.open = tracing_open_generic,
+	.read = event_enable_read,
+	.write = event_enable_write,
+};
+
+static struct dentry *event_trace_events_dir(void)
+{
+	static struct dentry *d_tracer;
+	static struct dentry *d_events;
+
+	if (d_events)
+		return d_events;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return NULL;
+
+	d_events = debugfs_create_dir("events", d_tracer);
+	if (!d_events)
+		pr_warning("Could not create debugfs "
+			   "'events' directory\n");
+
+	return d_events;
+}
+
+static int
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+{
+	struct dentry *entry;
+
+	call->dir = debugfs_create_dir(call->name, d_events);
+	if (!call->dir) {
+		pr_warning("Could not create debugfs "
+			   "'%s' directory\n", call->name);
+		return -1;
+	}
+
+	entry = debugfs_create_file("enable", 0644, call->dir, call,
+				    &ftrace_enable_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/enable' entry\n", call->name);
+
+	return 0;
+}
+
 static __init int event_trace_init(void)
 {
+	struct ftrace_event_call *call = __start_ftrace_events;
 	struct dentry *d_tracer;
 	struct dentry *entry;
+	struct dentry *d_events;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -275,6 +391,17 @@ static __init int event_trace_init(void)
 		pr_warning("Could not create debugfs "
 			   "'set_event' entry\n");
 
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return 0;
+
+	events_for_each(call) {
+		/* The linker may leave blanks */
+		if (!call->name)
+			continue;
+		event_create_dir(call, d_events);
+	}
+
 	return 0;
 }
 fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
index 39342f8..cb8455b 100644
--- a/kernel/trace/trace_events.h
+++ b/kernel/trace/trace_events.h
@@ -1,11 +1,13 @@
 #ifndef _LINUX_KERNEL_TRACE_EVENTS_H
 #define _LINUX_KERNEL_TRACE_EVENTS_H
 
+#include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include "trace.h"
 
 struct ftrace_event_call {
 	char		*name;
+	struct dentry	*dir;
 	int		enabled;
 	int		(*regfunc)(void);
 	void		(*unregfunc)(void);
@@ -39,6 +41,7 @@ static void ftrace_unreg_event_##call(void)				\
 }									\
 									\
 static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
 	.regfunc		= ftrace_reg_event_##call,		\
@@ -46,7 +49,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 }
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern unsigned long __start_ftrace_events[];
-extern unsigned long __stop_ftrace_events[];
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
 
 #endif /* _LINUX_KERNEL_TRACE_EVENTS_H */
-- 
cgit v1.1


From 2d542cf34264ac92e9e7ac55c0b096b066d569d2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 08:40:09 +0100
Subject: tracing/hw-branch-tracing: convert bts-tracer mutex to a spinlock

Impact: fix CPU hotplug lockup

bts_hotcpu_handler() is called with irqs disabled, so using mutex_lock()
is a no-no.

All the BTS codepaths here are atomic (they do not schedule), so using
a spinlock is the right solution.

Cc: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 57 ++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 3335e80..7bfdf4c 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -3,17 +3,15 @@
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- *
  */
-
-#include <linux/module.h>
-#include <linux/fs.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
-#include <linux/kallsyms.h>
-#include <linux/mutex.h>
+#include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/fs.h>
 
 #include <asm/ds.h>
 
@@ -23,16 +21,17 @@
 
 #define SIZEOF_BTS (1 << 13)
 
-/* The tracer mutex protects the below per-cpu tracer array.
-   It needs to be held to:
-   - start tracing on all cpus
-   - stop tracing on all cpus
-   - start tracing on a single hotplug cpu
-   - stop tracing on a single hotplug cpu
-   - read the trace from all cpus
-   - read the trace from a single cpu
-*/
-static DEFINE_MUTEX(bts_tracer_mutex);
+/*
+ * The tracer lock protects the below per-cpu tracer array.
+ * It needs to be held to:
+ * - start tracing on all cpus
+ * - stop tracing on all cpus
+ * - start tracing on a single hotplug cpu
+ * - stop tracing on a single hotplug cpu
+ * - read the trace from all cpus
+ * - read the trace from a single cpu
+ */
+static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 
@@ -47,7 +46,7 @@ static struct trace_array *hw_branch_trace __read_mostly;
  * Start tracing on the current cpu.
  * The argument is ignored.
  *
- * pre: bts_tracer_mutex must be locked.
+ * pre: bts_tracer_lock must be locked.
  */
 static void bts_trace_start_cpu(void *arg)
 {
@@ -66,19 +65,19 @@ static void bts_trace_start_cpu(void *arg)
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	on_each_cpu(bts_trace_start_cpu, NULL, 1);
 	trace_hw_branches_enabled = 1;
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 /*
  * Stop tracing on the current cpu.
  * The argument is ignored.
  *
- * pre: bts_tracer_mutex must be locked.
+ * pre: bts_tracer_lock must be locked.
  */
 static void bts_trace_stop_cpu(void *arg)
 {
@@ -90,12 +89,12 @@ static void bts_trace_stop_cpu(void *arg)
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	trace_hw_branches_enabled = 0;
 	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
@@ -103,7 +102,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	if (!trace_hw_branches_enabled)
 		goto out;
@@ -119,7 +118,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 	}
 
  out:
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 	return NOTIFY_DONE;
 }
 
@@ -225,7 +224,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_mutex must be locked
+ * pre: bts_tracer_lock must be locked
  */
 static void trace_bts_cpu(void *arg)
 {
@@ -261,11 +260,11 @@ out:
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 static void trace_bts_close(struct trace_iterator *iter)
@@ -275,11 +274,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 void trace_hw_branch_oops(void)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	trace_bts_cpu(hw_branch_trace);
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 struct tracer bts_tracer __read_mostly =
-- 
cgit v1.1


From 886b5b73d71e4027d7dc6c14f5f7ab102201ea6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 11:03:44 +0100
Subject: tracing: remove /debug/tracing/latency_trace

Impact: remove old debug/tracing API

/debug/tracing/latency_trace is an old legacy format we kept from
the old latency tracer. Remove the file for now. If there's any
useful bit missing then we'll propagate any useful output bits into
the /debug/tracing/trace output.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e1f3b99..11ba100 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2938,11 +2938,6 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
 
-	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
-				    &global_trace, &tracing_lt_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'latency_trace' entry\n");
-
 	entry = debugfs_create_file("trace", 0444, d_tracer,
 				    &global_trace, &tracing_fops);
 	if (!entry)
-- 
cgit v1.1


From 15d0d3b3371227f846b9f644547fde081c7e1c0c Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 25 Feb 2009 06:22:45 +0100
Subject: generic IPI: simplify barriers and locking

Simplify the barriers in generic remote function call interrupt
code.

Firstly, just unconditionally take the lock and check the list
in the generic_call_function_single_interrupt IPI handler. As
we've just taken an IPI here, the chances are fairly high that
there will be work on the list for us, so do the locking
unconditionally. This removes the tricky lockless list_empty
check and dubious barriers. The change looks bigger than it is
because it is just removing an outer loop.

Secondly, clarify architecture specific IPI locking rules.
Generic code has no tools to impose any sane ordering on IPIs if
they go outside normal cache coherency, ergo the arch code must
make them appear to obey cache coherency as a "memory operation"
to initiate an IPI, and a "memory operation" to receive one.
This way at least they can be reasoned about in generic code,
and smp_mb used to provide ordering.

The combination of these two changes means that explict barriers
can be taken out of queue handling for the single case -- shared
data is explicitly locked, and ipi ordering must conform to
that, so no barriers needed. An extra barrier is needed in the
many handler, so as to ensure we load the list element after the
IPI is received.

Does any architecture actually *need* these barriers? For the
initiator I could see it, but for the handler I would be
surprised. So the other thing we could do for simplicity is just
to require that, rather than just matching with cache coherency,
we just require a full barrier before generating an IPI, and
after receiving an IPI. In which case, the smp_mb()s can go
away. But just for now, we'll be on the safe side and use the
barriers (they're in the slow case anyway).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-arch@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 83 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 44 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index bbedbb7..6ecf4b9 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,9 +74,16 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
 	spin_unlock_irqrestore(&dst->lock, flags);
 
 	/*
-	 * Make the list addition visible before sending the ipi.
+	 * The list addition should be visible before sending the IPI
+	 * handler locks the list to pull the entry off it because of
+	 * normal cache coherency rules implied by spinlocks.
+	 *
+	 * If IPIs can go out of order to the cache coherency protocol
+	 * in an architecture, sufficient synchronisation should be added
+	 * to arch code to make it appear to obey cache coherency WRT
+	 * locking and barrier primitives. Generic code isn't really equipped
+	 * to do the right thing...
 	 */
-	smp_mb();
 
 	if (ipi)
 		arch_send_call_function_single_ipi(cpu);
@@ -104,6 +111,14 @@ void generic_smp_call_function_interrupt(void)
 	int cpu = get_cpu();
 
 	/*
+	 * Ensure entry is visible on call_function_queue after we have
+	 * entered the IPI. See comment in smp_call_function_many.
+	 * If we don't have this, then we may miss an entry on the list
+	 * and never get another IPI to process it.
+	 */
+	smp_mb();
+
+	/*
 	 * It's ok to use list_for_each_rcu() here even though we may delete
 	 * 'pos', since list_del_rcu() doesn't clear ->next
 	 */
@@ -154,49 +169,37 @@ void generic_smp_call_function_single_interrupt(void)
 {
 	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
 	LIST_HEAD(list);
+	unsigned int data_flags;
 
-	/*
-	 * Need to see other stores to list head for checking whether
-	 * list is empty without holding q->lock
-	 */
-	smp_read_barrier_depends();
-	while (!list_empty(&q->list)) {
-		unsigned int data_flags;
-
-		spin_lock(&q->lock);
-		list_replace_init(&q->list, &list);
-		spin_unlock(&q->lock);
+	spin_lock(&q->lock);
+	list_replace_init(&q->list, &list);
+	spin_unlock(&q->lock);
 
-		while (!list_empty(&list)) {
-			struct call_single_data *data;
+	while (!list_empty(&list)) {
+		struct call_single_data *data;
 
-			data = list_entry(list.next, struct call_single_data,
-						list);
-			list_del(&data->list);
+		data = list_entry(list.next, struct call_single_data,
+					list);
+		list_del(&data->list);
 
-			/*
-			 * 'data' can be invalid after this call if
-			 * flags == 0 (when called through
-			 * generic_exec_single(), so save them away before
-			 * making the call.
-			 */
-			data_flags = data->flags;
-
-			data->func(data->info);
-
-			if (data_flags & CSD_FLAG_WAIT) {
-				smp_wmb();
-				data->flags &= ~CSD_FLAG_WAIT;
-			} else if (data_flags & CSD_FLAG_LOCK) {
-				smp_wmb();
-				data->flags &= ~CSD_FLAG_LOCK;
-			} else if (data_flags & CSD_FLAG_ALLOC)
-				kfree(data);
-		}
 		/*
-		 * See comment on outer loop
+		 * 'data' can be invalid after this call if
+		 * flags == 0 (when called through
+		 * generic_exec_single(), so save them away before
+		 * making the call.
 		 */
-		smp_read_barrier_depends();
+		data_flags = data->flags;
+
+		data->func(data->info);
+
+		if (data_flags & CSD_FLAG_WAIT) {
+			smp_wmb();
+			data->flags &= ~CSD_FLAG_WAIT;
+		} else if (data_flags & CSD_FLAG_LOCK) {
+			smp_wmb();
+			data->flags &= ~CSD_FLAG_LOCK;
+		} else if (data_flags & CSD_FLAG_ALLOC)
+			kfree(data);
 	}
 }
 
@@ -375,6 +378,8 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	/*
 	 * Make the list addition visible before sending the ipi.
+	 * (IPIs must obey or appear to obey normal Linux cache coherency
+	 * rules -- see comment in generic_exec_single).
 	 */
 	smp_mb();
 
-- 
cgit v1.1


From b04cc6b1f6398b0e0b60d37e27ce51b4899672ec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Feb 2009 03:22:28 +0100
Subject: tracing/core: introduce per cpu tracing files

Impact: split up tracing output per cpu

Currently, on the tracing debugfs directory, three files are
available to the user to let him extracting the trace output:

- trace is an iterator through the ring-buffer. It's a reader
  but not a consumer It doesn't block when no more traces are
  available.

- trace pretty similar to the former, except that it adds more
  informations such as prempt count, irq flag, ...

- trace_pipe is a reader and a consumer, it will also block
  waiting for traces if necessary (heh, yes it's a pipe).

The traces coming from different cpus are curretly mixed up
inside these files. Sometimes it messes up the informations,
sometimes it's useful, depending on what does the tracer
capture.

The tracing_cpumask file is useful to filter the output and
select only the traces captured a custom defined set of cpus.
But still it is not enough powerful to extract at the same time
one trace buffer per cpu.

So this patch creates a new directory: /debug/tracing/per_cpu/.

Inside this directory, you will now find one trace_pipe file and
one trace file per cpu.

Which means if you have two cpus, you will have:

 trace0
 trace1
 trace_pipe0
 trace_pipe1

And of course, reading these files will have the same effect
than with the usual tracing files, except that you will only see
the traces from the given cpu.

The original all-in-one cpu trace file are still available on
their original place.

Until now, only one consumer was allowed on trace_pipe to avoid
racy consuming on the ring-buffer. Now the approach changed a
bit, you can have only one consumer per cpu.

Which means you are allowed to read concurrently trace_pipe0 and
trace_pipe1 But you can't have two readers on trace_pipe0 or
trace_pipe1.

Following the same logic, if there is one reader on the common
trace_pipe, you can not have at the same time another reader on
trace_pipe0 or in trace_pipe1. Because in trace_pipe is already
a consumer in all cpu buffers in essence.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 168 +++++++++++++++++++++++++++++++++++++++++++--------
 kernel/trace/trace.h |   3 +
 2 files changed, 147 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 11ba100..aa58b7b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -98,6 +98,9 @@ static inline void ftrace_enable_cpu(void)
 
 static cpumask_var_t __read_mostly	tracing_buffer_mask;
 
+/* Define which cpu buffers are currently read in trace_pipe */
+static cpumask_var_t			tracing_reader_cpumask;
+
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu(cpu, tracing_buffer_mask)
 
@@ -1195,10 +1198,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 {
 	struct ring_buffer *buffer = iter->tr->buffer;
 	struct trace_entry *ent, *next = NULL;
+	int cpu_file = iter->cpu_file;
 	u64 next_ts = 0, ts;
 	int next_cpu = -1;
 	int cpu;
 
+	/*
+	 * If we are in a per_cpu trace file, don't bother by iterating over
+	 * all cpu and peek directly.
+	 */
+	if (cpu_file > TRACE_PIPE_ALL_CPU) {
+		if (ring_buffer_empty_cpu(buffer, cpu_file))
+			return NULL;
+		ent = peek_next_entry(iter, cpu_file, ent_ts);
+		if (ent_cpu)
+			*ent_cpu = cpu_file;
+
+		return ent;
+	}
+
 	for_each_tracing_cpu(cpu) {
 
 		if (ring_buffer_empty_cpu(buffer, cpu))
@@ -1279,6 +1297,7 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
+	int cpu_file = iter->cpu_file;
 	void *p = NULL;
 	loff_t l = 0;
 	int cpu;
@@ -1299,9 +1318,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 		ftrace_disable_cpu();
 
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_iter_reset(iter->buffer_iter[cpu]);
-		}
+		if (cpu_file == TRACE_PIPE_ALL_CPU) {
+			for_each_tracing_cpu(cpu)
+				ring_buffer_iter_reset(iter->buffer_iter[cpu]);
+		} else
+			ring_buffer_iter_reset(iter->buffer_iter[cpu_file]);
+
 
 		ftrace_enable_cpu();
 
@@ -1653,6 +1675,7 @@ static struct seq_operations tracer_seq_ops = {
 static struct trace_iterator *
 __tracing_open(struct inode *inode, struct file *file, int *ret)
 {
+	long cpu_file = (long) inode->i_private;
 	struct trace_iterator *iter;
 	struct seq_file *m;
 	int cpu;
@@ -1672,9 +1695,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
-		iter->tr = inode->i_private;
+		iter->tr = &global_trace;
 	iter->trace = current_trace;
 	iter->pos = -1;
+	iter->cpu_file = cpu_file;
 
 	/* Notify the tracer early; before we stop tracing. */
 	if (iter->trace && iter->trace->open)
@@ -1684,14 +1708,22 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	if (ring_buffer_overruns(iter->tr->buffer))
 		iter->iter_flags |= TRACE_FILE_ANNOTATE;
 
+	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
+		for_each_tracing_cpu(cpu) {
 
-	for_each_tracing_cpu(cpu) {
+			iter->buffer_iter[cpu] =
+				ring_buffer_read_start(iter->tr->buffer, cpu);
 
+			if (!iter->buffer_iter[cpu])
+				goto fail_buffer;
+		}
+	} else {
+		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_start(iter->tr->buffer, cpu);
+				ring_buffer_read_start(iter->tr->buffer, cpu);
 
 		if (!iter->buffer_iter[cpu])
-			goto fail_buffer;
+			goto fail;
 	}
 
 	/* TODO stop tracer */
@@ -1715,6 +1747,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
 	}
+fail:
 	mutex_unlock(&trace_types_lock);
 	kfree(iter);
 
@@ -2325,54 +2358,77 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static atomic_t tracing_reader;
-
 static int tracing_open_pipe(struct inode *inode, struct file *filp)
 {
+	long cpu_file = (long) inode->i_private;
 	struct trace_iterator *iter;
+	int ret = 0;
 
 	if (tracing_disabled)
 		return -ENODEV;
 
-	/* We only allow for reader of the pipe */
-	if (atomic_inc_return(&tracing_reader) != 1) {
-		atomic_dec(&tracing_reader);
-		return -EBUSY;
+	mutex_lock(&trace_types_lock);
+
+	/* We only allow one reader per cpu */
+	if (cpu_file == TRACE_PIPE_ALL_CPU) {
+		if (!cpumask_empty(tracing_reader_cpumask)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		cpumask_setall(tracing_reader_cpumask);
+	} else {
+		if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
+			cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
+		else {
+			ret = -EBUSY;
+			goto out;
+		}
 	}
 
 	/* create a buffer to store the information to pass to userspace */
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter)
-		return -ENOMEM;
+	if (!iter) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
 		kfree(iter);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
-	mutex_lock(&trace_types_lock);
-
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 
+	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	iter->trace = current_trace;
 	filp->private_data = iter;
 
 	if (iter->trace->pipe_open)
 		iter->trace->pipe_open(iter);
-	mutex_unlock(&trace_types_lock);
 
-	return 0;
+out:
+	mutex_unlock(&trace_types_lock);
+	return ret;
 }
 
 static int tracing_release_pipe(struct inode *inode, struct file *file)
 {
 	struct trace_iterator *iter = file->private_data;
 
+	mutex_lock(&trace_types_lock);
+
+	if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
+		cpumask_clear(tracing_reader_cpumask);
+	else
+		cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
+
+	mutex_unlock(&trace_types_lock);
+
 	free_cpumask_var(iter->started);
 	kfree(iter);
-	atomic_dec(&tracing_reader);
 
 	return 0;
 }
@@ -2911,6 +2967,59 @@ struct dentry *tracing_init_dentry(void)
 	return d_tracer;
 }
 
+static struct dentry *d_percpu;
+
+struct dentry *tracing_dentry_percpu(void)
+{
+	static int once;
+	struct dentry *d_tracer;
+
+	if (d_percpu)
+		return d_percpu;
+
+	d_tracer = tracing_init_dentry();
+
+	if (!d_tracer)
+		return NULL;
+
+	d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+
+	if (!d_percpu && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'per_cpu'\n");
+		return NULL;
+	}
+
+	return d_percpu;
+}
+
+static void tracing_init_debugfs_percpu(long cpu)
+{
+	struct dentry *d_percpu = tracing_dentry_percpu();
+	struct dentry *entry;
+	/* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */
+	char filename[17];
+
+	if (cpu > 999 || cpu < 0)
+		return;
+
+	/* per cpu trace_pipe */
+	sprintf(filename, "trace_pipe%ld", cpu);
+
+	entry = debugfs_create_file(filename, 0444, d_percpu,
+				(void *) cpu, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs '%s' entry\n", filename);
+
+	/* per cpu trace */
+	sprintf(filename, "trace%ld", cpu);
+
+	entry = debugfs_create_file(filename, 0444, d_percpu,
+				(void *) cpu, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs '%s' entry\n", filename);
+}
+
 #ifdef CONFIG_FTRACE_SELFTEST
 /* Let selftest have access to static functions in this file */
 #include "trace_selftest.c"
@@ -2920,6 +3029,7 @@ static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
 	struct dentry *entry;
+	int cpu;
 
 	d_tracer = tracing_init_dentry();
 
@@ -2939,7 +3049,7 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
 
 	entry = debugfs_create_file("trace", 0444, d_tracer,
-				    &global_trace, &tracing_fops);
+				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
 
@@ -2970,8 +3080,8 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'README' entry\n");
 
-	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
-				    NULL, &tracing_pipe_fops);
+	entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'trace_pipe' entry\n");
@@ -2999,6 +3109,10 @@ static __init int tracer_init_debugfs(void)
 #ifdef CONFIG_SYSPROF_TRACER
 	init_tracer_sysprof_debugfs(d_tracer);
 #endif
+
+	for_each_tracing_cpu(cpu)
+		tracing_init_debugfs_percpu(cpu);
+
 	return 0;
 }
 
@@ -3222,8 +3336,12 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
 		goto out_free_buffer_mask;
 
+	if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
+		goto out_free_tracing_cpumask;
+
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
+	cpumask_clear(tracing_reader_cpumask);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	global_trace.buffer = ring_buffer_alloc(trace_buf_size,
@@ -3272,6 +3390,8 @@ __init static int tracer_alloc_buffers(void)
 	ret = 0;
 
 out_free_cpumask:
+	free_cpumask_var(tracing_reader_cpumask);
+out_free_tracing_cpumask:
 	free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
 	free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index eed732c..508235a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -395,6 +395,8 @@ struct trace_seq {
 	unsigned int		readpos;
 };
 
+#define TRACE_PIPE_ALL_CPU	-1
+
 /*
  * Trace iterator - used by printout routines who present trace
  * results to users and which routines might sleep, etc:
@@ -404,6 +406,7 @@ struct trace_iterator {
 	struct tracer		*trace;
 	void			*private;
 	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+	int			cpu_file;
 
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
-- 
cgit v1.1


From d7350c3f45694104e820041969c8185c5f99e57c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Feb 2009 06:13:16 +0100
Subject: tracing/core: make the read callbacks reentrants

Now that several per-cpu files can be read or spliced at the
same, we want the read/splice callbacks for tracing files to be
reentrants.

Until now, a single global mutex (trace_types_lock) serialized
the access to tracing_read_pipe(), tracing_splice_read_pipe(),
and the seq helpers.

Ie: it means that if a user tries to read trace_pipe0 and
trace_pipe1 at the same time, the access to the function
tracing_read_pipe() is contended and one reader must wait for
the other to finish its read call.

The trace_type_lock mutex is mostly here to serialize the access
to the global current tracer (current_trace), which can be
changed concurrently. Although the iter struct keeps a private
pointer to this tracer, its callbacks can be changed by another
function.

The method used here is to not keep anymore private reference to
the tracer inside the iterator but to make a copy of it inside
the iterator. Then it checks on subsequents read calls if the
tracer has changed. This is not costly because the current
tracer is not expected to be changed often, so we use a branch
prediction for that.

Moreover, we add a private mutex to the iterator (there is one
iterator per file descriptor) to serialize the accesses in case
of multiple consumers per file descriptor (which would be a
silly idea from the user). Note that this is not to protect the
ring buffer, since the ring buffer already serializes the
readers accesses. This is to prevent from traces weirdness in
case of concurrent consumers. But these mutexes can be dropped
anyway, that would not result in any crash. Just tell me what
you think about it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/trace/trace.h |   3 +-
 2 files changed, 85 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa58b7b..d8d899f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1294,20 +1294,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 	return ent;
 }
 
+/*
+ * No necessary locking here. The worst thing which can
+ * happen is loosing events consumed at the same time
+ * by a trace_pipe reader.
+ * Other than that, we don't risk to crash the ring buffer
+ * because it serializes the readers.
+ *
+ * The current tracer is copied to avoid a global locking
+ * all around.
+ */
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
+	static struct tracer *old_tracer;
 	int cpu_file = iter->cpu_file;
 	void *p = NULL;
 	loff_t l = 0;
 	int cpu;
 
+	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
-
-	if (!current_trace || current_trace != iter->trace) {
-		mutex_unlock(&trace_types_lock);
-		return NULL;
+	if (unlikely(old_tracer != current_trace && current_trace)) {
+		old_tracer = current_trace;
+		*iter->trace = *current_trace;
 	}
+	mutex_unlock(&trace_types_lock);
 
 	atomic_inc(&trace_record_cmdline_disabled);
 
@@ -1341,7 +1353,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
-	mutex_unlock(&trace_types_lock);
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1691,13 +1702,25 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		goto out;
 	}
 
+	/*
+	 * We make a copy of the current tracer to avoid concurrent
+	 * changes on it while we are reading.
+	 */
 	mutex_lock(&trace_types_lock);
+	iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
+	if (!iter->trace) {
+		*ret = -ENOMEM;
+		goto fail;
+	}
+	if (current_trace)
+		*iter->trace = *current_trace;
+
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
 		iter->tr = &global_trace;
-	iter->trace = current_trace;
 	iter->pos = -1;
+	mutex_init(&iter->mutex);
 	iter->cpu_file = cpu_file;
 
 	/* Notify the tracer early; before we stop tracing. */
@@ -1747,8 +1770,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
 	}
-fail:
+ fail:
 	mutex_unlock(&trace_types_lock);
+	kfree(iter->trace);
 	kfree(iter);
 
 	return ERR_PTR(-ENOMEM);
@@ -1783,6 +1807,8 @@ static int tracing_release(struct inode *inode, struct file *file)
 	mutex_unlock(&trace_types_lock);
 
 	seq_release(inode, file);
+	mutex_destroy(&iter->mutex);
+	kfree(iter->trace);
 	kfree(iter);
 	return 0;
 }
@@ -2392,10 +2418,21 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
+	/*
+	 * We make a copy of the current tracer to avoid concurrent
+	 * changes on it while we are reading.
+	 */
+	iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
+	if (!iter->trace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	if (current_trace)
+		*iter->trace = *current_trace;
+
 	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
-		kfree(iter);
 		ret = -ENOMEM;
-		goto out;
+		goto fail;
 	}
 
 	/* trace pipe does not show start of buffer */
@@ -2403,7 +2440,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
-	iter->trace = current_trace;
+	mutex_init(&iter->mutex);
 	filp->private_data = iter;
 
 	if (iter->trace->pipe_open)
@@ -2412,6 +2449,12 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 out:
 	mutex_unlock(&trace_types_lock);
 	return ret;
+
+fail:
+	kfree(iter->trace);
+	kfree(iter);
+	mutex_unlock(&trace_types_lock);
+	return ret;
 }
 
 static int tracing_release_pipe(struct inode *inode, struct file *file)
@@ -2428,6 +2471,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 	mutex_unlock(&trace_types_lock);
 
 	free_cpumask_var(iter->started);
+	mutex_destroy(&iter->mutex);
+	kfree(iter->trace);
 	kfree(iter);
 
 	return 0;
@@ -2497,18 +2542,15 @@ static int tracing_wait_pipe(struct file *filp)
 			return -EAGAIN;
 		}
 
-		mutex_unlock(&trace_types_lock);
+		mutex_unlock(&iter->mutex);
 
 		iter->trace->wait_pipe(iter);
 
-		mutex_lock(&trace_types_lock);
+		mutex_lock(&iter->mutex);
 
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (iter->trace != current_trace)
-			return 0;
-
 		/*
 		 * We block until we read something and tracing is disabled.
 		 * We still block if tracing is disabled, but we have never
@@ -2533,6 +2575,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
 	struct trace_iterator *iter = filp->private_data;
+	static struct tracer *old_tracer;
 	ssize_t sret;
 
 	/* return any leftover data */
@@ -2542,7 +2585,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	trace_seq_reset(&iter->seq);
 
+	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
+	if (unlikely(old_tracer != current_trace && current_trace)) {
+		old_tracer = current_trace;
+		*iter->trace = *current_trace;
+	}
+	mutex_unlock(&trace_types_lock);
+
+	/*
+	 * Avoid more than one consumer on a single file descriptor
+	 * This is just a matter of traces coherency, the ring buffer itself
+	 * is protected.
+	 */
+	mutex_lock(&iter->mutex);
 	if (iter->trace->read) {
 		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
 		if (sret)
@@ -2599,7 +2655,7 @@ waitagain:
 		goto waitagain;
 
 out:
-	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&iter->mutex);
 
 	return sret;
 }
@@ -2676,11 +2732,20 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		.ops		= &tracing_pipe_buf_ops,
 		.spd_release	= tracing_spd_release_pipe,
 	};
+	static struct tracer *old_tracer;
 	ssize_t ret;
 	size_t rem;
 	unsigned int i;
 
+	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
+	if (unlikely(old_tracer != current_trace && current_trace)) {
+		old_tracer = current_trace;
+		*iter->trace = *current_trace;
+	}
+	mutex_unlock(&trace_types_lock);
+
+	mutex_lock(&iter->mutex);
 
 	if (iter->trace->splice_read) {
 		ret = iter->trace->splice_read(iter, filp,
@@ -2720,14 +2785,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_reset(&iter->seq);
 	}
 
-	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
 
 	return splice_to_pipe(pipe, &spd);
 
 out_err:
-	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&iter->mutex);
 
 	return ret;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 508235a..6321917 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,8 +405,9 @@ struct trace_iterator {
 	struct trace_array	*tr;
 	struct tracer		*trace;
 	void			*private;
-	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
 	int			cpu_file;
+	struct mutex		mutex;
+	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
 
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
-- 
cgit v1.1


From 8969a5ede0f9e17da4b943712429aef2c9bcd82b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Feb 2009 13:59:47 +0100
Subject: generic-ipi: remove kmalloc()

Remove the use of kmalloc() from the smp_call_function_*()
calls.

Steven's generic-ipi patch (d7240b98: generic-ipi: use per cpu
data for single cpu ipi calls) started the discussion on the use
of kmalloc() in this code and fixed the
smp_call_function_single(.wait=0) fallback case.

In this patch we complete this by also providing means for the
_many() call, which fully removes the need for kmalloc() in this
code.

The problem with the _many() call is that other cpus might still
be observing our entry when we're done with it. It solved this
by dynamically allocating data elements and RCU-freeing it.

We solve it by using a single per-cpu entry which provides
static storage and solves one half of the problem (avoiding
referencing freed data).

The other half, ensuring the queue iteration it still possible,
is done by placing re-used entries at the head of the list. This
means that if someone was still iterating that entry when it got
moved, he will now re-visit the entries on the list he had
already seen, but avoids skipping over entries like would have
happened had we placed the new entry at the end.

Furthermore, visiting entries twice is not a problem, since we
remove our cpu from the entry's cpumask once its called.

Many thanks to Oleg for his suggestions and him poking holes in
my earlier attempts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 264 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 166 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 6ecf4b9..7a0ce25 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -10,23 +10,28 @@
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/smp.h>
+#include <linux/cpu.h>
 
 static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
-static LIST_HEAD(call_function_queue);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+
+static struct {
+	struct list_head	queue;
+	spinlock_t		lock;
+} call_function __cacheline_aligned_in_smp = {
+	.queue = LIST_HEAD_INIT(call_function.queue),
+	.lock  = __SPIN_LOCK_UNLOCKED(call_function.lock),
+};
 
 enum {
 	CSD_FLAG_WAIT		= 0x01,
-	CSD_FLAG_ALLOC		= 0x02,
-	CSD_FLAG_LOCK		= 0x04,
+	CSD_FLAG_LOCK		= 0x02,
 };
 
 struct call_function_data {
 	struct call_single_data csd;
 	spinlock_t lock;
 	unsigned int refs;
-	struct rcu_head rcu_head;
-	unsigned long cpumask_bits[];
+	cpumask_var_t cpumask;
 };
 
 struct call_single_queue {
@@ -34,8 +39,45 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
+static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
+	.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
+};
+
+static int
+hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+				cpu_to_node(cpu)))
+			return NOTIFY_BAD;
+		break;
+
+#ifdef CONFIG_CPU_HOTPLUG
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		free_cpumask_var(cfd->cpumask);
+		break;
+#endif
+	};
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
+	.notifier_call = hotplug_cfd,
+};
+
 static int __cpuinit init_call_single_data(void)
 {
+	void *cpu = (void *)(long)smp_processor_id();
 	int i;
 
 	for_each_possible_cpu(i) {
@@ -44,18 +86,69 @@ static int __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+
+	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
+	register_cpu_notifier(&hotplug_cfd_notifier);
+
 	return 0;
 }
 early_initcall(init_call_single_data);
 
-static void csd_flag_wait(struct call_single_data *data)
+/*
+ * csd_wait/csd_complete are used for synchronous ipi calls
+ */
+static void csd_wait_prepare(struct call_single_data *data)
 {
-	/* Wait for response */
-	do {
-		if (!(data->flags & CSD_FLAG_WAIT))
-			break;
+	data->flags |= CSD_FLAG_WAIT;
+}
+
+static void csd_complete(struct call_single_data *data)
+{
+	if (data->flags & CSD_FLAG_WAIT) {
+		/*
+		 * ensure we're all done before saying we are
+		 */
+		smp_mb();
+		data->flags &= ~CSD_FLAG_WAIT;
+	}
+}
+
+static void csd_wait(struct call_single_data *data)
+{
+	while (data->flags & CSD_FLAG_WAIT)
 		cpu_relax();
-	} while (1);
+}
+
+/*
+ * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
+ *
+ * For non-synchronous ipi calls the csd can still be in use by the previous
+ * function call. For multi-cpu calls its even more interesting as we'll have
+ * to ensure no other cpu is observing our csd.
+ */
+static void csd_lock(struct call_single_data *data)
+{
+	while (data->flags & CSD_FLAG_LOCK)
+		cpu_relax();
+	data->flags = CSD_FLAG_LOCK;
+
+	/*
+	 * prevent CPU from reordering the above assignment to ->flags
+	 * with any subsequent assignments to other fields of the
+	 * specified call_single_data structure.
+	 */
+
+	smp_mb();
+}
+
+static void csd_unlock(struct call_single_data *data)
+{
+	WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+	/*
+	 * ensure we're all done before releasing data
+	 */
+	smp_mb();
+	data->flags &= ~CSD_FLAG_LOCK;
 }
 
 /*
@@ -89,16 +182,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
 		arch_send_call_function_single_ipi(cpu);
 
 	if (wait)
-		csd_flag_wait(data);
-}
-
-static void rcu_free_call_data(struct rcu_head *head)
-{
-	struct call_function_data *data;
-
-	data = container_of(head, struct call_function_data, rcu_head);
-
-	kfree(data);
+		csd_wait(data);
 }
 
 /*
@@ -122,41 +206,35 @@ void generic_smp_call_function_interrupt(void)
 	 * It's ok to use list_for_each_rcu() here even though we may delete
 	 * 'pos', since list_del_rcu() doesn't clear ->next
 	 */
-	rcu_read_lock();
-	list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
 
-		if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
+		spin_lock(&data->lock);
+		if (!cpumask_test_cpu(cpu, data->cpumask)) {
+			spin_unlock(&data->lock);
 			continue;
+		}
+		cpumask_clear_cpu(cpu, data->cpumask);
+		spin_unlock(&data->lock);
 
 		data->csd.func(data->csd.info);
 
 		spin_lock(&data->lock);
-		cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
 		WARN_ON(data->refs == 0);
-		data->refs--;
-		refs = data->refs;
+		refs = --data->refs;
+		if (!refs) {
+			spin_lock(&call_function.lock);
+			list_del_rcu(&data->csd.list);
+			spin_unlock(&call_function.lock);
+		}
 		spin_unlock(&data->lock);
 
 		if (refs)
 			continue;
 
-		spin_lock(&call_function_lock);
-		list_del_rcu(&data->csd.list);
-		spin_unlock(&call_function_lock);
-
-		if (data->csd.flags & CSD_FLAG_WAIT) {
-			/*
-			 * serialize stores to data with the flag clear
-			 * and wakeup
-			 */
-			smp_wmb();
-			data->csd.flags &= ~CSD_FLAG_WAIT;
-		}
-		if (data->csd.flags & CSD_FLAG_ALLOC)
-			call_rcu(&data->rcu_head, rcu_free_call_data);
+		csd_complete(&data->csd);
+		csd_unlock(&data->csd);
 	}
-	rcu_read_unlock();
 
 	put_cpu();
 }
@@ -192,14 +270,14 @@ void generic_smp_call_function_single_interrupt(void)
 
 		data->func(data->info);
 
-		if (data_flags & CSD_FLAG_WAIT) {
-			smp_wmb();
-			data->flags &= ~CSD_FLAG_WAIT;
-		} else if (data_flags & CSD_FLAG_LOCK) {
-			smp_wmb();
-			data->flags &= ~CSD_FLAG_LOCK;
-		} else if (data_flags & CSD_FLAG_ALLOC)
-			kfree(data);
+		if (data_flags & CSD_FLAG_WAIT)
+			csd_complete(data);
+
+		/*
+		 * Unlocked CSDs are valid through generic_exec_single()
+		 */
+		if (data_flags & CSD_FLAG_LOCK)
+			csd_unlock(data);
 	}
 }
 
@@ -218,7 +296,9 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			     int wait)
 {
-	struct call_single_data d;
+	struct call_single_data d = {
+		.flags = 0,
+	};
 	unsigned long flags;
 	/* prevent preemption and reschedule on another processor,
 	   as well as CPU removal */
@@ -239,13 +319,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			/*
 			 * We are calling a function on a single CPU
 			 * and we are not going to wait for it to finish.
-			 * We first try to allocate the data, but if we
-			 * fail, we fall back to use a per cpu data to pass
-			 * the information to that CPU. Since all callers
-			 * of this code will use the same data, we must
-			 * synchronize the callers to prevent a new caller
-			 * from corrupting the data before the callee
-			 * can access it.
+			 * We use a per cpu data to pass the information to
+			 * that CPU. Since all callers of this code will
+			 * use the same data, we must synchronize the
+			 * callers to prevent a new caller from corrupting
+			 * the data before the callee can access it.
 			 *
 			 * The CSD_FLAG_LOCK is used to let us know when
 			 * the IPI handler is done with the data.
@@ -255,18 +333,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			 * will make sure the callee is done with the
 			 * data before a new caller will use it.
 			 */
-			data = kmalloc(sizeof(*data), GFP_ATOMIC);
-			if (data)
-				data->flags = CSD_FLAG_ALLOC;
-			else {
-				data = &per_cpu(csd_data, me);
-				while (data->flags & CSD_FLAG_LOCK)
-					cpu_relax();
-				data->flags = CSD_FLAG_LOCK;
-			}
+			data = &__get_cpu_var(csd_data);
+			csd_lock(data);
 		} else {
 			data = &d;
-			data->flags = CSD_FLAG_WAIT;
+			csd_wait_prepare(data);
 		}
 
 		data->func = func;
@@ -326,14 +397,14 @@ void smp_call_function_many(const struct cpumask *mask,
 {
 	struct call_function_data *data;
 	unsigned long flags;
-	int cpu, next_cpu;
+	int cpu, next_cpu, me = smp_processor_id();
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
 	/* So, what's a CPU they want?  Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-	if (cpu == smp_processor_id())
+	if (cpu == me)
 		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
 	/* No online cpus?  We're done. */
 	if (cpu >= nr_cpu_ids)
@@ -341,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	/* Do we have another CPU which isn't us? */
 	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
-	if (next_cpu == smp_processor_id())
+	if (next_cpu == me)
 		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
 
 	/* Fastpath: do that cpu by itself. */
@@ -350,31 +421,28 @@ void smp_call_function_many(const struct cpumask *mask,
 		return;
 	}
 
-	data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
-	if (unlikely(!data)) {
-		/* Slow path. */
-		for_each_online_cpu(cpu) {
-			if (cpu == smp_processor_id())
-				continue;
-			if (cpumask_test_cpu(cpu, mask))
-				smp_call_function_single(cpu, func, info, wait);
-		}
-		return;
-	}
+	data = &__get_cpu_var(cfd_data);
+	csd_lock(&data->csd);
 
-	spin_lock_init(&data->lock);
-	data->csd.flags = CSD_FLAG_ALLOC;
+	spin_lock_irqsave(&data->lock, flags);
 	if (wait)
-		data->csd.flags |= CSD_FLAG_WAIT;
+		csd_wait_prepare(&data->csd);
+
 	data->csd.func = func;
 	data->csd.info = info;
-	cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
-	cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
-	data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
+	cpumask_and(data->cpumask, mask, cpu_online_mask);
+	cpumask_clear_cpu(me, data->cpumask);
+	data->refs = cpumask_weight(data->cpumask);
 
-	spin_lock_irqsave(&call_function_lock, flags);
-	list_add_tail_rcu(&data->csd.list, &call_function_queue);
-	spin_unlock_irqrestore(&call_function_lock, flags);
+	spin_lock(&call_function.lock);
+	/*
+	 * Place entry at the _HEAD_ of the list, so that any cpu still
+	 * observing the entry in generic_smp_call_function_interrupt() will
+	 * not miss any other list entries.
+	 */
+	list_add_rcu(&data->csd.list, &call_function.queue);
+	spin_unlock(&call_function.lock);
+	spin_unlock_irqrestore(&data->lock, flags);
 
 	/*
 	 * Make the list addition visible before sending the ipi.
@@ -384,11 +452,11 @@ void smp_call_function_many(const struct cpumask *mask,
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
+	arch_send_call_function_ipi_mask(data->cpumask);
 
 	/* optionally wait for the CPUs to complete */
 	if (wait)
-		csd_flag_wait(&data->csd);
+		csd_wait(&data->csd);
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
@@ -418,20 +486,20 @@ EXPORT_SYMBOL(smp_call_function);
 
 void ipi_call_lock(void)
 {
-	spin_lock(&call_function_lock);
+	spin_lock(&call_function.lock);
 }
 
 void ipi_call_unlock(void)
 {
-	spin_unlock(&call_function_lock);
+	spin_unlock(&call_function.lock);
 }
 
 void ipi_call_lock_irq(void)
 {
-	spin_lock_irq(&call_function_lock);
+	spin_lock_irq(&call_function.lock);
 }
 
 void ipi_call_unlock_irq(void)
 {
-	spin_unlock_irq(&call_function_lock);
+	spin_unlock_irq(&call_function.lock);
 }
-- 
cgit v1.1


From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Feb 2009 13:59:48 +0100
Subject: generic-ipi: remove CSD_FLAG_WAIT

Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework
the code so that we can use CSD_FLAG_LOCK for both purposes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c   |  2 +-
 kernel/smp.c     | 90 +++++++++++++++-----------------------------------------
 kernel/softirq.c |  2 +-
 3 files changed, 25 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 410eec4..d4c2749 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	if (rq == this_rq()) {
 		hrtimer_restart(timer);
 	} else if (!rq->hrtick_csd_pending) {
-		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
 		rq->hrtick_csd_pending = 1;
 	}
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 7a0ce25..f530825 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -23,8 +23,7 @@ static struct {
 };
 
 enum {
-	CSD_FLAG_WAIT		= 0x01,
-	CSD_FLAG_LOCK		= 0x02,
+	CSD_FLAG_LOCK		= 0x01,
 };
 
 struct call_function_data {
@@ -95,41 +94,21 @@ static int __cpuinit init_call_single_data(void)
 early_initcall(init_call_single_data);
 
 /*
- * csd_wait/csd_complete are used for synchronous ipi calls
- */
-static void csd_wait_prepare(struct call_single_data *data)
-{
-	data->flags |= CSD_FLAG_WAIT;
-}
-
-static void csd_complete(struct call_single_data *data)
-{
-	if (data->flags & CSD_FLAG_WAIT) {
-		/*
-		 * ensure we're all done before saying we are
-		 */
-		smp_mb();
-		data->flags &= ~CSD_FLAG_WAIT;
-	}
-}
-
-static void csd_wait(struct call_single_data *data)
-{
-	while (data->flags & CSD_FLAG_WAIT)
-		cpu_relax();
-}
-
-/*
  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
  *
  * For non-synchronous ipi calls the csd can still be in use by the previous
  * function call. For multi-cpu calls its even more interesting as we'll have
  * to ensure no other cpu is observing our csd.
  */
-static void csd_lock(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *data)
 {
 	while (data->flags & CSD_FLAG_LOCK)
 		cpu_relax();
+}
+
+static void csd_lock(struct call_single_data *data)
+{
+	csd_lock_wait(data);
 	data->flags = CSD_FLAG_LOCK;
 
 	/*
@@ -155,11 +134,12 @@ static void csd_unlock(struct call_single_data *data)
  * Insert a previously allocated call_single_data element for execution
  * on the given CPU. data must already have ->func, ->info, and ->flags set.
  */
-static void generic_exec_single(int cpu, struct call_single_data *data)
+static
+void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 {
 	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
-	int wait = data->flags & CSD_FLAG_WAIT, ipi;
 	unsigned long flags;
+	int ipi;
 
 	spin_lock_irqsave(&dst->lock, flags);
 	ipi = list_empty(&dst->list);
@@ -182,7 +162,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
 		arch_send_call_function_single_ipi(cpu);
 
 	if (wait)
-		csd_wait(data);
+		csd_lock_wait(data);
 }
 
 /*
@@ -232,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
 		if (refs)
 			continue;
 
-		csd_complete(&data->csd);
 		csd_unlock(&data->csd);
 	}
 
@@ -270,9 +249,6 @@ void generic_smp_call_function_single_interrupt(void)
 
 		data->func(data->info);
 
-		if (data_flags & CSD_FLAG_WAIT)
-			csd_complete(data);
-
 		/*
 		 * Unlocked CSDs are valid through generic_exec_single()
 		 */
@@ -313,36 +289,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		func(info);
 		local_irq_restore(flags);
 	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
-		struct call_single_data *data;
+		struct call_single_data *data = &d;
 
-		if (!wait) {
-			/*
-			 * We are calling a function on a single CPU
-			 * and we are not going to wait for it to finish.
-			 * We use a per cpu data to pass the information to
-			 * that CPU. Since all callers of this code will
-			 * use the same data, we must synchronize the
-			 * callers to prevent a new caller from corrupting
-			 * the data before the callee can access it.
-			 *
-			 * The CSD_FLAG_LOCK is used to let us know when
-			 * the IPI handler is done with the data.
-			 * The first caller will set it, and the callee
-			 * will clear it. The next caller must wait for
-			 * it to clear before we set it again. This
-			 * will make sure the callee is done with the
-			 * data before a new caller will use it.
-			 */
+		if (!wait)
 			data = &__get_cpu_var(csd_data);
-			csd_lock(data);
-		} else {
-			data = &d;
-			csd_wait_prepare(data);
-		}
+
+		csd_lock(data);
 
 		data->func = func;
 		data->info = info;
-		generic_exec_single(cpu, data);
+		generic_exec_single(cpu, data, wait);
 	} else {
 		err = -ENXIO;	/* CPU not online */
 	}
@@ -362,12 +318,15 @@ EXPORT_SYMBOL(smp_call_function_single);
  * instance.
  *
  */
-void __smp_call_function_single(int cpu, struct call_single_data *data)
+void __smp_call_function_single(int cpu, struct call_single_data *data,
+				int wait)
 {
+	csd_lock(data);
+
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+	WARN_ON(wait && irqs_disabled());
 
-	generic_exec_single(cpu, data);
+	generic_exec_single(cpu, data, wait);
 }
 
 /* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
@@ -425,9 +384,6 @@ void smp_call_function_many(const struct cpumask *mask,
 	csd_lock(&data->csd);
 
 	spin_lock_irqsave(&data->lock, flags);
-	if (wait)
-		csd_wait_prepare(&data->csd);
-
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
@@ -456,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	/* optionally wait for the CPUs to complete */
 	if (wait)
-		csd_wait(&data->csd);
+		csd_lock_wait(&data->csd);
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de..48c3d5d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
 		cp->flags = 0;
 		cp->priv = softirq;
 
-		__smp_call_function_single(cpu, cp);
+		__smp_call_function_single(cpu, cp, 0);
 		return 0;
 	}
 	return 1;
-- 
cgit v1.1


From 0b13fda1e0936b3d64c4c407f183d33fa6bd2ad4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 16:52:11 +0100
Subject: generic-ipi: cleanups

Andrew pointed out that there's some small amount of
style rot in kernel/smp.c.

Clean it up.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 162 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 86 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index f530825..7ad2262 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,13 +2,12 @@
  * Generic helpers for smp ipi calls
  *
  * (C) Jens Axboe <jens.axboe@oracle.com> 2008
- *
  */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
 
@@ -17,29 +16,30 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
 static struct {
 	struct list_head	queue;
 	spinlock_t		lock;
-} call_function __cacheline_aligned_in_smp = {
-	.queue = LIST_HEAD_INIT(call_function.queue),
-	.lock  = __SPIN_LOCK_UNLOCKED(call_function.lock),
-};
+} call_function __cacheline_aligned_in_smp =
+	{
+		.queue		= LIST_HEAD_INIT(call_function.queue),
+		.lock		= __SPIN_LOCK_UNLOCKED(call_function.lock),
+	};
 
 enum {
 	CSD_FLAG_LOCK		= 0x01,
 };
 
 struct call_function_data {
-	struct call_single_data csd;
-	spinlock_t lock;
-	unsigned int refs;
-	cpumask_var_t cpumask;
+	struct call_single_data	csd;
+	spinlock_t		lock;
+	unsigned int		refs;
+	cpumask_var_t		cpumask;
 };
 
 struct call_single_queue {
-	struct list_head list;
-	spinlock_t lock;
+	struct list_head	list;
+	spinlock_t		lock;
 };
 
 static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
-	.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
+	.lock			= __SPIN_LOCK_UNLOCKED(cfd_data.lock),
 };
 
 static int
@@ -71,7 +71,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 }
 
 static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
-	.notifier_call = hotplug_cfd,
+	.notifier_call		= hotplug_cfd,
 };
 
 static int __cpuinit init_call_single_data(void)
@@ -96,9 +96,9 @@ early_initcall(init_call_single_data);
 /*
  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
  *
- * For non-synchronous ipi calls the csd can still be in use by the previous
- * function call. For multi-cpu calls its even more interesting as we'll have
- * to ensure no other cpu is observing our csd.
+ * For non-synchronous ipi calls the csd can still be in use by the
+ * previous function call. For multi-cpu calls its even more interesting
+ * as we'll have to ensure no other cpu is observing our csd.
  */
 static void csd_lock_wait(struct call_single_data *data)
 {
@@ -112,27 +112,29 @@ static void csd_lock(struct call_single_data *data)
 	data->flags = CSD_FLAG_LOCK;
 
 	/*
-	 * prevent CPU from reordering the above assignment to ->flags
-	 * with any subsequent assignments to other fields of the
-	 * specified call_single_data structure.
+	 * prevent CPU from reordering the above assignment
+	 * to ->flags with any subsequent assignments to other
+	 * fields of the specified call_single_data structure:
 	 */
-
 	smp_mb();
 }
 
 static void csd_unlock(struct call_single_data *data)
 {
 	WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+
 	/*
-	 * ensure we're all done before releasing data
+	 * ensure we're all done before releasing data:
 	 */
 	smp_mb();
+
 	data->flags &= ~CSD_FLAG_LOCK;
 }
 
 /*
- * Insert a previously allocated call_single_data element for execution
- * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ * Insert a previously allocated call_single_data element
+ * for execution on the given CPU. data must already have
+ * ->func, ->info, and ->flags set.
  */
 static
 void generic_exec_single(int cpu, struct call_single_data *data, int wait)
@@ -154,10 +156,9 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 	 * If IPIs can go out of order to the cache coherency protocol
 	 * in an architecture, sufficient synchronisation should be added
 	 * to arch code to make it appear to obey cache coherency WRT
-	 * locking and barrier primitives. Generic code isn't really equipped
-	 * to do the right thing...
+	 * locking and barrier primitives. Generic code isn't really
+	 * equipped to do the right thing...
 	 */
-
 	if (ipi)
 		arch_send_call_function_single_ipi(cpu);
 
@@ -183,8 +184,8 @@ void generic_smp_call_function_interrupt(void)
 	smp_mb();
 
 	/*
-	 * It's ok to use list_for_each_rcu() here even though we may delete
-	 * 'pos', since list_del_rcu() doesn't clear ->next
+	 * It's ok to use list_for_each_rcu() here even though we may
+	 * delete 'pos', since list_del_rcu() doesn't clear ->next
 	 */
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
@@ -219,14 +220,14 @@ void generic_smp_call_function_interrupt(void)
 }
 
 /*
- * Invoked by arch to handle an IPI for call function single. Must be called
- * from the arch with interrupts disabled.
+ * Invoked by arch to handle an IPI for call function single. Must be
+ * called from the arch with interrupts disabled.
  */
 void generic_smp_call_function_single_interrupt(void)
 {
 	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
-	LIST_HEAD(list);
 	unsigned int data_flags;
+	LIST_HEAD(list);
 
 	spin_lock(&q->lock);
 	list_replace_init(&q->list, &list);
@@ -235,22 +236,20 @@ void generic_smp_call_function_single_interrupt(void)
 	while (!list_empty(&list)) {
 		struct call_single_data *data;
 
-		data = list_entry(list.next, struct call_single_data,
-					list);
+		data = list_entry(list.next, struct call_single_data, list);
 		list_del(&data->list);
 
 		/*
-		 * 'data' can be invalid after this call if
-		 * flags == 0 (when called through
-		 * generic_exec_single(), so save them away before
-		 * making the call.
+		 * 'data' can be invalid after this call if flags == 0
+		 * (when called through generic_exec_single()),
+		 * so save them away before making the call:
 		 */
 		data_flags = data->flags;
 
 		data->func(data->info);
 
 		/*
-		 * Unlocked CSDs are valid through generic_exec_single()
+		 * Unlocked CSDs are valid through generic_exec_single():
 		 */
 		if (data_flags & CSD_FLAG_LOCK)
 			csd_unlock(data);
@@ -276,34 +275,41 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		.flags = 0,
 	};
 	unsigned long flags;
-	/* prevent preemption and reschedule on another processor,
-	   as well as CPU removal */
-	int me = get_cpu();
+	int this_cpu;
 	int err = 0;
 
+	/*
+	 * prevent preemption and reschedule on another processor,
+	 * as well as CPU removal
+	 */
+	this_cpu = get_cpu();
+
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
-	if (cpu == me) {
+	if (cpu == this_cpu) {
 		local_irq_save(flags);
 		func(info);
 		local_irq_restore(flags);
-	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
-		struct call_single_data *data = &d;
+	} else {
+		if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
+			struct call_single_data *data = &d;
 
-		if (!wait)
-			data = &__get_cpu_var(csd_data);
+			if (!wait)
+				data = &__get_cpu_var(csd_data);
 
-		csd_lock(data);
+			csd_lock(data);
 
-		data->func = func;
-		data->info = info;
-		generic_exec_single(cpu, data, wait);
-	} else {
-		err = -ENXIO;	/* CPU not online */
+			data->func = func;
+			data->info = info;
+			generic_exec_single(cpu, data, wait);
+		} else {
+			err = -ENXIO;	/* CPU not online */
+		}
 	}
 
 	put_cpu();
+
 	return err;
 }
 EXPORT_SYMBOL(smp_call_function_single);
@@ -313,10 +319,9 @@ EXPORT_SYMBOL(smp_call_function_single);
  * @cpu: The CPU to run on.
  * @data: Pre-allocated and setup data structure
  *
- * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
- * data structure. Useful for embedding @data inside other structures, for
- * instance.
- *
+ * Like smp_call_function_single(), but allow caller to pass in a
+ * pre-allocated data structure. Useful for embedding @data inside
+ * other structures, for instance.
  */
 void __smp_call_function_single(int cpu, struct call_single_data *data,
 				int wait)
@@ -329,10 +334,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 	generic_exec_single(cpu, data, wait);
 }
 
-/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
+/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
+
 #ifndef arch_send_call_function_ipi_mask
-#define arch_send_call_function_ipi_mask(maskp) \
-	arch_send_call_function_ipi(*(maskp))
+# define arch_send_call_function_ipi_mask(maskp) \
+	 arch_send_call_function_ipi(*(maskp))
 #endif
 
 /**
@@ -340,7 +346,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
  * @mask: The set of cpus to run on (only runs on online subset).
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ *        on other CPUs.
  *
  * If @wait is true, then returns once @func has returned. Note that @wait
  * will be implicitly turned on in case of allocation failures, since
@@ -351,27 +358,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
  * must be disabled when calling this function.
  */
 void smp_call_function_many(const struct cpumask *mask,
-			    void (*func)(void *), void *info,
-			    bool wait)
+			    void (*func)(void *), void *info, bool wait)
 {
 	struct call_function_data *data;
 	unsigned long flags;
-	int cpu, next_cpu, me = smp_processor_id();
+	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
-	/* So, what's a CPU they want?  Ignoring this one. */
+	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-	if (cpu == me)
+	if (cpu == this_cpu)
 		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
 	/* No online cpus?  We're done. */
 	if (cpu >= nr_cpu_ids)
 		return;
 
 	/* Do we have another CPU which isn't us? */
 	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
-	if (next_cpu == me)
+	if (next_cpu == this_cpu)
 		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
 
 	/* Fastpath: do that cpu by itself. */
@@ -387,30 +394,31 @@ void smp_call_function_many(const struct cpumask *mask,
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
-	cpumask_clear_cpu(me, data->cpumask);
+	cpumask_clear_cpu(this_cpu, data->cpumask);
 	data->refs = cpumask_weight(data->cpumask);
 
 	spin_lock(&call_function.lock);
 	/*
 	 * Place entry at the _HEAD_ of the list, so that any cpu still
-	 * observing the entry in generic_smp_call_function_interrupt() will
-	 * not miss any other list entries.
+	 * observing the entry in generic_smp_call_function_interrupt()
+	 * will not miss any other list entries:
 	 */
 	list_add_rcu(&data->csd.list, &call_function.queue);
 	spin_unlock(&call_function.lock);
+
 	spin_unlock_irqrestore(&data->lock, flags);
 
 	/*
 	 * Make the list addition visible before sending the ipi.
-	 * (IPIs must obey or appear to obey normal Linux cache coherency
-	 * rules -- see comment in generic_exec_single).
+	 * (IPIs must obey or appear to obey normal Linux cache
+	 * coherency rules -- see comment in generic_exec_single).
 	 */
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
 	arch_send_call_function_ipi_mask(data->cpumask);
 
-	/* optionally wait for the CPUs to complete */
+	/* Optionally wait for the CPUs to complete */
 	if (wait)
 		csd_lock_wait(&data->csd);
 }
@@ -420,7 +428,8 @@ EXPORT_SYMBOL(smp_call_function_many);
  * smp_call_function(): Run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ *        on other CPUs.
  *
  * Returns 0.
  *
@@ -436,6 +445,7 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
 	preempt_disable();
 	smp_call_function_many(cpu_online_mask, func, info, wait);
 	preempt_enable();
+
 	return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
-- 
cgit v1.1


From eef62a6826b8ab530cefff5aa55c1661a209c803 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Feb 2009 15:49:52 -0500
Subject: tracing: rename DEFINE_TRACE_FMT to just TRACE_FORMAT

There's been a bit confusion to whether DEFINE/DECLARE_TRACE_FMT should
be a DEFINE or a DECLARE. Ingo Molnar suggested simply calling it
TRACE_FORMAT.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
index cb8455b..deb95e5 100644
--- a/kernel/trace/trace_events.h
+++ b/kernel/trace/trace_events.h
@@ -17,8 +17,8 @@ struct ftrace_event_call {
 #undef TPFMT
 #define TPFMT(fmt, args...)	fmt "\n", ##args
 
-#undef DEFINE_TRACE_FMT
-#define DEFINE_TRACE_FMT(call, proto, args, fmt)			\
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
 {									\
 	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
-- 
cgit v1.1


From 8656e7a2fa6afcd8682990f804a2a9674568738f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 26 Feb 2009 00:41:38 +0100
Subject: tracing/core: make the per cpu trace files in per cpu directories

Impact: restructure the VFS layout of per CPU trace buffers

The per cpu trace files are all in a single directory:
/debug/tracing/per_cpu. In case of a large number of cpu, the
content of this directory becomes messy so we create now one
directory per cpu inside /debug/tracing/per_cpu which contain
each their own trace_pipe and trace files.

Ie:

 /debug/tracing$ ls -R per_cpu
 per_cpu:
 cpu0  cpu1

 per_cpu/cpu0:
 trace  trace_pipe

 per_cpu/cpu1:
 trace  trace_pipe

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d8d899f..bdaf60d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3061,28 +3061,31 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
-	struct dentry *entry;
-	/* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */
-	char filename[17];
+	struct dentry *entry, *d_cpu;
+	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
+	char cpu_dir[7];
 
 	if (cpu > 999 || cpu < 0)
 		return;
 
-	/* per cpu trace_pipe */
-	sprintf(filename, "trace_pipe%ld", cpu);
+	sprintf(cpu_dir, "cpu%ld", cpu);
+	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+	if (!d_cpu) {
+		pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+		return;
+	}
 
-	entry = debugfs_create_file(filename, 0444, d_percpu,
+	/* per cpu trace_pipe */
+	entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
 				(void *) cpu, &tracing_pipe_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs '%s' entry\n", filename);
+		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
 
 	/* per cpu trace */
-	sprintf(filename, "trace%ld", cpu);
-
-	entry = debugfs_create_file(filename, 0444, d_percpu,
+	entry = debugfs_create_file("trace", 0444, d_cpu,
 				(void *) cpu, &tracing_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs '%s' entry\n", filename);
+		pr_warning("Could not create debugfs 'trace' entry\n");
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
-- 
cgit v1.1


From af39241b90a345556b8884adff87096afe71b050 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 26 Feb 2009 10:11:05 -0500
Subject: tracing, genirq: add irq enter and exit trace events

Impact: add new tracepoints

Add them to the generic IRQ code, that way every architecture
gets these new tracepoints, not just x86.

Using Steve's new 'TRACE_FORMAT', I can get function graph
trace as follows using the original two IRQ tracepoints:

 3)               |    handle_IRQ_event() {
 3)               |    /* (irq_handler_entry) irq=28 handler=eth0 */
 3)               |    e1000_intr_msi() {
 3)   2.460 us    |      __napi_schedule();
 3)   9.416 us    |    }
 3)               |    /* (irq_handler_exit) irq=28 handler=eth0 return=handled */
 3) + 22.935 us   |  }

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c   | 6 ++++++
 kernel/trace/events.c | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3aba8d1..4709a7c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
+#include <trace/irq.h>
 
 #include "internals.h"
 
@@ -316,6 +317,9 @@ irqreturn_t no_action(int cpl, void *dev_id)
 	return IRQ_NONE;
 }
 
+DEFINE_TRACE(irq_handler_entry);
+DEFINE_TRACE(irq_handler_exit);
+
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
@@ -332,7 +336,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 		local_irq_enable_in_hardirq();
 
 	do {
+		trace_irq_handler_entry(irq, action);
 		ret = action->handler(irq, action->dev_id);
+		trace_irq_handler_exit(irq, action, ret);
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 38c89ee..3c75623 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -6,8 +6,10 @@
 
 /* trace/<type>.h here */
 #include <trace/sched.h>
+#include <trace/irq.h>
 
 #include "trace_events.h"
 
 /* trace/<type>_event_types.h here */
 #include <trace/sched_event_types.h>
+#include <trace/irq_event_types.h>
-- 
cgit v1.1


From 6409c4da289d6905f7ae2bd0630438368439bda2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: sched: sched_clock() improvement: use in_nmi()

make sure we dont execute more complex sched_clock() code in NMI context.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852..db69174 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -29,6 +29,7 @@
 #include <linux/spinlock.h>
 #include <linux/ktime.h>
 #include <linux/module.h>
+#include <linux/hardirq.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -151,6 +152,13 @@ u64 sched_clock_cpu(int cpu)
 	struct sched_clock_data *scd = cpu_sdc(cpu);
 	u64 now, clock, this_clock, remote_clock;
 
+	/*
+	 * Normally this is not called in NMI context - but if it is,
+	 * trying to do any locking here is totally lethal.
+	 */
+	if (unlikely(in_nmi()))
+		return scd->clock;
+
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
-- 
cgit v1.1


From 14131f2f98ac350ee9e73faed916d2238a8b6a0d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 26 Feb 2009 18:47:11 +0100
Subject: tracing: implement trace_clock_*() APIs

Impact: implement new tracing timestamp APIs

Add three trace clock variants, with differing scalability/precision
tradeoffs:

 -   local: CPU-local trace clock
 -  medium: scalable global clock with some jitter
 -  global: globally monotonic, serialized clock

Make the ring-buffer use the local trace clock internally.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile      |   1 +
 kernel/trace/ring_buffer.c |   5 +--
 kernel/trace/trace_clock.c | 101 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 kernel/trace/trace_clock.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 664b6c0..c931fe0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8f19f1a..a8c275c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/trace_clock.h>
 #include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
@@ -12,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>	/* used for sched_clock() (for now) */
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/list.h>
@@ -112,14 +112,13 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
-/* FIXME!!! */
 u64 ring_buffer_time_stamp(int cpu)
 {
 	u64 time;
 
 	preempt_disable_notrace();
 	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = sched_clock() << DEBUG_SHIFT;
+	time = trace_clock_local() << DEBUG_SHIFT;
 	preempt_enable_no_resched_notrace();
 
 	return time;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 0000000..2d4953f
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,101 @@
+/*
+ * tracing clocks
+ *
+ *  Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Implements 3 trace clock variants, with differing scalability/precision
+ * tradeoffs:
+ *
+ *  -   local: CPU-local trace clock
+ *  -  medium: scalable global clock with some jitter
+ *  -  global: globally monotonic, serialized clock
+ *
+ * Tracer plugins will chose a default from these clocks.
+ */
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/ktime.h>
+
+/*
+ * trace_clock_local(): the simplest and least coherent tracing clock.
+ *
+ * Useful for tracing that does not cross to other CPUs nor
+ * does it go through idle events.
+ */
+u64 notrace trace_clock_local(void)
+{
+	/*
+	 * sched_clock() is an architecture implemented, fast, scalable,
+	 * lockless clock. It is not guaranteed to be coherent across
+	 * CPUs, nor across CPU idle events.
+	 */
+	return sched_clock();
+}
+
+/*
+ * trace_clock(): 'inbetween' trace clock. Not completely serialized,
+ * but not completely incorrect when crossing CPUs either.
+ *
+ * This is based on cpu_clock(), which will allow at most ~1 jiffy of
+ * jitter between CPUs. So it's a pretty scalable clock, but there
+ * can be offsets in the trace data.
+ */
+u64 notrace trace_clock(void)
+{
+	return cpu_clock(raw_smp_processor_id());
+}
+
+
+/*
+ * trace_clock_global(): special globally coherent trace clock
+ *
+ * It has higher overhead than the other trace clocks but is still
+ * an order of magnitude faster than GTOD derived hardware clocks.
+ *
+ * Used by plugins that need globally coherent timestamps.
+ */
+
+static u64 prev_trace_clock_time;
+
+static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+u64 notrace trace_clock_global(void)
+{
+	unsigned long flags;
+	int this_cpu;
+	u64 now;
+
+	raw_local_irq_save(flags);
+
+	this_cpu = raw_smp_processor_id();
+	now = cpu_clock(this_cpu);
+	/*
+	 * If in an NMI context then dont risk lockups and return the
+	 * cpu_clock() time:
+	 */
+	if (unlikely(in_nmi()))
+		goto out;
+
+	__raw_spin_lock(&trace_clock_lock);
+
+	/*
+	 * TODO: if this happens often then maybe we should reset
+	 * my_scd->clock to prev_trace_clock_time+1, to make sure
+	 * we start ticking with the local clock from now on?
+	 */
+	if ((s64)(now - prev_trace_clock_time) < 0)
+		now = prev_trace_clock_time + 1;
+
+	prev_trace_clock_time = now;
+
+	__raw_spin_unlock(&trace_clock_lock);
+
+ out:
+	raw_local_irq_restore(flags);
+
+	return now;
+}
-- 
cgit v1.1


From a8259075074fb09c230b4cd2c8d3ee3c49d6ecd1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Feb 2009 22:19:12 -0500
Subject: tracing: add options directory and core option files

This patch creates an options directory in the debugfs, that contains
the available tracing options. These files contain 1 or 0, where 1
is the option is enabled and 0 it is disabled.

Simply echoing in 1 will enable the option and 0 will disable it.
This patch only contains the core options, not the tracer options.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bdaf60d..40e983e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3093,6 +3093,121 @@ static void tracing_init_debugfs_percpu(long cpu)
 #include "trace_selftest.c"
 #endif
 
+static ssize_t
+trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
+			loff_t *ppos)
+{
+	long index = (long)filp->private_data;
+	char *buf;
+
+	if (trace_flags & (1 << index))
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
+			 loff_t *ppos)
+{
+	long index = (long)filp->private_data;
+	char buf[64];
+	unsigned long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+		trace_flags &= ~(1 << index);
+		break;
+	case 1:
+		trace_flags |= 1 << index;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+
+static const struct file_operations trace_options_core_fops = {
+	.open = tracing_open_generic,
+	.read = trace_options_core_read,
+	.write = trace_options_core_write,
+};
+
+static struct dentry *trace_options_init_dentry(void)
+{
+	struct dentry *d_tracer;
+	static struct dentry *t_options;
+
+	if (t_options)
+		return t_options;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return NULL;
+
+	t_options = debugfs_create_dir("options", d_tracer);
+	if (!t_options) {
+		pr_warning("Could not create debugfs directory 'options'\n");
+		return NULL;
+	}
+
+	return t_options;
+}
+
+static struct dentry *
+create_trace_option_core_file(const char *option, long index)
+{
+	struct dentry *t_options;
+	struct dentry *entry;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return NULL;
+
+	entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+				    &trace_options_core_fops);
+
+	return entry;
+}
+
+static __init void create_trace_options_dir(void)
+{
+	struct dentry *t_options;
+	struct dentry *entry;
+	int i;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return;
+
+	for (i = 0; trace_options[i]; i++) {
+		entry = create_trace_option_core_file(trace_options[i], i);
+		if (!entry)
+			pr_warning("Could not create debugfs %s entry\n",
+				   trace_options[i]);
+	}
+}
+
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -3111,6 +3226,8 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace_options' entry\n");
 
+	create_trace_options_dir();
+
 	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
 				    NULL, &tracing_cpumask_fops);
 	if (!entry)
-- 
cgit v1.1


From 577b785f55168d5acb3d123ba41bfe8d7981e044 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Feb 2009 23:43:05 -0500
Subject: tracing: add tracer dependent options to options directory

This patch adds the tracer dependent options dynamically to the
options directory when the tracer is activated. These options are
removed when the tracer is deactivated.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 173 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 40e983e..485c6e7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2275,8 +2275,17 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
 	return t->init(tr);
 }
 
+struct trace_option_dentry;
+
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer);
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts);
+
 static int tracing_set_tracer(const char *buf)
 {
+	static struct trace_option_dentry *topts;
 	struct trace_array *tr = &global_trace;
 	struct tracer *t;
 	int ret = 0;
@@ -2297,7 +2306,12 @@ static int tracing_set_tracer(const char *buf)
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
 
+	destroy_trace_option_files(topts);
+
 	current_trace = t;
+
+	topts = create_trace_option_files(current_trace);
+
 	if (t->init) {
 		ret = tracer_init(t, tr);
 		if (ret)
@@ -3093,6 +3107,95 @@ static void tracing_init_debugfs_percpu(long cpu)
 #include "trace_selftest.c"
 #endif
 
+struct trace_option_dentry {
+	struct tracer_opt		*opt;
+	struct tracer_flags		*flags;
+	struct dentry			*entry;
+};
+
+static ssize_t
+trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
+			loff_t *ppos)
+{
+	struct trace_option_dentry *topt = filp->private_data;
+	char *buf;
+
+	if (topt->flags->val & topt->opt->bit)
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
+			 loff_t *ppos)
+{
+	struct trace_option_dentry *topt = filp->private_data;
+	unsigned long val;
+	char buf[64];
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	ret = 0;
+	switch (val) {
+	case 0:
+		/* do nothing if already cleared */
+		if (!(topt->flags->val & topt->opt->bit))
+			break;
+
+		mutex_lock(&trace_types_lock);
+		if (current_trace->set_flag)
+			ret = current_trace->set_flag(topt->flags->val,
+						      topt->opt->bit, 0);
+		mutex_unlock(&trace_types_lock);
+		if (ret)
+			return ret;
+		topt->flags->val &= ~topt->opt->bit;
+		break;
+	case 1:
+		/* do nothing if already set */
+		if (topt->flags->val & topt->opt->bit)
+			break;
+
+		mutex_lock(&trace_types_lock);
+		if (current_trace->set_flag)
+			ret = current_trace->set_flag(topt->flags->val,
+						      topt->opt->bit, 1);
+		mutex_unlock(&trace_types_lock);
+		if (ret)
+			return ret;
+		topt->flags->val |= topt->opt->bit;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+
+static const struct file_operations trace_options_fops = {
+	.open = tracing_open_generic,
+	.read = trace_options_read,
+	.write = trace_options_write,
+};
+
 static ssize_t
 trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
 			loff_t *ppos)
@@ -3146,7 +3249,6 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
-
 static const struct file_operations trace_options_core_fops = {
 	.open = tracing_open_generic,
 	.read = trace_options_core_read,
@@ -3174,6 +3276,76 @@ static struct dentry *trace_options_init_dentry(void)
 	return t_options;
 }
 
+static void
+create_trace_option_file(struct trace_option_dentry *topt,
+			 struct tracer_flags *flags,
+			 struct tracer_opt *opt)
+{
+	struct dentry *t_options;
+	struct dentry *entry;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return;
+
+	topt->flags = flags;
+	topt->opt = opt;
+
+	entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+				    &trace_options_fops);
+
+	topt->entry = entry;
+
+}
+
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer)
+{
+	struct trace_option_dentry *topts;
+	struct tracer_flags *flags;
+	struct tracer_opt *opts;
+	int cnt;
+
+	if (!tracer)
+		return NULL;
+
+	flags = tracer->flags;
+
+	if (!flags || !flags->opts)
+		return NULL;
+
+	opts = flags->opts;
+
+	for (cnt = 0; opts[cnt].name; cnt++)
+		;
+
+	topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL);
+	if (!topts)
+		return NULL;
+
+	for (cnt = 0; opts[cnt].name; cnt++)
+		create_trace_option_file(&topts[cnt], flags,
+					 &opts[cnt]);
+
+	return topts;
+}
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts)
+{
+	int cnt;
+
+	if (!topts)
+		return;
+
+	for (cnt = 0; topts[cnt].opt; cnt++) {
+		if (topts[cnt].entry)
+			debugfs_remove(topts[cnt].entry);
+	}
+
+	kfree(topts);
+}
+
 static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 {
-- 
cgit v1.1


From d8e83d26b5ab3b31ee0ff6d093a2627707a1e221 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Feb 2009 23:55:58 -0500
Subject: tracing: add protection around open use of current_tracer

Impact: fix to possible race conditions

There's some uses of current_tracer that is not protected by the
trace_types_lock. There is a small chance that a sysadmin changes
the tracer while the current_tracer is being referenced.

If the race is hit, it is unlikely to cause any harm since the
tracers are constant and are not freed. But some strang side
effects may occur.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 485c6e7..6c89ec6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2024,12 +2024,12 @@ static ssize_t
 tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
-	int i;
+	struct tracer_opt *trace_opts;
+	u32 tracer_flags;
+	int len = 0;
 	char *buf;
 	int r = 0;
-	int len = 0;
-	u32 tracer_flags = current_trace->flags->val;
-	struct tracer_opt *trace_opts = current_trace->flags->opts;
+	int i;
 
 
 	/* calculate max size */
@@ -2038,6 +2038,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		len += 3; /* "no" and space */
 	}
 
+	mutex_lock(&trace_types_lock);
+	tracer_flags = current_trace->flags->val;
+	trace_opts = current_trace->flags->opts;
+
 	/*
 	 * Increase the size with names of options specific
 	 * of the current tracer.
@@ -2049,8 +2053,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 
 	/* +2 for \n and \0 */
 	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
+	if (!buf) {
+		mutex_unlock(&trace_types_lock);
 		return -ENOMEM;
+	}
 
 	for (i = 0; trace_options[i]; i++) {
 		if (trace_flags & (1 << i))
@@ -2067,6 +2073,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 			r += sprintf(buf + r, "no%s ",
 				trace_opts[i].name);
 	}
+	mutex_unlock(&trace_types_lock);
 
 	r += sprintf(buf + r, "\n");
 	WARN_ON(r >= len + 2);
@@ -2074,7 +2081,6 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 
 	kfree(buf);
-
 	return r;
 }
 
@@ -2149,7 +2155,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 
 	/* If no option could be set, test the specific tracer options */
 	if (!trace_options[i]) {
+		mutex_lock(&trace_types_lock);
 		ret = set_tracer_option(current_trace, cmp, neg);
+		mutex_unlock(&trace_types_lock);
 		if (ret)
 			return ret;
 	}
-- 
cgit v1.1


From 85a2f9b46f8cd8aaa11c64c715e1ea3ec27ec486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 00:12:38 -0500
Subject: tracing: use pointer error returns for __tracing_open

Impact: fix compile warning and clean up

When I first wrote __tracing_open, instead of passing the error
code via the ERR_PTR macros, I lazily used a separate parameter
to hold the return for errors.

When Frederic Weisbecker updated that function, he used the Linux
kernel ERR_PTR for the returns. This caused the parameter return
to possibly not be initialized on error. gcc correctly pointed this
out with a warning.

This patch converts the entire function to use the Linux kernel
ERR_PTR macro methods.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c89ec6..304e02c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1684,23 +1684,20 @@ static struct seq_operations tracer_seq_ops = {
 };
 
 static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file, int *ret)
+__tracing_open(struct inode *inode, struct file *file)
 {
 	long cpu_file = (long) inode->i_private;
+	void *fail_ret = ERR_PTR(-ENOMEM);
 	struct trace_iterator *iter;
 	struct seq_file *m;
-	int cpu;
+	int cpu, ret;
 
-	if (tracing_disabled) {
-		*ret = -ENODEV;
-		return NULL;
-	}
+	if (tracing_disabled)
+		return ERR_PTR(-ENODEV);
 
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter) {
-		*ret = -ENOMEM;
-		goto out;
-	}
+	if (!iter)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * We make a copy of the current tracer to avoid concurrent
@@ -1708,10 +1705,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	 */
 	mutex_lock(&trace_types_lock);
 	iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
-	if (!iter->trace) {
-		*ret = -ENOMEM;
+	if (!iter->trace)
 		goto fail;
-	}
+
 	if (current_trace)
 		*iter->trace = *current_trace;
 
@@ -1750,9 +1746,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	}
 
 	/* TODO stop tracer */
-	*ret = seq_open(file, &tracer_seq_ops);
-	if (*ret)
+	ret = seq_open(file, &tracer_seq_ops);
+	if (ret < 0) {
+		fail_ret = ERR_PTR(ret);
 		goto fail_buffer;
+	}
 
 	m = file->private_data;
 	m->private = iter;
@@ -1762,7 +1760,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 
 	mutex_unlock(&trace_types_lock);
 
- out:
 	return iter;
 
  fail_buffer:
@@ -1775,7 +1772,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	kfree(iter->trace);
 	kfree(iter);
 
-	return ERR_PTR(-ENOMEM);
+	return fail_ret;
 }
 
 int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -1815,9 +1812,12 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 static int tracing_open(struct inode *inode, struct file *file)
 {
-	int ret;
+	struct trace_iterator *iter;
+	int ret = 0;
 
-	__tracing_open(inode, file, &ret);
+	iter = __tracing_open(inode, file);
+	if (IS_ERR(iter))
+		ret = PTR_ERR(iter);
 
 	return ret;
 }
@@ -1825,11 +1825,13 @@ static int tracing_open(struct inode *inode, struct file *file)
 static int tracing_lt_open(struct inode *inode, struct file *file)
 {
 	struct trace_iterator *iter;
-	int ret;
+	int ret = 0;
 
-	iter = __tracing_open(inode, file, &ret);
+	iter = __tracing_open(inode, file);
 
-	if (!ret)
+	if (IS_ERR(iter))
+		ret = PTR_ERR(iter);
+	else
 		iter->iter_flags |= TRACE_FILE_LAT_FMT;
 
 	return ret;
-- 
cgit v1.1


From 5c6a3ae1b4beebb56e2916b84f1208d96a9e32ff Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 00:22:21 -0500
Subject: tracing: use newline separator for trace options list

Impact: clean up

Instead of listing the trace options like:

 # cat /debug/tracing/trace_options
print-parent nosym-offset nosym-addr noverbose noraw nohex nobin noblock nostacktrace nosched-tree ftrace_printk noftrace_preempt nobranch annotate nouserstacktrace nosym-userobj

We now list them like:

 # cat /debug/tracing/trace_options
print-parent
nosym-offset
nosym-addr
noverbose
noraw
nohex
nobin
noblock
nostacktrace
nosched-tree
ftrace_printk
noftrace_preempt
nobranch
annotate
nouserstacktrace
nosym-userobj

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 304e02c..5db7485 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2037,7 +2037,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	/* calculate max size */
 	for (i = 0; trace_options[i]; i++) {
 		len += strlen(trace_options[i]);
-		len += 3; /* "no" and space */
+		len += 3; /* "no" and newline */
 	}
 
 	mutex_lock(&trace_types_lock);
@@ -2050,7 +2050,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	 */
 	for (i = 0; trace_opts[i].name; i++) {
 		len += strlen(trace_opts[i].name);
-		len += 3; /* "no" and space */
+		len += 3; /* "no" and newline */
 	}
 
 	/* +2 for \n and \0 */
@@ -2062,22 +2062,21 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 
 	for (i = 0; trace_options[i]; i++) {
 		if (trace_flags & (1 << i))
-			r += sprintf(buf + r, "%s ", trace_options[i]);
+			r += sprintf(buf + r, "%s\n", trace_options[i]);
 		else
-			r += sprintf(buf + r, "no%s ", trace_options[i]);
+			r += sprintf(buf + r, "no%s\n", trace_options[i]);
 	}
 
 	for (i = 0; trace_opts[i].name; i++) {
 		if (tracer_flags & trace_opts[i].bit)
-			r += sprintf(buf + r, "%s ",
+			r += sprintf(buf + r, "%s\n",
 				trace_opts[i].name);
 		else
-			r += sprintf(buf + r, "no%s ",
+			r += sprintf(buf + r, "no%s\n",
 				trace_opts[i].name);
 	}
 	mutex_unlock(&trace_types_lock);
 
-	r += sprintf(buf + r, "\n");
 	WARN_ON(r >= len + 2);
 
 	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-- 
cgit v1.1


From 0cfe82451dfa3ebf4e69158f2eb450f2fbb6b715 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 10:51:10 -0500
Subject: tracing: replace kzalloc with kcalloc

Impact: clean up

kcalloc is a better approach to allocate a NULL array.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5db7485..9c5987a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3328,7 +3328,7 @@ create_trace_option_files(struct tracer *tracer)
 	for (cnt = 0; opts[cnt].name; cnt++)
 		;
 
-	topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL);
+	topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
 	if (!topts)
 		return NULL;
 
-- 
cgit v1.1


From eb594e45f6979cd10b18d87f7b3f02119e00a108 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 17:36:06 -0500
Subject: tracing: move trace point formats to files in include/trace directory

Impact: clean up

To further facilitate the ease of adding trace points for developers, this
patch creates include/trace/trace_events.h and
include/trace/trace_event_types.h.

The former file will hold the trace/<type>.h files and the latter will hold
the trace/<type>_event_types.h files.

To create new tracepoints and to have them automatically
appear in the event tracer, a developer makes the trace/<type>.h file
which includes <linux/tracepoint.h> and the trace/<type>_event_types.h file.

The trace/<type>_event_types.h file will hold the TRACE_FORMAT
macros.

Then add the trace/<type>.h file to trace/trace_events.h,
and add the trace/<type>_event_types.h to the trace_event_types.h file.

No need to modify files elsewhere.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 3c75623..46e27ad 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -1,15 +1,9 @@
 /*
  * This is the place to register all trace points as events.
- * Include the trace/<type>.h at the top.
- * Include the trace/<type>_event_types.h at the bottom.
  */
 
-/* trace/<type>.h here */
-#include <trace/sched.h>
-#include <trace/irq.h>
+#include <trace/trace_events.h>
 
 #include "trace_events.h"
 
-/* trace/<type>_event_types.h here */
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
+#include <trace/trace_event_types.h>
-- 
cgit v1.1


From 6ecc2d1ca39177edb6fbdb7412948b0e9f409d02 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 21:33:02 -0500
Subject: tracing: add subsystem level to trace events

If a trace point header defines TRACE_SYSTEM, then it will add the
following trace points into that event system.

If include/trace/irq_event_types.h has:

 #define TRACE_SYSTEM irq

at the top and

 #undef TRACE_SYSTEM

at the bottom, then a directory "irq" will be created in the
/debug/tracing/events directory. Inside that directory will contain the
two trace points that are defined in include/trace/irq_event_types.h.

Only adding the above to irq and not to sched, we get:

 # ls /debug/tracing/events/
irq                     sched_process_exit  sched_signal_send  sched_wakeup_new
sched_kthread_stop      sched_process_fork  sched_switch
sched_kthread_stop_ret  sched_process_free  sched_wait_task
sched_migrate_task      sched_process_wait  sched_wakeup

 # ls /debug/tracing/events/irq
irq_handler_entry  irq_handler_exit

If we add #define TRACE_SYSTEM sched to the trace/sched_event_types.h
then the rest of the trace events will be put in a sched directory
within the events directory.

I've been playing with this idea of the subsystem for a while, but
recently Tom Zanussi posted some patches to lkml that included this
method. Tom's approach was clean and got me to finally put some effort
to clean up the event trace points.

Thanks to Tom Zanussi for demonstrating how nice the subsystem
method is.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c       |  4 ++++
 kernel/trace/trace_events.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.h |  2 ++
 3 files changed, 54 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 46e27ad..4e4e458 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -2,6 +2,10 @@
  * This is the place to register all trace points as events.
  */
 
+/* someday this needs to go in a generic header */
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
 #include <trace/trace_events.h>
 
 #include "trace_events.h"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3bcb9df..1933220 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -345,11 +345,59 @@ static struct dentry *event_trace_events_dir(void)
 	return d_events;
 }
 
+struct event_subsystem {
+	struct list_head	list;
+	const char		*name;
+	struct dentry		*entry;
+};
+
+static LIST_HEAD(event_subsystems);
+
+static struct dentry *
+event_subsystem_dir(const char *name, struct dentry *d_events)
+{
+	struct event_subsystem *system;
+
+	/* First see if we did not already create this dir */
+	list_for_each_entry(system, &event_subsystems, list) {
+		if (strcmp(system->name, name) == 0)
+			return system->entry;
+	}
+
+	/* need to create new entry */
+	system = kmalloc(sizeof(*system), GFP_KERNEL);
+	if (!system) {
+		pr_warning("No memory to create event subsystem %s\n",
+			   name);
+		return d_events;
+	}
+
+	system->entry = debugfs_create_dir(name, d_events);
+	if (!system->entry) {
+		pr_warning("Could not create event subsystem %s\n",
+			   name);
+		kfree(system);
+		return d_events;
+	}
+
+	system->name = name;
+	list_add(&system->list, &event_subsystems);
+
+	return system->entry;
+}
+
 static int
 event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 {
 	struct dentry *entry;
 
+	/*
+	 * If the trace point header did not define TRACE_SYSTEM
+	 * then the system would be called "TRACE_SYSTEM".
+	 */
+	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+		d_events = event_subsystem_dir(call->system, d_events);
+
 	call->dir = debugfs_create_dir(call->name, d_events);
 	if (!call->dir) {
 		pr_warning("Could not create debugfs "
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
index deb95e5..b015d7b 100644
--- a/kernel/trace/trace_events.h
+++ b/kernel/trace/trace_events.h
@@ -7,6 +7,7 @@
 
 struct ftrace_event_call {
 	char		*name;
+	char		*system;
 	struct dentry	*dir;
 	int		enabled;
 	int		(*regfunc)(void);
@@ -44,6 +45,7 @@ static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
+	.system			= STR(TRACE_SYSTEM),			\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
-- 
cgit v1.1


From b628b3e629b1436710e59a21cc020fbb04a52ce1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 23:32:58 -0500
Subject: tracing: make the set_event and available_events subsystem aware

This patch makes the event files, set_event and available_events
aware of the subsystem.

Now you can enable an entire subsystem with:

  echo 'irq:*' > set_event

Note: the '*' is not needed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1933220..b811eb3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -12,6 +12,8 @@
 
 #include "trace_events.h"
 
+#define TRACE_SYSTEM "TRACE_SYSTEM"
+
 #define events_for_each(event)						\
 	for (event = __start_ftrace_events;				\
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
@@ -45,14 +47,47 @@ static void ftrace_clear_events(void)
 static int ftrace_set_clr_event(char *buf, int set)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
+	char *event = NULL, *sub = NULL, *match;
+	int ret = -EINVAL;
+
+	/*
+	 * The buf format can be <subsystem>:<event-name>
+	 *  *:<event-name> means any event by that name.
+	 *  :<event-name> is the same.
+	 *
+	 *  <subsystem>:* means all events in that subsystem
+	 *  <subsystem>: means the same.
+	 *
+	 *  <name> (no ':') means all events in a subsystem with
+	 *  the name <name> or any event that matches <name>
+	 */
+
+	match = strsep(&buf, ":");
+	if (buf) {
+		sub = match;
+		event = buf;
+		match = NULL;
 
+		if (!strlen(sub) || strcmp(sub, "*") == 0)
+			sub = NULL;
+		if (!strlen(event) || strcmp(event, "*") == 0)
+			event = NULL;
+	}
 
 	events_for_each(call) {
 
 		if (!call->name)
 			continue;
 
-		if (strcmp(buf, call->name) != 0)
+		if (match &&
+		    strcmp(match, call->name) != 0 &&
+		    strcmp(match, call->system) != 0)
+			continue;
+
+		if (sub && strcmp(sub, call->system) != 0)
+			continue;
+
+		if (event && strcmp(event, call->name) != 0)
 			continue;
 
 		if (set) {
@@ -68,9 +103,9 @@ static int ftrace_set_clr_event(char *buf, int set)
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		return 0;
+		ret = 0;
 	}
-	return -EINVAL;
+	return ret;
 }
 
 /* 128 should be much more than enough */
@@ -200,6 +235,8 @@ static int t_show(struct seq_file *m, void *v)
 {
 	struct ftrace_event_call *call = v;
 
+	if (strcmp(call->system, TRACE_SYSTEM) != 0)
+		seq_printf(m, "%s:", call->system);
 	seq_printf(m, "%s\n", call->name);
 
 	return 0;
-- 
cgit v1.1


From ef5580d0fffce6e0a01043bac0625128b5d409a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 19:38:04 -0500
Subject: tracing: add interface to write into current tracer buffer

Right now all tracers must manage their own trace buffers. This was
to enforce tracers to be independent in case we finally decide to
allow each tracer to have their own trace buffer.

But now we are adding event tracing that writes to the current tracer's
buffer. This adds an interface to allow events to write to the current
tracer buffer without having to manage its own. Since event tracing
has no "tracer", and is just a way to hook into any other tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 14 ++++++++++++++
 kernel/trace/trace.h |  6 ++++++
 2 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9c5987a..c5e39cd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -846,6 +846,20 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 	trace_wake_up();
 }
 
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+				  unsigned long flags, int pc)
+{
+	return trace_buffer_lock_reserve(&global_trace,
+					 type, len, flags, pc);
+}
+
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc)
+{
+	return trace_buffer_unlock_commit(&global_trace, event, flags, pc);
+}
+
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6321917..adf161f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -442,6 +442,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 				struct ring_buffer_event *event,
 				unsigned long flags, int pc);
 
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+				  unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
-- 
cgit v1.1


From c32e827b25054cb17b79cf97fb5e63ae4ce2223c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 19:12:30 -0500
Subject: tracing: add raw trace point recording infrastructure

Impact: lower overhead tracing

The current event tracer can automatically pick up trace points
that are registered with the TRACE_FORMAT macro. But it required
a printf format string and parsing. Although, this adds the ability
to get guaranteed information like task names and such, it took
a hit in overhead processing. This processing can add about 500-1000
nanoseconds overhead, but in some cases that too is considered
too much and we want to shave off as much from this overhead as
possible.

Tom Zanussi recently posted tracing patches to lkml that are based
on a nice idea about capturing the data via C structs using
STRUCT_ENTER, STRUCT_EXIT type of macros.

I liked that method very much, but did not like the implementation
that required a developer to add data/code in several disjoint
locations.

This patch extends the event_tracer macros to do a similar "raw C"
approach that Tom Zanussi did. But instead of having the developers
needing to tweak a bunch of code all over the place, they can do it
all in one macro - preferably placed near the code that it is
tracing. That makes it much more likely that tracepoints will be
maintained on an ongoing basis by the code they modify.

The new macro TRACE_EVENT_FORMAT is created for this approach. (Note,
a developer may still utilize the more low level DECLARE_TRACE macros
if they don't care about getting their traces automatically in the event
tracer.)

They can also use the existing TRACE_FORMAT if they don't need to code
the tracepoint in C, but just want to use the convenience of printf.

So if the developer wants to "hardwire" a tracepoint in the fastest
possible way, and wants to acquire their data via a user space utility
in a raw binary format, or wants to see it in the trace output but not
sacrifice any performance, then they can implement the faster but
more complex TRACE_EVENT_FORMAT macro.

Here's what usage looks like:

  TRACE_EVENT_FORMAT(name,
	TPPROTO(proto),
	TPARGS(args),
	TPFMT(fmt, fmt_args),
	TRACE_STUCT(
		TRACE_FIELD(type1, item1, assign1)
		TRACE_FIELD(type2, item2, assign2)
			[...]
	),
	TPRAWFMT(raw_fmt)
	);

Note name, proto, args, and fmt, are all identical to what TRACE_FORMAT
uses.

 name: is the unique identifier of the trace point
 proto: The proto type that the trace point uses
 args: the args in the proto type
 fmt: printf format to use with the event printf tracer
 fmt_args: the printf argments to match fmt

 TRACE_STRUCT starts the ability to create a structure.
 Each item in the structure is defined with a TRACE_FIELD

  TRACE_FIELD(type, item, assign)

 type: the C type of item.
 item: the name of the item in the stucture
 assign: what to assign the item in the trace point callback

 raw_fmt is a way to pretty print the struct. It must match
  the order of the items are added in TRACE_STUCT

 An example of this would be:

 TRACE_EVENT_FORMAT(sched_wakeup,
	TPPROTO(struct rq *rq, struct task_struct *p, int success),
	TPARGS(rq, p, success),
	TPFMT("task %s:%d %s",
	      p->comm, p->pid, success?"succeeded":"failed"),
	TRACE_STRUCT(
		TRACE_FIELD(pid_t, pid, p->pid)
		TRACE_FIELD(int, success, success)
	),
	TPRAWFMT("task %d success=%d")
	);

 This creates us a unique struct of:

 struct {
	pid_t		pid;
	int		success;
 };

 And the way the call back would assign these values would be:

	entry->pid = p->pid;
	entry->success = success;

The nice part about this is that the creation of the assignent is done
via macro magic in the event tracer.  Once the TRACE_EVENT_FORMAT is
created, the developer will then have a faster method to record
into the ring buffer. They do not need to worry about the tracer itself.

The developer would only need to touch the files in include/trace/*.h

Again, I would like to give special thanks to Tom Zanussi for this
nice idea.

Idea-from: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c               |   6 +-
 kernel/trace/trace.h                |  19 ++++
 kernel/trace/trace_events.c         |   2 +-
 kernel/trace/trace_events.h         |  57 ----------
 kernel/trace/trace_events_stage_1.h |  34 ++++++
 kernel/trace/trace_events_stage_2.h |  72 ++++++++++++
 kernel/trace/trace_events_stage_3.h | 219 ++++++++++++++++++++++++++++++++++++
 7 files changed, 350 insertions(+), 59 deletions(-)
 delete mode 100644 kernel/trace/trace_events.h
 create mode 100644 kernel/trace/trace_events_stage_1.h
 create mode 100644 kernel/trace/trace_events_stage_2.h
 create mode 100644 kernel/trace/trace_events_stage_3.h

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 4e4e458..f2509cb 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -8,6 +8,10 @@
 
 #include <trace/trace_events.h>
 
-#include "trace_events.h"
+#include "trace_output.h"
+
+#include "trace_events_stage_1.h"
+#include "trace_events_stage_2.h"
+#include "trace_events_stage_3.h"
 
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index adf161f..aa1ab0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -726,4 +726,23 @@ static inline void trace_branch_disable(void)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
+struct ftrace_event_call {
+	char		*name;
+	char		*system;
+	struct dentry	*dir;
+	int		enabled;
+	int		(*regfunc)(void);
+	void		(*unregfunc)(void);
+	int		id;
+	struct dentry	*raw_dir;
+	int		raw_enabled;
+	int		(*raw_init)(void);
+	int		(*raw_reg)(void);
+	void		(*raw_unreg)(void);
+};
+
+void event_trace_printk(unsigned long ip, const char *fmt, ...);
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b811eb3..77a5c02 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 
-#include "trace_events.h"
+#include "trace.h"
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
deleted file mode 100644
index b015d7b..0000000
--- a/kernel/trace/trace_events.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef _LINUX_KERNEL_TRACE_EVENTS_H
-#define _LINUX_KERNEL_TRACE_EVENTS_H
-
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include "trace.h"
-
-struct ftrace_event_call {
-	char		*name;
-	char		*system;
-	struct dentry	*dir;
-	int		enabled;
-	int		(*regfunc)(void);
-	void		(*unregfunc)(void);
-};
-
-
-#undef TPFMT
-#define TPFMT(fmt, args...)	fmt "\n", ##args
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (!ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call);				\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
-	.system			= STR(TRACE_SYSTEM),			\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-}
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
new file mode 100644
index 0000000..fd3bf93
--- /dev/null
+++ b/kernel/trace/trace_events_stage_1.h
@@ -0,0 +1,34 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_event_types.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *	struct trace_entry		ent;
+ *	<type>				<item>;
+ *	[...]
+ * };
+ *
+ * The <type> <item> is created by the TRACE_FIELD(type, item, assign)
+ * macro. We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)	\
+	struct ftrace_raw_##name {					\
+		struct trace_entry	ent;				\
+		tstruct							\
+	};								\
+	static struct ftrace_event_call event_##name
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#define TRACE_FIELD(type, item, assign) \
+	type item;
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
new file mode 100644
index 0000000..3eaaef5
--- /dev/null
+++ b/kernel/trace/trace_events_stage_2.h
@@ -0,0 +1,72 @@
+/*
+ * Stage 2 of the trace events.
+ *
+ * Override the macros in <trace/trace_event_types.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *	struct trace_seq *s = &iter->seq;
+ *	struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *	struct trace_entry *entry;
+ *	int ret;
+ *
+ *	entry = iter->ent;
+ *
+ *	if (entry->type != event_<call>.id) {
+ *		WARN_ON_ONCE(1);
+ *		return TRACE_TYPE_UNHANDLED;
+ *	}
+ *
+ *	field = (typeof(field))entry;
+ *
+ *	ret = trace_seq_printf(s, <TPRAWFMT> "%s", <ARGS> "\n");
+ *	if (!ret)
+ *		return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *	return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign) \
+	field->item,
+
+
+#undef TPRAWFMT
+#define TPRAWFMT(args...)	args
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+enum print_line_t							\
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+{									\
+	struct trace_seq *s = &iter->seq;				\
+	struct ftrace_raw_##call *field;				\
+	struct trace_entry *entry;					\
+	int ret;							\
+									\
+	entry = iter->ent;						\
+									\
+	if (entry->type != event_##call.id) {				\
+		WARN_ON_ONCE(1);					\
+		return TRACE_TYPE_UNHANDLED;				\
+	}								\
+									\
+	field = (typeof(field))entry;					\
+									\
+	ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n");		\
+	if (!ret)							\
+		return TRACE_TYPE_PARTIAL_LINE;				\
+									\
+	return TRACE_TYPE_HANDLED;					\
+}
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
new file mode 100644
index 0000000..7a161c4
--- /dev/null
+++ b/kernel/trace/trace_events_stage_3.h
@@ -0,0 +1,219 @@
+/*
+ * Stage 3 of the trace events.
+ *
+ * Override the macros in <trace/trace_event_types.h> to include the following:
+ *
+ * static void ftrace_event_<call>(proto)
+ * {
+ * 	event_trace_printk(_RET_IP_, "(<call>) " <fmt>);
+ * }
+ *
+ * static int ftrace_reg_event_<call>(void)
+ * {
+ * 	int ret;
+ *
+ * 	ret = register_trace_<call>(ftrace_event_<call>);
+ * 	if (!ret)
+ * 		pr_info("event trace: Could not activate trace point "
+ * 			"probe to  <call>");
+ * 	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ * 	unregister_trace_<call>(ftrace_event_<call>);
+ * }
+ *
+ * For those macros defined with TRACE_FORMAT:
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ * 	.name 			= "<call>",
+ * 	.regfunc		= ftrace_reg_event_<call>,
+ * 	.unregfunc		= ftrace_unreg_event_<call>,
+ * }
+ *
+ *
+ * For those macros defined with TRACE_EVENT_FORMAT:
+ *
+ * static struct ftrace_event_call event_<call>;
+ *
+ * static void ftrace_raw_event_<call>(proto)
+ * {
+ * 	struct ring_buffer_event *event;
+ * 	struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ * 	unsigned long irq_flags;
+ * 	int pc;
+ *
+ * 	local_save_flags(irq_flags);
+ * 	pc = preempt_count();
+ *
+ * 	event = trace_current_buffer_lock_reserve(event_<call>.id,
+ * 				  sizeof(struct ftrace_raw_<call>),
+ * 				  irq_flags, pc);
+ * 	if (!event)
+ * 		return;
+ * 	entry	= ring_buffer_event_data(event);
+ *
+ * 	<tstruct>;  <-- Here we assign the entries by the TRACE_FIELD.
+ *
+ * 	trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * }
+ *
+ * static int ftrace_raw_reg_event_<call>(void)
+ * {
+ * 	int ret;
+ *
+ * 	ret = register_trace_<call>(ftrace_raw_event_<call>);
+ * 	if (!ret)
+ * 		pr_info("event trace: Could not activate trace point "
+ * 			"probe to <call>");
+ * 	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ * 	unregister_trace_<call>(ftrace_raw_event_<call>);
+ * }
+ *
+ * static struct trace_event ftrace_event_type_<call> = {
+ * 	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ * };
+ *
+ * static int ftrace_raw_init_event_<call>(void)
+ * {
+ * 	int id;
+ *
+ * 	id = register_ftrace_event(&ftrace_event_type_<call>);
+ * 	if (!id)
+ * 		return -ENODEV;
+ * 	event_<call>.id = id;
+ * 	return 0;
+ * }
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ * 	.name 			= "<call>",
+ * 	.regfunc		= ftrace_reg_event_<call>,
+ * 	.unregfunc		= ftrace_unreg_event_<call>,
+ * 	.raw_init		= ftrace_raw_init_event_<call>,
+ * 	.raw_reg		= ftrace_raw_reg_event_<call>,
+ * 	.raw_unreg		= ftrace_raw_unreg_event_<call>,
+ * }
+ *
+ */
+
+#undef TPFMT
+#define TPFMT(fmt, args...)	fmt "\n", ##args
+
+#define _TRACE_FORMAT(call, proto, args, fmt)				\
+static void ftrace_event_##call(proto)					\
+{									\
+	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
+}									\
+									\
+static int ftrace_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_event_##call);		\
+	if (!ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call);				\
+	return ret;							\
+}									\
+									\
+static void ftrace_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_event_##call);			\
+}									\
+
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)				\
+_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.system			= STR(TRACE_SYSTEM),			\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+}
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static void ftrace_raw_event_##call(proto)				\
+{									\
+	struct ring_buffer_event *event;				\
+	struct ftrace_raw_##call *entry;				\
+	unsigned long irq_flags;					\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	event = trace_current_buffer_lock_reserve(event_##call.id,	\
+				  sizeof(struct ftrace_raw_##call), 	\
+				  irq_flags, pc);			\
+	if (!event)							\
+		return;							\
+	entry	= ring_buffer_event_data(event);			\
+									\
+	tstruct;							\
+									\
+	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+}									\
+									\
+static int ftrace_raw_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_raw_event_##call);		\
+	if (!ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call);				\
+	return ret;							\
+}									\
+									\
+static void ftrace_raw_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_raw_event_##call);		\
+}									\
+									\
+static struct trace_event ftrace_event_type_##call = {			\
+	.trace			= ftrace_raw_output_##call,		\
+};									\
+									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(&ftrace_event_type_##call);		\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	return 0;							\
+}									\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.system			= STR(TRACE_SYSTEM),			\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
+	.raw_reg		= ftrace_raw_reg_event_##call,		\
+	.raw_unreg		= ftrace_raw_unreg_event_##call,	\
+}
-- 
cgit v1.1


From fd99498989f3b3feeab89dcadf537138ba136d24 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 28 Feb 2009 02:41:25 -0500
Subject: tracing: add raw fast tracing interface for trace events

This patch adds the interface to enable the C style trace points.
In the directory /debugfs/tracing/events/subsystem/event
We now have three files:

 enable : values 0 or 1 to enable or disable the trace event.

 available_types: values 'raw' and 'printf' which indicate the tracing
       types available for the trace point. If a developer does not
       use the TRACE_EVENT_FORMAT macro and just uses the TRACE_FORMAT
       macro, then only 'printf' will be available. This file is
       read only.

 type: values 'raw' or 'printf'. This indicates which type of tracing
       is active for that trace point. 'printf' is the default and
       if 'raw' is not available, this file is read only.

 # echo raw > /debug/tracing/events/sched/sched_wakeup/type
 # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable

 Will enable the C style tracing for the sched_wakeup trace point.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h        |   7 ++
 kernel/trace/trace_events.c | 199 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 181 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index aa1ab0c..f6fa0b9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -726,6 +726,12 @@ static inline void trace_branch_disable(void)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
+/* trace event type bit fields, not numeric */
+enum {
+	TRACE_EVENT_TYPE_PRINTF		= 1,
+	TRACE_EVENT_TYPE_RAW		= 2,
+};
+
 struct ftrace_event_call {
 	char		*name;
 	char		*system;
@@ -736,6 +742,7 @@ struct ftrace_event_call {
 	int		id;
 	struct dentry	*raw_dir;
 	int		raw_enabled;
+	int		type;
 	int		(*raw_init)(void);
 	int		(*raw_reg)(void);
 	void		(*raw_unreg)(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 77a5c02..1d07f80 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -44,6 +44,36 @@ static void ftrace_clear_events(void)
 	}
 }
 
+static void ftrace_event_enable_disable(struct ftrace_event_call *call,
+					int enable)
+{
+
+	switch (enable) {
+	case 0:
+		if (call->enabled) {
+			call->enabled = 0;
+			call->unregfunc();
+		}
+		if (call->raw_enabled) {
+			call->raw_enabled = 0;
+			call->raw_unreg();
+		}
+		break;
+	case 1:
+		if (!call->enabled &&
+		    (call->type & TRACE_EVENT_TYPE_PRINTF)) {
+			call->enabled = 1;
+			call->regfunc();
+		}
+		if (!call->raw_enabled &&
+		    (call->type & TRACE_EVENT_TYPE_RAW)) {
+			call->raw_enabled = 1;
+			call->raw_reg();
+		}
+		break;
+	}
+}
+
 static int ftrace_set_clr_event(char *buf, int set)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
@@ -90,19 +120,8 @@ static int ftrace_set_clr_event(char *buf, int set)
 		if (event && strcmp(event, call->name) != 0)
 			continue;
 
-		if (set) {
-			/* Already set? */
-			if (call->enabled)
-				return 0;
-			call->enabled = 1;
-			call->regfunc();
-		} else {
-			/* Already cleared? */
-			if (!call->enabled)
-				return 0;
-			call->enabled = 0;
-			call->unregfunc();
-		}
+		ftrace_event_enable_disable(call, set);
+
 		ret = 0;
 	}
 	return ret;
@@ -273,7 +292,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call = filp->private_data;
 	char *buf;
 
-	if (call->enabled)
+	if (call->enabled || call->raw_enabled)
 		buf = "1\n";
 	else
 		buf = "0\n";
@@ -304,18 +323,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	switch (val) {
 	case 0:
-		if (!call->enabled)
-			break;
-
-		call->enabled = 0;
-		call->unregfunc();
-		break;
 	case 1:
-		if (call->enabled)
-			break;
-
-		call->enabled = 1;
-		call->regfunc();
+		ftrace_event_enable_disable(call, val);
 		break;
 
 	default:
@@ -327,6 +336,107 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+event_type_read(struct file *filp, char __user *ubuf, size_t cnt,
+		loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[16];
+	int r = 0;
+
+	if (call->type & TRACE_EVENT_TYPE_PRINTF)
+		r += sprintf(buf, "printf\n");
+
+	if (call->type & TRACE_EVENT_TYPE_RAW)
+		r += sprintf(buf+r, "raw\n");
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+event_type_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		 loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[64];
+
+	/*
+	 * If there's only one type, we can't change it.
+	 * And currently we always have printf type, and we
+	 * may or may not have raw type.
+	 *
+	 * This is a redundant check, the file should be read
+	 * only if this is the case anyway.
+	 */
+
+	if (!call->raw_init)
+		return -EPERM;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (!strncmp(buf, "printf", 6) &&
+	    (!buf[6] || isspace(buf[6]))) {
+
+		call->type = TRACE_EVENT_TYPE_PRINTF;
+
+		/*
+		 * If raw enabled, the disable it and enable
+		 * printf type.
+		 */
+		if (call->raw_enabled) {
+			call->raw_enabled = 0;
+			call->raw_unreg();
+
+			call->enabled = 1;
+			call->regfunc();
+		}
+
+	} else if (!strncmp(buf, "raw", 3) &&
+	    (!buf[3] || isspace(buf[3]))) {
+
+		call->type = TRACE_EVENT_TYPE_RAW;
+
+		/*
+		 * If printf enabled, the disable it and enable
+		 * raw type.
+		 */
+		if (call->enabled) {
+			call->enabled = 0;
+			call->unregfunc();
+
+			call->raw_enabled = 1;
+			call->raw_reg();
+		}
+	} else
+		return -EINVAL;
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
+			   loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[16];
+	int r = 0;
+
+	r += sprintf(buf, "printf\n");
+
+	if (call->raw_init)
+		r += sprintf(buf+r, "raw\n");
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -362,6 +472,17 @@ static const struct file_operations ftrace_enable_fops = {
 	.write = event_enable_write,
 };
 
+static const struct file_operations ftrace_type_fops = {
+	.open = tracing_open_generic,
+	.read = event_type_read,
+	.write = event_type_write,
+};
+
+static const struct file_operations ftrace_available_types_fops = {
+	.open = tracing_open_generic,
+	.read = event_available_types_read,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -427,6 +548,7 @@ static int
 event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 {
 	struct dentry *entry;
+	int ret;
 
 	/*
 	 * If the trace point header did not define TRACE_SYSTEM
@@ -435,6 +557,18 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
 		d_events = event_subsystem_dir(call->system, d_events);
 
+	if (call->raw_init) {
+		ret = call->raw_init();
+		if (ret < 0) {
+			pr_warning("Could not initialize trace point"
+				   " events/%s\n", call->name);
+			return ret;
+		}
+	}
+
+	/* default the output to printf */
+	call->type = TRACE_EVENT_TYPE_PRINTF;
+
 	call->dir = debugfs_create_dir(call->name, d_events);
 	if (!call->dir) {
 		pr_warning("Could not create debugfs "
@@ -448,6 +582,21 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		pr_warning("Could not create debugfs "
 			   "'%s/enable' entry\n", call->name);
 
+	/* Only let type be writable, if we can change it */
+	entry = debugfs_create_file("type",
+				    call->raw_init ? 0644 : 0444,
+				    call->dir, call,
+				    &ftrace_type_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/type' entry\n", call->name);
+
+	entry = debugfs_create_file("available_types", 0444, call->dir, call,
+				    &ftrace_available_types_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/type' available_types\n", call->name);
+
 	return 0;
 }
 
-- 
cgit v1.1


From d20e3b03842bfeb9d21817ff19054c277cc3eac0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 10:53:15 -0500
Subject: tracing: add TRACE_FIELD_SPECIAL to record complex entries

Tom Zanussi pointed out that the simple TRACE_FIELD was not enough to
record trace data that required memcpy. This patch addresses this issue
by adding a TRACE_FIELD_SPECIAL. The format is similar to TRACE_FIELD
but looks like so:

  TRACE_FIELD_SPECIAL(type_item, item, cmd)

What TRACE_FIELD gave was:

  TRACE_FIELD(type, item, assign)

The TRACE_FIELD would be used in declaring a structure:

  struct {
	type	item;
  };

And later assign it via:

  entry->item = assign;

What TRACE_FIELD_SPECIAL gives us is:

In the declaration of the structure:

  struct {
	type_item;
  };

And the assignment:

  cmd;

This change log will explain the one example used in the patch:

 TRACE_EVENT_FORMAT(sched_switch,
	TPPROTO(struct rq *rq, struct task_struct *prev,
		struct task_struct *next),
	TPARGS(rq, prev, next),
	TPFMT("task %s:%d ==> %s:%d",
	      prev->comm, prev->pid, next->comm, next->pid),
	TRACE_STRUCT(
		TRACE_FIELD(pid_t, prev_pid, prev->pid)
		TRACE_FIELD(int, prev_prio, prev->prio)
		TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
				    next_comm,
				    TPCMD(memcpy(TRACE_ENTRY->next_comm,
						 next->comm,
						 TASK_COMM_LEN)))
		TRACE_FIELD(pid_t, next_pid, next->pid)
		TRACE_FIELD(int, next_prio, next->prio)
	),
	TPRAWFMT("prev %d:%d ==> next %s:%d:%d")
	);

 The struct will be create as:

  struct {
	pid_t		prev_pid;
	int		prev_prio;
	char next_comm[TASK_COMM_LEN];
	pid_t		next_pid;
	int		next_prio;
  };

Note the TRACE_ENTRY in the cmd part of TRACE_SPECIAL. TRACE_ENTRY will
be set by the tracer to point to the structure inside the trace buffer.

  entry->prev_pid	= prev->pid;
  entry->prev_prio	= prev->prio;
  memcpy(entry->next_comm, next->comm, TASK_COMM_LEN);
  entry->next_pid	= next->pid;
  entry->next_prio	= next->prio

Reported-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_1.h |  2 ++
 kernel/trace/trace_events_stage_2.h |  4 ++++
 kernel/trace/trace_events_stage_3.h | 14 ++++++++++++++
 3 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index fd3bf93..3830a73 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -30,5 +30,7 @@
 
 #define TRACE_FIELD(type, item, assign) \
 	type item;
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	type_item;
 
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 3eaaef5..dc79fe3 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -39,6 +39,10 @@
 #define TRACE_FIELD(type, item, assign) \
 	field->item,
 
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	field->item,
+
 
 #undef TPRAWFMT
 #define TPRAWFMT(args...)	args
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 7a161c4..2ab65e9 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -147,6 +147,20 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TPCMD
+#define TPCMD(cmd...)	cmd
+
+#undef TRACE_ENTRY
+#define TRACE_ENTRY	entry
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	cmd;
+
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-- 
cgit v1.1


From 11a241a3302277db05561e01477528629d806c4e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 11:49:04 -0500
Subject: tracing: add protection around modify trace event fields

The trace event objects are currently not proctected against
reentrancy. This patch adds a mutex around the modifications of
the trace event fields.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1d07f80..26069fa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -14,6 +14,8 @@
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
+static DEFINE_MUTEX(event_mutex);
+
 #define events_for_each(event)						\
 	for (event = __start_ftrace_events;				\
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
@@ -104,6 +106,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 	}
 
+	mutex_lock(&event_mutex);
 	events_for_each(call) {
 
 		if (!call->name)
@@ -124,6 +127,8 @@ static int ftrace_set_clr_event(char *buf, int set)
 
 		ret = 0;
 	}
+	mutex_unlock(&event_mutex);
+
 	return ret;
 }
 
@@ -324,7 +329,9 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	switch (val) {
 	case 0:
 	case 1:
+		mutex_lock(&event_mutex);
 		ftrace_event_enable_disable(call, val);
+		mutex_unlock(&event_mutex);
 		break;
 
 	default:
-- 
cgit v1.1


From f9520750c4c9924c14325cd951efae5fae58104c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 14:04:40 -0500
Subject: tracing: make trace_seq_reset global and rename to trace_seq_init

Impact: clean up

The trace_seq functions may be used separately outside of the ftrace
iterator. The trace_seq_reset is needed for these operations.

This patch also renames trace_seq_reset to the more appropriate
trace_seq_init.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 17 +++++------------
 kernel/trace/trace.h |  8 ++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c5e39cd..ea055aa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -342,13 +342,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(tsk);
 }
 
-static void
-trace_seq_reset(struct trace_seq *s)
-{
-	s->len = 0;
-	s->readpos = 0;
-}
-
 ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 {
 	int len;
@@ -395,7 +388,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
 	s->buffer[len] = 0;
 	seq_puts(m, s->buffer);
 
-	trace_seq_reset(s);
+	trace_seq_init(s);
 }
 
 /**
@@ -2620,7 +2613,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	if (sret != -EBUSY)
 		return sret;
 
-	trace_seq_reset(&iter->seq);
+	trace_seq_init(&iter->seq);
 
 	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
@@ -2682,7 +2675,7 @@ waitagain:
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
 	if (iter->seq.readpos >= iter->seq.len)
-		trace_seq_reset(&iter->seq);
+		trace_seq_init(&iter->seq);
 
 	/*
 	 * If there was nothing to send to user, inspite of consuming trace
@@ -2819,7 +2812,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		partial[i].offset = 0;
 		partial[i].len = iter->seq.len;
 
-		trace_seq_reset(&iter->seq);
+		trace_seq_init(&iter->seq);
 	}
 
 	mutex_unlock(&iter->mutex);
@@ -3631,7 +3624,7 @@ trace_printk_seq(struct trace_seq *s)
 
 	printk(KERN_TRACE "%s", s->buffer);
 
-	trace_seq_reset(s);
+	trace_seq_init(s);
 }
 
 void ftrace_dump(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f6fa0b9..cf6ba41 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -395,6 +395,14 @@ struct trace_seq {
 	unsigned int		readpos;
 };
 
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+
 #define TRACE_PIPE_ALL_CPU	-1
 
 /*
-- 
cgit v1.1


From 981d081ec8b958b7d962ee40d433581a55d40fc5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 13:53:59 -0500
Subject: tracing: add format file to describe event struct fields

This patch adds the "format" file to the trace point event directory.
This is based off of work by Tom Zanussi, in which a file is exported
to be tread from user land such that a user space app may read the
binary record stored in the ring buffer.

 # cat /debug/tracing/events/sched/sched_switch/format
        field:pid_t prev_pid;   offset:12;      size:4;
        field:int prev_prio;    offset:16;      size:4;
        field special:char next_comm[TASK_COMM_LEN];    offset:20;      size:16;
        field:pid_t next_pid;   offset:36;      size:4;
        field:int next_prio;    offset:40;      size:4;

Idea-from: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h                |  1 +
 kernel/trace/trace_events.c         | 56 ++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace_events_stage_2.h | 52 ++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_stage_3.h |  2 ++
 4 files changed, 110 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cf6ba41..e606633 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -754,6 +754,7 @@ struct ftrace_event_call {
 	int		(*raw_init)(void);
 	int		(*raw_reg)(void);
 	void		(*raw_unreg)(void);
+	int		(*show_format)(struct trace_seq *s);
 };
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 26069fa..d57a772 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3,6 +3,9 @@
  *
  * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
  *
+ *  - Added format output of fields of the trace point.
+ *    This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
+ *
  */
 
 #include <linux/debugfs.h>
@@ -444,6 +447,42 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static ssize_t
+event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	char *buf;
+	int r;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	if (*ppos)
+		return 0;
+
+	r = call->show_format(s);
+	if (!r) {
+		/*
+		 * ug!  The format output is bigger than a PAGE!!
+		 */
+		buf = "FORMAT TOO BIG\n";
+		r = simple_read_from_buffer(ubuf, cnt, ppos,
+					      buf, strlen(buf));
+		goto out;
+	}
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, s->len);
+ out:
+	kfree(s);
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -490,6 +529,11 @@ static const struct file_operations ftrace_available_types_fops = {
 	.read = event_available_types_read,
 };
 
+static const struct file_operations ftrace_event_format_fops = {
+	.open = tracing_open_generic,
+	.read = event_format_read,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -602,7 +646,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				    &ftrace_available_types_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
-			   "'%s/type' available_types\n", call->name);
+			   "'%s/available_types' entry\n", call->name);
+
+	/* A trace may not want to export its format */
+	if (!call->show_format)
+		return 0;
+
+	entry = debugfs_create_file("format", 0444, call->dir, call,
+				    &ftrace_event_format_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/format' entry\n", call->name);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index dc79fe3..3a80ea4 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -74,3 +74,55 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 }
 
 #include <trace/trace_event_types.h>
+
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ * 	struct ftrace_raw_##call field;
+ * 	int ret;
+ *
+ * 	ret = trace_seq_printf(s, #type " " #item ";"
+ * 			       " size:%d; offset:%d;\n",
+ * 			       sizeof(field.type),
+ * 			       offsetof(struct ftrace_raw_##call,
+ * 					item));
+ *
+ * }
+ */
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 2ab65e9..c62a4d2 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -101,6 +101,7 @@
  * 	.raw_init		= ftrace_raw_init_event_<call>,
  * 	.raw_reg		= ftrace_raw_reg_event_<call>,
  * 	.raw_unreg		= ftrace_raw_unreg_event_<call>,
+ *	.show_format		= ftrace_format_<call>,
  * }
  *
  */
@@ -230,4 +231,5 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.raw_reg		= ftrace_raw_reg_event_##call,		\
 	.raw_unreg		= ftrace_raw_unreg_event_##call,	\
+	.show_format		= ftrace_format_##call,			\
 }
-- 
cgit v1.1


From 91729ef96661bfa7dc53923746cd90b62d5495cc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 15:03:01 -0500
Subject: tracing: add ftrace headers to event format files

This patch includes the ftrace header to the event formats files:

 # cat /debug/tracing/events/sched/sched_switch/format
        field:unsigned char type;       offset:0;       size:1;
        field:unsigned char flags;      offset:1;       size:1;
        field:unsigned char preempt_count;      offset:2;       size:1;
        field:int pid;  offset:4;       size:4;
        field:int tgid; offset:8;       size:4;

        field:pid_t prev_pid;   offset:12;      size:4;
        field:int prev_prio;    offset:16;      size:4;
        field special:char next_comm[TASK_COMM_LEN];    offset:20;      size:16;
        field:pid_t next_pid;   offset:36;      size:4;
        field:int next_prio;    offset:40;      size:4;

A blank line is used as a deliminator between the ftrace header and the
trace point fields.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d57a772..cdcc3ae 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 
-#include "trace.h"
+#include "trace_output.h"
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
@@ -447,6 +447,28 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+#undef FIELD
+#define FIELD(type, name) \
+	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
+
+static int trace_write_header(struct trace_seq *s)
+{
+	struct trace_entry field;
+
+	/* struct trace_entry */
+	return trace_seq_printf(s,
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\n",
+				FIELD(unsigned char, type),
+				FIELD(unsigned char, flags),
+				FIELD(unsigned char, preempt_count),
+				FIELD(int, pid),
+				FIELD(int, tgid));
+}
 static ssize_t
 event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
@@ -465,6 +487,9 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (*ppos)
 		return 0;
 
+	/* If this fails, so will the show_format. */
+	trace_write_header(s);
+
 	r = call->show_format(s);
 	if (!r) {
 		/*
-- 
cgit v1.1


From c5e4e19271edfdf1abd4184933d40d646da6a091 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 15:10:02 -0500
Subject: tracing: add trace name and id to event formats

To be able to identify the trace in the binary format output, the
id of the trace event (which is dynamically assigned) must also be listed.

This patch adds the name of the trace point as well as the id assigned.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cdcc3ae..210e71f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -487,7 +487,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (*ppos)
 		return 0;
 
-	/* If this fails, so will the show_format. */
+	/* If any of the first writes fail, so will the show_format. */
+
+	trace_seq_printf(s, "name: %s\n", call->name);
+	trace_seq_printf(s, "ID: %d\n", call->id);
+	trace_seq_printf(s, "format:\n");
 	trace_write_header(s);
 
 	r = call->show_format(s);
-- 
cgit v1.1


From 96ccd21cd13140221bda74a4fc4e53ffeba7c7d4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 15:22:21 -0500
Subject: tracing: add print format to event trace format files

This patch adds the internal print format used to print the raw events
to the event trace point format file.

 # cat /debug/tracing/events/sched/sched_switch/format
name: sched_switch
ID: 29
format:
        field:unsigned char type;       offset:0;       size:1;
        field:unsigned char flags;      offset:1;       size:1;
        field:unsigned char preempt_count;      offset:2;       size:1;
        field:int pid;  offset:4;       size:4;
        field:int tgid; offset:8;       size:4;

        field:pid_t prev_pid;   offset:12;      size:4;
        field:int prev_prio;    offset:16;      size:4;
        field special:char next_comm[TASK_COMM_LEN];    offset:20;      size:16;
        field:pid_t next_pid;   offset:36;      size:4;
        field:int next_prio;    offset:40;      size:4;

print fmt: "prev %d:%d ==> next %s:%d:%d"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 3a80ea4..b1cebba 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -122,6 +122,8 @@ ftrace_format_##call(struct trace_seq *s)				\
 									\
 	tstruct;							\
 									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
 	return ret;							\
 }
 
-- 
cgit v1.1


From 633ddaa7f471e9db181f993c1458d6f4bae321ca Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 09:43:50 -0500
Subject: tracing: fix return value to registering events

The registering of events had the return value check backwards.
A zero returned is success, the check had it as a failure.

This patch also fixes a missing "\n" in the warning that the check
failed.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_3.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index c62a4d2..041789f 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -120,9 +120,9 @@ static int ftrace_reg_event_##call(void)				\
 	int ret;							\
 									\
 	ret = register_trace_##call(ftrace_event_##call);		\
-	if (!ret)							\
+	if (ret)							\
 		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call);				\
+			"probe to " #call "\n");			\
 	return ret;							\
 }									\
 									\
@@ -195,9 +195,9 @@ static int ftrace_raw_reg_event_##call(void)				\
 	int ret;							\
 									\
 	ret = register_trace_##call(ftrace_raw_event_##call);		\
-	if (!ret)							\
+	if (ret)							\
 		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call);				\
+			"probe to " #call "\n");			\
 	return ret;							\
 }									\
 									\
-- 
cgit v1.1


From 41be4da4e85e58520b934040966a6ae919c66c2d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 20:56:48 -0500
Subject: ring-buffer: reset write field for ring_buffer_read_page

Impact: fix ring_buffer_read_page

After a page is swapped into the ring buffer, the write field must
also be reset.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a8c275c..9baad7e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2492,6 +2492,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		rb_init_page(bpage);
 		bpage = cpu_buffer->reader_page->page;
 		cpu_buffer->reader_page->page = *data_page;
+		local_set(&cpu_buffer->reader_page->write, 0);
 		cpu_buffer->reader_page->read = 0;
 		*data_page = bpage;
 	}
-- 
cgit v1.1


From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 00:27:49 -0500
Subject: ring-buffer: fix ring_buffer_read_page

The ring_buffer_read_page was broken if it were to only copy part
of the page. This patch fixes that up as well as adds a parameter
to allow a length field, in order to only copy part of the buffer page.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 92 +++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9baad7e..2ad6bae 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }
 
+size_t ring_buffer_page_len(void *page)
+{
+	return local_read(&((struct buffer_data_page *)page)->commit);
+}
+
 /*
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
@@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
  */
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 {
-	unsigned long addr;
 	struct buffer_data_page *bpage;
+	unsigned long addr;
 
 	addr = __get_free_page(GFP_KERNEL);
 	if (!addr)
@@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
 	bpage = (void *)addr;
 
+	rb_init_page(bpage);
+
 	return bpage;
 }
 
@@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  * ring_buffer_read_page - extract a page from the ring buffer
  * @buffer: buffer to extract from
  * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @len: amount to extract
  * @cpu: the cpu of the buffer to extract
  * @full: should the extraction only happen when the page is full.
  *
@@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *	rpage = ring_buffer_alloc_read_page(buffer);
  *	if (!rpage)
  *		return error;
- *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
+ *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
  *	if (ret >= 0)
  *		process_page(rpage, ret);
  *
@@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct ring_buffer *buffer,
-			    void **data_page, int cpu, int full)
+			  void **data_page, size_t len, int cpu, int full)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
 	struct buffer_data_page *bpage;
+	struct buffer_page *reader;
 	unsigned long flags;
+	unsigned int commit;
 	unsigned int read;
 	int ret = -1;
 
 	if (!data_page)
-		return 0;
+		return -1;
 
 	bpage = *data_page;
 	if (!bpage)
-		return 0;
+		return -1;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
-	/*
-	 * rb_buffer_peek will get the next ring buffer if
-	 * the current reader page is empty.
-	 */
-	event = rb_buffer_peek(buffer, cpu, NULL);
-	if (!event)
+	reader = rb_get_reader_page(cpu_buffer);
+	if (!reader)
 		goto out;
 
-	/* check for data */
-	if (!local_read(&cpu_buffer->reader_page->page->commit))
-		goto out;
+	event = rb_reader_event(cpu_buffer);
+
+	read = reader->read;
+	commit = rb_page_commit(reader);
 
-	read = cpu_buffer->reader_page->read;
 	/*
-	 * If the writer is already off of the read page, then simply
-	 * switch the read page with the given page. Otherwise
-	 * we need to copy the data from the reader to the writer.
+	 * If len > what's left on the page, and the writer is also off of
+	 * the read page, then simply switch the read page with the given
+	 * page. Otherwise we need to copy the data from the reader to the
+	 * writer.
 	 */
-	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
-		unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
+	if ((len < (commit - read)) ||
+	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
+		unsigned int pos = read;
+		unsigned int size;
 
 		if (full)
 			goto out;
-		/* The writer is still on the reader page, we must copy */
-		memcpy(bpage->data + read, rpage->data + read, commit - read);
 
-		/* consume what was read */
-		cpu_buffer->reader_page->read = commit;
+		if (len > (commit - read))
+			len = (commit - read);
+
+		size = rb_event_length(event);
+
+		if (len < size)
+			goto out;
+
+		/* Need to copy one event at a time */
+		do {
+			memcpy(bpage->data + pos, rpage->data + pos, size);
+
+			len -= size;
+
+			rb_advance_reader(cpu_buffer);
+			pos = reader->read;
+
+			event = rb_reader_event(cpu_buffer);
+			size = rb_event_length(event);
+		} while (len > size);
 
 		/* update bpage */
-		local_set(&bpage->commit, commit);
-		if (!read)
-			bpage->time_stamp = rpage->time_stamp;
+		local_set(&bpage->commit, pos);
+		bpage->time_stamp = rpage->time_stamp;
+
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
-		bpage = cpu_buffer->reader_page->page;
-		cpu_buffer->reader_page->page = *data_page;
-		local_set(&cpu_buffer->reader_page->write, 0);
-		cpu_buffer->reader_page->read = 0;
+		bpage = reader->page;
+		reader->page = *data_page;
+		local_set(&reader->write, 0);
+		reader->read = 0;
 		*data_page = bpage;
+
+		/* update the entry counter */
+		rb_remove_entries(cpu_buffer, bpage, read);
 	}
 	ret = read;
 
-	/* update the entry counter */
-	rb_remove_entries(cpu_buffer, bpage, read);
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
cgit v1.1


From e3d6bf0a0781a269f34250fd41e0d3dbfe540cf1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 13:53:07 -0500
Subject: ring-buffer: replace sizeof of event header with offsetof

Impact: fix to possible alignment problems on some archs.

Some arch compilers include an NULL char array in the sizeof field.
Since the ring_buffer_event type includes one of these, it is better
to use the "offsetof" instead, to avoid strange bugs on these archs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2ad6bae..27cf834 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -132,7 +132,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
-#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	28
 
-- 
cgit v1.1


From 474d32b68d6d842f3e710e9ae9fe2568c53339f8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 19:51:40 -0500
Subject: ring-buffer: make ring_buffer_read_page read from start on partial
 page

Impact: dont leave holes in read buffer page

The ring_buffer_read_page swaps a given page with the reader page
of the ring buffer, if certain conditions are set:

 1) requested length is big enough to hold entire page data

 2) a writer is not currently on the page

 3) the page is not partially consumed.

Instead of swapping with the supplied page. It copies the data to
the supplied page instead. But currently the data is copied in the
same offset as the source page. This causes a hole at the start
of the reader page. This complicates the use of this function.
Instead, it should copy the data at the beginning of the function
and update the index fields accordingly.

Other small clean ups are also done in this patch.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 27cf834..f2a163d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -61,6 +61,8 @@ enum {
 
 static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+
 /**
  * tracing_on - enable all tracing buffers
  *
@@ -234,9 +236,16 @@ static void rb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }
 
+/**
+ * ring_buffer_page_len - the size of data on the page.
+ * @page: The page to read
+ *
+ * Returns the amount of data on the page, including buffer page header.
+ */
 size_t ring_buffer_page_len(void *page)
 {
-	return local_read(&((struct buffer_data_page *)page)->commit);
+	return local_read(&((struct buffer_data_page *)page)->commit)
+		+ BUF_PAGE_HDR_SIZE;
 }
 
 /*
@@ -259,7 +268,7 @@ static inline int test_time_stamp(u64 delta)
 	return 0;
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
+#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
 /*
  * head_page == tail_page && head == tail then buffer is empty.
@@ -2454,6 +2463,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	unsigned int read;
 	int ret = -1;
 
+	/*
+	 * If len is not big enough to hold the page header, then
+	 * we can not copy anything.
+	 */
+	if (len <= BUF_PAGE_HDR_SIZE)
+		return -1;
+
+	len -= BUF_PAGE_HDR_SIZE;
+
 	if (!data_page)
 		return -1;
 
@@ -2473,15 +2491,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	commit = rb_page_commit(reader);
 
 	/*
-	 * If len > what's left on the page, and the writer is also off of
-	 * the read page, then simply switch the read page with the given
-	 * page. Otherwise we need to copy the data from the reader to the
-	 * writer.
+	 * If this page has been partially read or
+	 * if len is not big enough to read the rest of the page or
+	 * a writer is still on the page, then
+	 * we must copy the data from the page to the buffer.
+	 * Otherwise, we can simply swap the page with the one passed in.
 	 */
-	if ((len < (commit - read)) ||
+	if (read || (len < (commit - read)) ||
 	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
-		unsigned int pos = read;
+		unsigned int rpos = read;
+		unsigned int pos = 0;
 		unsigned int size;
 
 		if (full)
@@ -2497,12 +2517,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 
 		/* Need to copy one event at a time */
 		do {
-			memcpy(bpage->data + pos, rpage->data + pos, size);
+			memcpy(bpage->data + pos, rpage->data + rpos, size);
 
 			len -= size;
 
 			rb_advance_reader(cpu_buffer);
-			pos = reader->read;
+			rpos = reader->read;
+			pos += size;
 
 			event = rb_reader_event(cpu_buffer);
 			size = rb_event_length(event);
@@ -2512,6 +2533,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&bpage->commit, pos);
 		bpage->time_stamp = rpage->time_stamp;
 
+		/* we copied everything to the beginning */
+		read = 0;
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
-- 
cgit v1.1


From 2cadf9135eb3b6d84b6427314be827ddd443c308 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Dec 2008 22:20:19 -0500
Subject: tracing: add binary buffer files for use with splice

Impact: new feature

This patch creates a directory of files that correspond to the
per CPU ring buffers. These are binary files and are made to
be used with splice. This is the fastest way to extract data from
the ftrace ring buffers.

Thanks to Jiaying Zhang for pushing me to get this code fixed,
 and to Eduard - Gabriel Munteanu for his splice code that helped
 me debug my code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.h |   1 +
 2 files changed, 268 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea055aa..12539f7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,31 +11,30 @@
  *  Copyright (C) 2004-2006 Ingo Molnar
  *  Copyright (C) 2004 William Lee Irwin III
  */
+#include <linux/ring_buffer.h>
 #include <linux/utsrelease.h>
+#include <linux/stacktrace.h>
+#include <linux/writeback.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
+#include <linux/irqflags.h>
 #include <linux/debugfs.h>
 #include <linux/pagemap.h>
 #include <linux/hardirq.h>
 #include <linux/linkage.h>
 #include <linux/uaccess.h>
+#include <linux/kprobes.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
+#include <linux/splice.h>
 #include <linux/kdebug.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
 #include <linux/gfp.h>
 #include <linux/fs.h>
-#include <linux/kprobes.h>
-#include <linux/writeback.h>
-#include <linux/splice.h>
-
-#include <linux/stacktrace.h>
-#include <linux/ring_buffer.h>
-#include <linux/irqflags.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -3005,6 +3004,246 @@ static struct file_operations tracing_mark_fops = {
 	.write		= tracing_mark_write,
 };
 
+struct ftrace_buffer_info {
+	struct trace_array	*tr;
+	void			*spare;
+	int			cpu;
+	unsigned int		read;
+};
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp)
+{
+	int cpu = (int)(long)inode->i_private;
+	struct ftrace_buffer_info *info;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->tr	= &global_trace;
+	info->cpu	= cpu;
+	info->spare	= ring_buffer_alloc_read_page(info->tr->buffer);
+	/* Force reading ring buffer for first read */
+	info->read	= (unsigned int)-1;
+	if (!info->spare)
+		goto out;
+
+	filp->private_data = info;
+
+	return 0;
+
+ out:
+	kfree(info);
+	return -ENOMEM;
+}
+
+static ssize_t
+tracing_buffers_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	struct ftrace_buffer_info *info = filp->private_data;
+	unsigned int pos;
+	ssize_t ret;
+	size_t size;
+
+	/* Do we have previous read data to read? */
+	if (info->read < PAGE_SIZE)
+		goto read;
+
+	info->read = 0;
+
+	ret = ring_buffer_read_page(info->tr->buffer,
+				    &info->spare,
+				    count,
+				    info->cpu, 0);
+	if (ret < 0)
+		return 0;
+
+	pos = ring_buffer_page_len(info->spare);
+
+	if (pos < PAGE_SIZE)
+		memset(info->spare + pos, 0, PAGE_SIZE - pos);
+
+read:
+	size = PAGE_SIZE - info->read;
+	if (size > count)
+		size = count;
+
+	ret = copy_to_user(ubuf, info->spare + info->read, size);
+	if (ret)
+		return -EFAULT;
+	*ppos += size;
+	info->read += size;
+
+	return size;
+}
+
+static int tracing_buffers_release(struct inode *inode, struct file *file)
+{
+	struct ftrace_buffer_info *info = file->private_data;
+
+	ring_buffer_free_read_page(info->tr->buffer, info->spare);
+	kfree(info);
+
+	return 0;
+}
+
+struct buffer_ref {
+	struct ring_buffer	*buffer;
+	void			*page;
+	int			ref;
+};
+
+static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+	if (--ref->ref)
+		return;
+
+	ring_buffer_free_read_page(ref->buffer, ref->page);
+	kfree(ref);
+	buf->private = 0;
+}
+
+static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
+				 struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+				struct pipe_buffer *buf)
+{
+	struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+	ref->ref++;
+}
+
+/* Pipe buffer operations for a buffer. */
+static struct pipe_buf_operations buffer_pipe_buf_ops = {
+	.can_merge		= 0,
+	.map			= generic_pipe_buf_map,
+	.unmap			= generic_pipe_buf_unmap,
+	.confirm		= generic_pipe_buf_confirm,
+	.release		= buffer_pipe_buf_release,
+	.steal			= buffer_pipe_buf_steal,
+	.get			= buffer_pipe_buf_get,
+};
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+	struct buffer_ref *ref =
+		(struct buffer_ref *)spd->partial[i].private;
+
+	if (--ref->ref)
+		return;
+
+	ring_buffer_free_read_page(ref->buffer, ref->page);
+	kfree(ref);
+	spd->partial[i].private = 0;
+}
+
+static ssize_t
+tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+			    struct pipe_inode_info *pipe, size_t len,
+			    unsigned int flags)
+{
+	struct ftrace_buffer_info *info = file->private_data;
+	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages		= pages,
+		.partial	= partial,
+		.flags		= flags,
+		.ops		= &buffer_pipe_buf_ops,
+		.spd_release	= buffer_spd_release,
+	};
+	struct buffer_ref *ref;
+	int size, i;
+	size_t ret;
+
+	/*
+	 * We can't seek on a buffer input
+	 */
+	if (unlikely(*ppos))
+		return -ESPIPE;
+
+
+	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
+		struct page *page;
+		int r;
+
+		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!ref)
+			break;
+
+		ref->buffer = info->tr->buffer;
+		ref->page = ring_buffer_alloc_read_page(ref->buffer);
+		if (!ref->page) {
+			kfree(ref);
+			break;
+		}
+
+		r = ring_buffer_read_page(ref->buffer, &ref->page,
+					  len, info->cpu, 0);
+		if (r < 0) {
+			ring_buffer_free_read_page(ref->buffer,
+						   ref->page);
+			kfree(ref);
+			break;
+		}
+
+		/*
+		 * zero out any left over data, this is going to
+		 * user land.
+		 */
+		size = ring_buffer_page_len(ref->page);
+		if (size < PAGE_SIZE)
+			memset(ref->page + size, 0, PAGE_SIZE - size);
+
+		page = virt_to_page(ref->page);
+
+		spd.pages[i] = page;
+		spd.partial[i].len = PAGE_SIZE;
+		spd.partial[i].offset = 0;
+		spd.partial[i].private = (unsigned long)ref;
+		spd.nr_pages++;
+	}
+
+	spd.nr_pages = i;
+
+	/* did we read anything? */
+	if (!spd.nr_pages) {
+		if (flags & SPLICE_F_NONBLOCK)
+			ret = -EAGAIN;
+		else
+			ret = 0;
+		/* TODO: block */
+		return ret;
+	}
+
+	ret = splice_to_pipe(pipe, &spd);
+
+	return ret;
+}
+
+static const struct file_operations tracing_buffers_fops = {
+	.open		= tracing_buffers_open,
+	.read		= tracing_buffers_read,
+	.release	= tracing_buffers_release,
+	.splice_read	= tracing_buffers_splice_read,
+	.llseek		= no_llseek,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3399,6 +3638,7 @@ static __init void create_trace_options_dir(void)
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
+	struct dentry *buffers;
 	struct dentry *entry;
 	int cpu;
 
@@ -3471,6 +3711,26 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'trace_marker' entry\n");
 
+	buffers = debugfs_create_dir("binary_buffers", d_tracer);
+
+	if (!buffers)
+		pr_warning("Could not create buffers directory\n");
+	else {
+		int cpu;
+		char buf[64];
+
+		for_each_tracing_cpu(cpu) {
+			sprintf(buf, "%d", cpu);
+
+			entry = debugfs_create_file(buf, 0444, buffers,
+						    (void *)(long)cpu,
+						    &tracing_buffers_fops);
+			if (!entry)
+				pr_warning("Could not create debugfs buffers "
+					   "'%s' entry\n", buf);
+		}
+	}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 				    &ftrace_update_tot_cnt,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e606633..561bb5c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -217,6 +217,7 @@ enum trace_flag_type {
  */
 struct trace_array_cpu {
 	atomic_t		disabled;
+	void			*buffer_page;	/* ring buffer spare */
 
 	/* these fields get copied into max-trace: */
 	unsigned long		trace_idx;
-- 
cgit v1.1


From 1c21f14ec48a2256fb03682b24dddd23eacdc96f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 4 Mar 2009 13:51:13 +0100
Subject: lockdep: fix incorrect state name

In the recent mark_lock_irq() rework a bug snuck in that would report the
state of write locks causing irq inversion under a read lock as a read
lock.

Fix this by masking the read bit of the state when validating write
dependencies.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1236172646.5330.7450.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 022d2ed..ef6584f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2015,7 +2015,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
 			     enum lock_usage_bit bit, const char *name);
 
 static int
-mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit new_bit)
 {
 	int excl_bit = exclusive_bit(new_bit);
 	int read = new_bit & 1;
@@ -2043,7 +2044,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 	 * states.
 	 */
 	if ((!read || !dir || STRICT_READ_CHECKS) &&
-			!usage(curr, this, excl_bit, state_name(new_bit)))
+			!usage(curr, this, excl_bit, state_name(new_bit & ~1)))
 		return 0;
 
 	/*
-- 
cgit v1.1


From 26575e28df5eb2050c02369843faba38cecb4d8c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 4 Mar 2009 14:53:24 +0100
Subject: lockdep: remove extra "irq" string

Impact: clarify lockdep printk text

print_irq_inversion_bug() gets handed state strings of the form

  "HARDIRQ", "SOFTIRQ", "RECLAIM_FS"

and appends "-irq-{un,}safe" to them, which is either redudant for *IRQ or
confusing in the RECLAIM_FS case.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1236175192.5330.7585.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ef6584f..02014f7 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1900,9 +1900,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 		curr->comm, task_pid_nr(curr));
 	print_lock(this);
 	if (forwards)
-		printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+		printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
 	else
-		printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
+		printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
 	print_lock_name(other);
 	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
 
-- 
cgit v1.1


From efed792d6738964f399a508ef9e831cd60fa4657 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 4 Mar 2009 12:32:55 +0100
Subject: tracing: add lockdep tracepoints for lock acquire/release

Augment the traces with lock names when lockdep is available:

 1)               |  down_read_trylock() {
 1)               |    _spin_lock_irqsave() {
 1)               |      /* lock_acquire: &sem->wait_lock */
 1)   4.201 us    |    }
 1)               |    _spin_unlock_irqrestore() {
 1)               |      /* lock_release: &sem->wait_lock */
 1)   3.523 us    |    }
 1)               |  /* lock_acquire: try read &mm->mmap_sem */
 1) + 13.386 us   |  }
 1)   1.635 us    |  find_vma();
 1)               |  handle_mm_fault() {
 1)               |    __do_fault() {
 1)               |      filemap_fault() {
 1)               |        find_lock_page() {
 1)               |          find_get_page() {
 1)               |            /* lock_acquire: read rcu_read_lock */
 1)               |            /* lock_release: rcu_read_lock */
 1)   5.697 us    |          }
 1)   8.158 us    |        }
 1) + 11.079 us   |      }
 1)               |      _spin_lock() {
 1)               |        /* lock_acquire: __pte_lockptr(page) */
 1)   3.949 us    |      }
 1)   1.460 us    |      page_add_file_rmap();
 1)               |      _spin_unlock() {
 1)               |        /* lock_release: __pte_lockptr(page) */
 1)   3.115 us    |      }
 1)               |      unlock_page() {
 1)   1.421 us    |        page_waitqueue();
 1)   1.220 us    |        __wake_up_bit();
 1)   6.519 us    |      }
 1) + 34.328 us   |    }
 1) + 37.452 us   |  }
 1)               |  up_read() {
 1)               |  /* lock_release: &mm->mmap_sem */
 1)               |    _spin_lock_irqsave() {
 1)               |      /* lock_acquire: &sem->wait_lock */
 1)   3.865 us    |    }
 1)               |    _spin_unlock_irqrestore() {
 1)               |      /* lock_release: &sem->wait_lock */
 1)   8.562 us    |    }
 1) + 17.370 us   |  }

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?T=F6r=F6k?= Edwin <edwintorok@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1236166375.5330.7209.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c                    | 17 +++++++++++++++++
 kernel/trace/trace.c                | 14 ++++++++------
 kernel/trace/trace_events_stage_3.h |  4 ++--
 3 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 02014f7..cb70c1d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
+#include <trace/lockdep.h>
 
 #include <asm/sections.h>
 
@@ -2913,6 +2914,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
+DEFINE_TRACE(lock_acquire);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2923,6 +2926,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
 	unsigned long flags;
 
+	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2937,11 +2942,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
+DEFINE_TRACE(lock_release);
+
 void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
 {
 	unsigned long flags;
 
+	trace_lock_release(lock, nested, ip);
+
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -3090,10 +3099,14 @@ found_it:
 	lock->ip = ip;
 }
 
+DEFINE_TRACE(lock_contended);
+
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
+	trace_lock_contended(lock, ip);
+
 	if (unlikely(!lock_stat))
 		return;
 
@@ -3109,10 +3122,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
+DEFINE_TRACE(lock_acquired);
+
 void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
+	trace_lock_acquired(lock, ip);
+
 	if (unlikely(!lock_stat))
 		return;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 12539f7..c8abbb0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -623,7 +623,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
 static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
 static int cmdline_idx;
-static DEFINE_SPINLOCK(trace_cmdline_lock);
+static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
 
 /* temporary disable recording */
 static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -735,7 +735,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	 * nor do we want to disable interrupts,
 	 * so if we miss here, then better luck next time.
 	 */
-	if (!spin_trylock(&trace_cmdline_lock))
+	if (!__raw_spin_trylock(&trace_cmdline_lock))
 		return;
 
 	idx = map_pid_to_cmdline[tsk->pid];
@@ -753,7 +753,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
 
 	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
 
-	spin_unlock(&trace_cmdline_lock);
+	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
 char *trace_find_cmdline(int pid)
@@ -3751,7 +3751,7 @@ static __init int tracer_init_debugfs(void)
 
 int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	static DEFINE_SPINLOCK(trace_buf_lock);
+	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -3773,7 +3773,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out;
 
 	pause_graph_tracing();
-	spin_lock_irqsave(&trace_buf_lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&trace_buf_lock);
 	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	len = min(len, TRACE_BUF_SIZE-1);
@@ -3792,7 +3793,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+	__raw_spin_unlock(&trace_buf_lock);
+	raw_local_irq_restore(irq_flags);
 	unpause_graph_tracing();
  out:
 	preempt_enable_notrace();
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 041789f..2c8d76c 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -5,7 +5,7 @@
  *
  * static void ftrace_event_<call>(proto)
  * {
- * 	event_trace_printk(_RET_IP_, "(<call>) " <fmt>);
+ * 	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
  * static int ftrace_reg_event_<call>(void)
@@ -112,7 +112,7 @@
 #define _TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
 {									\
-	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
+	event_trace_printk(_RET_IP_, #call ": " fmt);			\
 }									\
 									\
 static int ftrace_reg_event_##call(void)				\
-- 
cgit v1.1


From e543ad76914abec1acf6631604a4154cd7a2ca6b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 18:20:36 -0500
Subject: tracing: add cpu_file intialization for ftrace_dump

Impact: fix to ftrace_dump output corruption

The commit: b04cc6b1f6398b0e0b60d37e27ce51b4899672ec
  tracing/core: introduce per cpu tracing files

added a new field to the iterator called cpu_file. This was a handle
to differentiate between the per cpu trace output files and the
all cpu "trace" file. The all cpu "trace" file required setting this
to TRACE_PIPE_ALL_CPU.

The problem is that the ftrace_dump sets up its own iterator but was
not updated to handle this change. The result was only CPU 0 printing
out on crash and a lot of "<0>"'s also being printed.

Reported-by: Thomas Gleixner <tglx@linuxtronix.de>
Tested-by: Darren Hart <dvhtc@us.ibm.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c8abbb0..ab5cbca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3918,8 +3918,10 @@ void ftrace_dump(void)
 
 	printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
+	/* Simulate the iterator */
 	iter.tr = &global_trace;
 	iter.trace = current_trace;
+	iter.cpu_file = TRACE_PIPE_ALL_CPU;
 
 	/*
 	 * We need to stop all tracing on all CPUS to read the
-- 
cgit v1.1


From 4f3640f8a358f2183a8c966f299eeb55ca523e06 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 23:52:42 -0500
Subject: ring-buffer: fix timestamp in partial ring_buffer_page_read

If a partial ring_buffer_page_read happens, then some of the
incremental timestamps may be lost. This patch writes the
recent timestamp into the page that is passed back to the caller.

A partial ring_buffer_page_read is where the full page would not
be written back to the user, and instead, just part of the page
is copied to the user. A full page would be a page swap with the
ring buffer and the timestamps would be correct.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f2a163d..f747364 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2461,6 +2461,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	unsigned long flags;
 	unsigned int commit;
 	unsigned int read;
+	u64 save_timestamp;
 	int ret = -1;
 
 	/*
@@ -2515,6 +2516,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		if (len < size)
 			goto out;
 
+		/* save the current timestamp, since the user will need it */
+		save_timestamp = cpu_buffer->read_stamp;
+
 		/* Need to copy one event at a time */
 		do {
 			memcpy(bpage->data + pos, rpage->data + rpos, size);
@@ -2531,7 +2535,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 
 		/* update bpage */
 		local_set(&bpage->commit, pos);
-		bpage->time_stamp = rpage->time_stamp;
+		bpage->time_stamp = save_timestamp;
 
 		/* we copied everything to the beginning */
 		read = 0;
-- 
cgit v1.1


From 2dc5d12b1f43134e9bc5037f69f4739cfdfab93e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 19:10:05 -0500
Subject: tracing: do not return EFAULT if read copied anything

Impact: fix trace read to conform to standards

Andrew Morton, Theodore Tso and H. Peter Anvin brought to my attention
that a userspace read should not return -EFAULT if it succeeded in
copying anything. It should only return -EFAULT if it failed to copy
at all.

This patch modifies the check of copy_from_user and updates the return
code appropriately.

I also used H. Peter Anvin's short cut rule to just test ret == count.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ab5cbca..57155dc53 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -346,6 +346,9 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	int len;
 	int ret;
 
+	if (!cnt)
+		return 0;
+
 	if (s->len <= s->readpos)
 		return -EBUSY;
 
@@ -353,9 +356,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	if (cnt > len)
 		cnt = len;
 	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
-	if (ret)
+	if (ret == cnt)
 		return -EFAULT;
 
+	cnt -= ret;
+
 	s->readpos += len;
 	return cnt;
 }
@@ -3049,6 +3054,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	ssize_t ret;
 	size_t size;
 
+	if (!count)
+		return 0;
+
 	/* Do we have previous read data to read? */
 	if (info->read < PAGE_SIZE)
 		goto read;
@@ -3073,8 +3081,10 @@ read:
 		size = count;
 
 	ret = copy_to_user(ubuf, info->spare + info->read, size);
-	if (ret)
+	if (ret == size)
 		return -EFAULT;
+	size -= ret;
+
 	*ppos += size;
 	info->read += size;
 
-- 
cgit v1.1


From e74da5235cec6cb71eb338c987f876ecc793138b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 20:31:11 -0500
Subject: tracing: fix seq read from trace files

The buffer used by trace_seq was updated incorrectly. Instead
of consuming what was actually read, it consumed the rest of the
buffer on reads.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 57155dc53..2e53e6f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -361,7 +361,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 
 	cnt -= ret;
 
-	s->readpos += len;
+	s->readpos += cnt;
 	return cnt;
 }
 
@@ -380,7 +380,7 @@ ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 	if (!ret)
 		return -EFAULT;
 
-	s->readpos += len;
+	s->readpos += cnt;
 	return cnt;
 }
 
-- 
cgit v1.1


From c032ef64d680717e4e8ce3da65da6419a35f8a2c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 20:34:24 -0500
Subject: tracing: add latency output format option

With the removal of the latency_trace file, we lost the ability
to see some of the finer details in a trace. Like the state of
interrupts enabled, the preempt count, need resched, and if we
are in an interrupt handler, softirq handler or not.

This patch simply creates an option to bring back the old format.
This also removes the warning about an unused variable that held
the latency_trace file operations.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 24 ++----------------------
 kernel/trace/trace.h |  3 ++-
 2 files changed, 4 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2e53e6f..55fcbb5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -299,6 +299,7 @@ static const char *trace_options[] = {
 	"sym-userobj",
 	"printk-msg-only",
 	"context-info",
+	"latency-format",
 	NULL
 };
 
@@ -1829,26 +1830,12 @@ static int tracing_open(struct inode *inode, struct file *file)
 	iter = __tracing_open(inode, file);
 	if (IS_ERR(iter))
 		ret = PTR_ERR(iter);
-
-	return ret;
-}
-
-static int tracing_lt_open(struct inode *inode, struct file *file)
-{
-	struct trace_iterator *iter;
-	int ret = 0;
-
-	iter = __tracing_open(inode, file);
-
-	if (IS_ERR(iter))
-		ret = PTR_ERR(iter);
-	else
+	else if (trace_flags & TRACE_ITER_LATENCY_FMT)
 		iter->iter_flags |= TRACE_FILE_LAT_FMT;
 
 	return ret;
 }
 
-
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
@@ -1927,13 +1914,6 @@ static struct file_operations tracing_fops = {
 	.release	= tracing_release,
 };
 
-static struct file_operations tracing_lt_fops = {
-	.open		= tracing_lt_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= tracing_release,
-};
-
 static struct file_operations show_traces_fops = {
 	.open		= show_traces_open,
 	.read		= seq_read,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 561bb5c..12cd119 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -651,7 +651,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_USERSTACKTRACE       = 0x4000,
 	TRACE_ITER_SYM_USEROBJ          = 0x8000,
 	TRACE_ITER_PRINTK_MSGONLY	= 0x10000,
-	TRACE_ITER_CONTEXT_INFO		= 0x20000 /* Print pid/cpu/time */
+	TRACE_ITER_CONTEXT_INFO		= 0x20000, /* Print pid/cpu/time */
+	TRACE_ITER_LATENCY_FMT		= 0x40000,
 };
 
 /*
-- 
cgit v1.1


From 5fd73f862468280d4cbb5ba4321502f911f9f89a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 21:42:04 -0500
Subject: tracing: remove extra latency_trace method from trace structure

Impact: clean up

The trace and latency_trace function pointers are identical for
every tracer but the function tracer. The differences in the function
tracer are trivial (latency output puts paranthesis around parent).

This patch removes the latency_trace pointer and all prints will
now just use the trace output function pointer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/blktrace.c     |  1 -
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace_branch.c |  1 -
 kernel/trace/trace_output.c | 32 --------------------------------
 kernel/trace/trace_output.h |  1 -
 5 files changed, 1 insertion(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e82cb9e..e39679a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1231,7 +1231,6 @@ static struct tracer blk_tracer __read_mostly = {
 static struct trace_event trace_blk_event = {
 	.type	 	= TRACE_BLK,
 	.trace		= blk_trace_event_print,
-	.latency_trace	= blk_trace_event_print,
 	.binary		= blk_trace_event_print_binary,
 };
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55fcbb5..21b89ec 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1485,7 +1485,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 	}
 
 	if (event)
-		return event->latency_trace(iter, sym_flags);
+		return event->trace(iter, sym_flags);
 
 	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
 		goto partial;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c2e68d4..aaa0755 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -159,7 +159,6 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 static struct trace_event trace_branch_event = {
 	.type	 	= TRACE_BRANCH,
 	.trace		= trace_branch_print,
-	.latency_trace	= trace_branch_print,
 };
 
 static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9fc8150..306fef8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -437,8 +437,6 @@ int register_ftrace_event(struct trace_event *event)
 
 	if (event->trace == NULL)
 		event->trace = trace_nop_print;
-	if (event->latency_trace == NULL)
-		event->latency_trace = trace_nop_print;
 	if (event->raw == NULL)
 		event->raw = trace_nop_print;
 	if (event->hex == NULL)
@@ -480,29 +478,6 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
 }
 
 /* TRACE_FN */
-static enum print_line_t trace_fn_latency(struct trace_iterator *iter,
-					  int flags)
-{
-	struct ftrace_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-	if (!trace_seq_puts(s, " ("))
-		goto partial;
-	if (!seq_print_ip_sym(s, field->parent_ip, flags))
-		goto partial;
-	if (!trace_seq_puts(s, ")\n"))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
 static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
@@ -573,7 +548,6 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
 static struct trace_event trace_fn_event = {
 	.type	 	= TRACE_FN,
 	.trace		= trace_fn_trace,
-	.latency_trace	= trace_fn_latency,
 	.raw		= trace_fn_raw,
 	.hex		= trace_fn_hex,
 	.binary		= trace_fn_bin,
@@ -705,7 +679,6 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
 static struct trace_event trace_ctx_event = {
 	.type	 	= TRACE_CTX,
 	.trace		= trace_ctx_print,
-	.latency_trace	= trace_ctx_print,
 	.raw		= trace_ctx_raw,
 	.hex		= trace_ctx_hex,
 	.binary		= trace_ctxwake_bin,
@@ -714,7 +687,6 @@ static struct trace_event trace_ctx_event = {
 static struct trace_event trace_wake_event = {
 	.type	 	= TRACE_WAKE,
 	.trace		= trace_wake_print,
-	.latency_trace	= trace_wake_print,
 	.raw		= trace_wake_raw,
 	.hex		= trace_wake_hex,
 	.binary		= trace_ctxwake_bin,
@@ -770,7 +742,6 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
 static struct trace_event trace_special_event = {
 	.type	 	= TRACE_SPECIAL,
 	.trace		= trace_special_print,
-	.latency_trace	= trace_special_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
 	.binary		= trace_special_bin,
@@ -808,7 +779,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 static struct trace_event trace_stack_event = {
 	.type	 	= TRACE_STACK,
 	.trace		= trace_stack_print,
-	.latency_trace	= trace_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
 	.binary		= trace_special_bin,
@@ -838,7 +808,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 static struct trace_event trace_user_stack_event = {
 	.type	 	= TRACE_USER_STACK,
 	.trace		= trace_user_stack_print,
-	.latency_trace	= trace_user_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
 	.binary		= trace_special_bin,
@@ -883,7 +852,6 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 static struct trace_event trace_print_event = {
 	.type	 	= TRACE_PRINT,
 	.trace		= trace_print_print,
-	.latency_trace	= trace_print_print,
 	.raw		= trace_print_raw,
 };
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 551a25a..8a34d68 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -10,7 +10,6 @@ struct trace_event {
 	struct hlist_node	node;
 	int			type;
 	trace_print_func	trace;
-	trace_print_func	latency_trace;
 	trace_print_func	raw;
 	trace_print_func	hex;
 	trace_print_func	binary;
-- 
cgit v1.1


From 27d48be84477d2f0a2e2ac3738a3971dece631d5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 21:57:29 -0500
Subject: tracing: consolidate print_lat_fmt and print_trace_fmt

Impact: clean up

Both print_lat_fmt and print_trace_fmt do pretty much the same thing
except for one different function call. This patch consolidates the
two functions and adds an if statement to perform the difference.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 39 +++++++--------------------------------
 1 file changed, 7 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21b89ec..d1ef439 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1468,33 +1468,6 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
 }
 
-static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
-	struct trace_event *event;
-	struct trace_entry *entry = iter->ent;
-
-	test_cpu_buff_start(iter);
-
-	event = ftrace_find_event(entry->type);
-
-	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		if (!trace_print_lat_context(iter))
-			goto partial;
-	}
-
-	if (event)
-		return event->trace(iter, sym_flags);
-
-	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
 static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1509,8 +1482,13 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	event = ftrace_find_event(entry->type);
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		if (!trace_print_context(iter))
-			goto partial;
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			if (!trace_print_lat_context(iter))
+				goto partial;
+		} else {
+			if (!trace_print_context(iter))
+				goto partial;
+		}
 	}
 
 	if (event)
@@ -1652,9 +1630,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 	if (trace_flags & TRACE_ITER_RAW)
 		return print_raw_fmt(iter);
 
-	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-		return print_lat_fmt(iter);
-
 	return print_trace_fmt(iter);
 }
 
-- 
cgit v1.1


From e9d25fe6eaa2c720bb3ea661b660e58d54fa38bf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 22:15:30 -0500
Subject: tracing: have latency tracers set the latency format

The latency tracers (irqsoff, preemptoff, preemptirqsoff, and wakeup)
are pretty useless with the default output format. This patch makes them
automatically enable the latency format when they are selected. They
also record the state of the latency option, and if it was not enabled
when selected, they disable it on reset.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_irqsoff.c      | 8 ++++++++
 kernel/trace/trace_sched_wakeup.c | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9e5ebd8..b923d13 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,6 +32,8 @@ enum {
 
 static int trace_type __read_mostly;
 
+static int save_lat_flag;
+
 #ifdef CONFIG_PREEMPT_TRACER
 static inline int
 preempt_trace(void)
@@ -370,6 +372,9 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
+	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+	trace_flags |= TRACE_ITER_LATENCY_FMT;
+
 	tracing_max_latency = 0;
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visible */
@@ -380,6 +385,9 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
 	stop_irqsoff_tracer(tr);
+
+	if (!save_lat_flag)
+		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
 }
 
 static void irqsoff_tracer_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index db55f7a..3c5ad6b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -32,6 +32,8 @@ static raw_spinlock_t wakeup_lock =
 
 static void __wakeup_reset(struct trace_array *tr);
 
+static int save_lat_flag;
+
 #ifdef CONFIG_FUNCTION_TRACER
 /*
  * irqsoff uses its own tracer function to keep the overhead down:
@@ -324,6 +326,9 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 
 static int __wakeup_tracer_init(struct trace_array *tr)
 {
+	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+	trace_flags |= TRACE_ITER_LATENCY_FMT;
+
 	tracing_max_latency = 0;
 	wakeup_trace = tr;
 	start_wakeup_tracer(tr);
@@ -347,6 +352,9 @@ static void wakeup_tracer_reset(struct trace_array *tr)
 	stop_wakeup_tracer(tr);
 	/* make sure we put back any tasks we are tracing */
 	wakeup_reset(tr);
+
+	if (!save_lat_flag)
+		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
 }
 
 static void wakeup_tracer_start(struct trace_array *tr)
-- 
cgit v1.1


From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 10:24:48 +0100
Subject: tracing: rename ftrace_printk() => trace_printk()

Impact: cleanup

Use a more generic name - this also allows the prototype to move
to kernel.h and be generally available to kernel developers who
want to do some quick tracing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 8 ++++----
 kernel/trace/trace.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d1ef439..c0e9c12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -48,7 +48,7 @@ unsigned long __read_mostly	tracing_thresh;
  * We need to change this state when a selftest is running.
  * A selftest will lurk into the ring-buffer to count the
  * entries inserted during the selftest although some concurrent
- * insertions into the ring-buffer such as ftrace_printk could occurred
+ * insertions into the ring-buffer such as trace_printk could occurred
  * at the same time, giving false positive or negative results.
  */
 static bool __read_mostly tracing_selftest_running;
@@ -291,7 +291,7 @@ static const char *trace_options[] = {
 	"block",
 	"stacktrace",
 	"sched-tree",
-	"ftrace_printk",
+	"trace_printk",
 	"ftrace_preempt",
 	"branch",
 	"annotate",
@@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(trace_vprintk);
 
-int __ftrace_printk(unsigned long ip, const char *fmt, ...)
+int __trace_printk(unsigned long ip, const char *fmt, ...)
 {
 	int ret;
 	va_list ap;
@@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	va_end(ap);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__ftrace_printk);
+EXPORT_SYMBOL_GPL(__trace_printk);
 
 int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 12cd119..8beff03 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -115,7 +115,7 @@ struct userstack_entry {
 };
 
 /*
- * ftrace_printk entry:
+ * trace_printk entry:
  */
 struct print_entry {
 	struct trace_entry	ent;
-- 
cgit v1.1


From 03d78913f01e8f6599823f00357ed17b32747d3d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 5 Mar 2009 02:29:05 -0800
Subject: lockdep: remove duplicate CONFIG_DEBUG_LOCKDEP definitions

Impact: cleanup

The atomic debug modifiers are already defined in
kernel/lockdep_internals.h.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <alpine.DEB.2.00.0903050222160.30401@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 02014f7..9a1e2bc 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks;
 atomic_t nr_find_usage_forwards_recursions;
 atomic_t nr_find_usage_backwards_checks;
 atomic_t nr_find_usage_backwards_recursions;
-# define debug_atomic_inc(ptr)		atomic_inc(ptr)
-# define debug_atomic_dec(ptr)		atomic_dec(ptr)
-# define debug_atomic_read(ptr)		atomic_read(ptr)
-#else
-# define debug_atomic_inc(ptr)		do { } while (0)
-# define debug_atomic_dec(ptr)		do { } while (0)
-# define debug_atomic_read(ptr)		0
 #endif
 
 /*
-- 
cgit v1.1


From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Mar 2009 01:49:22 +0100
Subject: tracing/function-graph-tracer: use the more lightweight local clock

Impact: decrease hangs risks with the graph tracer on slow systems

Since the function graph tracer can spend too much time on timer
interrupts, it's better now to use the more lightweight local
clock. Anyway, the function graph traces are more reliable on a
per cpu trace.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c009553..e527f2f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void)
 	unsigned long ret;
 
 	ftrace_pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
+	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
 
 	if (unlikely(!ret)) {
-- 
cgit v1.1


From 40ada30f9621fbd831ac2437b9a2a399aad34b00 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 21:19:55 +0100
Subject: tracing: clean up menu

Clean up menu structure, introduce TRACING_SUPPORT switch that signals
whether an architecture supports various instrumentation mechanisms.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 999c6a2..5d733da 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -53,12 +53,22 @@ config TRACING
 	select TRACEPOINTS
 	select NOP_TRACER
 
+#
+# Minimum requirements an architecture has to meet for us to
+# be able to offer generic tracing facilities:
+#
+config TRACING_SUPPORT
+	bool
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on STACKTRACE_SUPPORT
+
+if TRACING_SUPPORT
+
 menu "Tracers"
 
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
 	depends on HAVE_FUNCTION_TRACER
-	depends on DEBUG_KERNEL
 	select FRAME_POINTER
 	select KALLSYMS
 	select TRACING
@@ -91,7 +101,6 @@ config IRQSOFF_TRACER
 	default n
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
-	depends on DEBUG_KERNEL
 	select TRACE_IRQFLAGS
 	select TRACING
 	select TRACER_MAX_TRACE
@@ -114,7 +123,6 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
-	depends on DEBUG_KERNEL
 	select TRACING
 	select TRACER_MAX_TRACE
 	help
@@ -142,7 +150,6 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
@@ -152,7 +159,6 @@ config SCHED_TRACER
 
 config CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select MARKERS
 	help
@@ -161,7 +167,6 @@ config CONTEXT_SWITCH_TRACER
 
 config EVENT_TRACER
 	bool "Trace various events in the kernel"
-	depends on DEBUG_KERNEL
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
@@ -170,7 +175,6 @@ config EVENT_TRACER
 
 config BOOT_TRACER
 	bool "Trace boot initcalls"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	help
@@ -188,7 +192,6 @@ config BOOT_TRACER
 
 config TRACE_BRANCH_PROFILING
 	bool "Trace likely/unlikely profiler"
-	depends on DEBUG_KERNEL
 	select TRACING
 	help
 	  This tracer profiles all the the likely and unlikely macros
@@ -241,7 +244,6 @@ config BRANCH_TRACER
 
 config POWER_TRACER
 	bool "Trace power consumption behavior"
-	depends on DEBUG_KERNEL
 	depends on X86
 	select TRACING
 	help
@@ -253,7 +255,6 @@ config POWER_TRACER
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
-	depends on DEBUG_KERNEL
 	select FUNCTION_TRACER
 	select STACKTRACE
 	select KALLSYMS
@@ -343,7 +344,6 @@ config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
 	depends on HAVE_DYNAMIC_FTRACE
-	depends on DEBUG_KERNEL
 	default y
 	help
          This option will modify all the calls to ftrace dynamically
@@ -369,7 +369,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING && DEBUG_KERNEL
+	depends on TRACING
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -379,7 +379,7 @@ config FTRACE_STARTUP_TEST
 
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
-	depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
+	depends on HAVE_MMIOTRACE_SUPPORT && PCI
 	select TRACING
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
@@ -401,3 +401,6 @@ config MMIOTRACE_TEST
 	  Say N, unless you absolutely know what you are doing.
 
 endmenu
+
+endif # TRACING_SUPPORT
+
-- 
cgit v1.1


From 5e2336a0d47c9661a40cc5ef85135ce1406af6e8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Mar 2009 21:44:55 -0500
Subject: tracing: make all file_operations const

Impact: cleanup

All file_operations structures should be constant. No one is going to
change them.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c        | 10 +++++-----
 kernel/trace/ring_buffer.c   |  2 +-
 kernel/trace/trace.c         | 24 ++++++++++++------------
 kernel/trace/trace_sysprof.c |  2 +-
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5a3a06b..d7a06a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1869,21 +1869,21 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
 	return ftrace_regex_release(inode, file, 0);
 }
 
-static struct file_operations ftrace_avail_fops = {
+static const struct file_operations ftrace_avail_fops = {
 	.open = ftrace_avail_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.release = ftrace_avail_release,
 };
 
-static struct file_operations ftrace_failures_fops = {
+static const struct file_operations ftrace_failures_fops = {
 	.open = ftrace_failures_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.release = ftrace_avail_release,
 };
 
-static struct file_operations ftrace_filter_fops = {
+static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = ftrace_regex_read,
 	.write = ftrace_filter_write,
@@ -1891,7 +1891,7 @@ static struct file_operations ftrace_filter_fops = {
 	.release = ftrace_filter_release,
 };
 
-static struct file_operations ftrace_notrace_fops = {
+static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = ftrace_regex_read,
 	.write = ftrace_notrace_write,
@@ -2423,7 +2423,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations ftrace_pid_fops = {
+static const struct file_operations ftrace_pid_fops = {
 	.read = ftrace_pid_read,
 	.write = ftrace_pid_write,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f747364..1788584 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,7 +2606,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations rb_simple_fops = {
+static const struct file_operations rb_simple_fops = {
 	.open		= tracing_open_generic,
 	.read		= rb_simple_read,
 	.write		= rb_simple_write,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0e9c12..e6144ac 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1882,14 +1882,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations tracing_fops = {
+static const struct file_operations tracing_fops = {
 	.open		= tracing_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= tracing_release,
 };
 
-static struct file_operations show_traces_fops = {
+static const struct file_operations show_traces_fops = {
 	.open		= show_traces_open,
 	.read		= seq_read,
 	.release	= seq_release,
@@ -1982,7 +1982,7 @@ err_unlock:
 	return err;
 }
 
-static struct file_operations tracing_cpumask_fops = {
+static const struct file_operations tracing_cpumask_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_cpumask_read,
 	.write		= tracing_cpumask_write,
@@ -2134,7 +2134,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations tracing_iter_fops = {
+static const struct file_operations tracing_iter_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_trace_options_read,
 	.write		= tracing_trace_options_write,
@@ -2167,7 +2167,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
 					readme_msg, strlen(readme_msg));
 }
 
-static struct file_operations tracing_readme_fops = {
+static const struct file_operations tracing_readme_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_readme_read,
 };
@@ -2927,25 +2927,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations tracing_max_lat_fops = {
+static const struct file_operations tracing_max_lat_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_max_lat_read,
 	.write		= tracing_max_lat_write,
 };
 
-static struct file_operations tracing_ctrl_fops = {
+static const struct file_operations tracing_ctrl_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_ctrl_read,
 	.write		= tracing_ctrl_write,
 };
 
-static struct file_operations set_tracer_fops = {
+static const struct file_operations set_tracer_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_set_trace_read,
 	.write		= tracing_set_trace_write,
 };
 
-static struct file_operations tracing_pipe_fops = {
+static const struct file_operations tracing_pipe_fops = {
 	.open		= tracing_open_pipe,
 	.poll		= tracing_poll_pipe,
 	.read		= tracing_read_pipe,
@@ -2953,13 +2953,13 @@ static struct file_operations tracing_pipe_fops = {
 	.release	= tracing_release_pipe,
 };
 
-static struct file_operations tracing_entries_fops = {
+static const struct file_operations tracing_entries_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_entries_read,
 	.write		= tracing_entries_write,
 };
 
-static struct file_operations tracing_mark_fops = {
+static const struct file_operations tracing_mark_fops = {
 	.open		= tracing_open_generic,
 	.write		= tracing_mark_write,
 };
@@ -3240,7 +3240,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 	return r;
 }
 
-static struct file_operations tracing_dyn_info_fops = {
+static const struct file_operations tracing_dyn_info_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_read_dyn_info,
 };
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index c771af4..91fd19c 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -314,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations sysprof_sample_fops = {
+static const struct file_operations sysprof_sample_fops = {
 	.read		= sysprof_sample_read,
 	.write		= sysprof_sample_write,
 };
-- 
cgit v1.1


From 33b0c229e3abeae00493ed1d6f0b07191977a0a2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Mar 2009 11:45:43 -0500
Subject: tracing: move print of event format to separate file

Impact: clean up

Move the macro that creates the event format file to a separate header.
This will allow the default ftrace events to use this same macro
to create the formats to read those events.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_2.h | 53 +----------------------------------
 kernel/trace/trace_format.h         | 55 +++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 52 deletions(-)
 create mode 100644 kernel/trace/trace_format.h

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index b1cebba..d24a97e 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -75,56 +75,5 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 
 #include <trace/trace_event_types.h>
 
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- * 	struct ftrace_raw_##call field;
- * 	int ret;
- *
- * 	ret = trace_seq_printf(s, #type " " #item ";"
- * 			       " size:%d; offset:%d;\n",
- * 			       sizeof(field.type),
- * 			       offsetof(struct ftrace_raw_##call,
- * 					item));
- *
- * }
- */
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
-	if (!ret)							\
-		return 0;
-
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
-	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
-	if (!ret)							\
-		return 0;
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-int									\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
-									\
-	return ret;							\
-}
-
+#include "trace_format.h"
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
new file mode 100644
index 0000000..53a6b13
--- /dev/null
+++ b/kernel/trace/trace_format.h
@@ -0,0 +1,55 @@
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " size:%d; offset:%d;\n",
+ *			       sizeof(field.type),
+ *			       offsetof(struct ftrace_raw_##call,
+ *					item));
+ *
+ * }
+ */
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
-- 
cgit v1.1


From 770cb24345c0f6e0d47bd2b94aa6d67bea6f8b54 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Mar 2009 21:35:29 -0500
Subject: tracing: add format files for ftrace default entries

Impact: allow user apps to read binary format of basic ftrace entries

Currently, only defined raw events export their formats so a binary
reader can parse them. There's no reason that the default ftrace entries
can't export their formats.

This patch adds a subsystem called "ftrace" in the events directory
that includes the ftrace entries for basic ftrace recorded items.

These only have three files in the events directory:

 type             : printf
 available_types  : printf
 format           : format for the event entry

For example:

 # cat /debug/tracing/events/ftrace/wakeup/format
name: wakeup
ID: 3
format:
        field:unsigned char type;       offset:0;       size:1;
        field:unsigned char flags;      offset:1;       size:1;
        field:unsigned char preempt_count;      offset:2;       size:1;
        field:int pid;  offset:4;       size:4;
        field:int tgid; offset:8;       size:4;

        field:unsigned int prev_pid;    offset:12;      size:4;
        field:unsigned char prev_prio;  offset:16;      size:1;
        field:unsigned char prev_state; offset:17;      size:1;
        field:unsigned int next_pid;    offset:20;      size:4;
        field:unsigned char next_prio;  offset:24;      size:1;
        field:unsigned char next_state; offset:25;      size:1;
        field:unsigned int next_cpu;    offset:28;      size:4;

print fmt: "%u:%u:%u  ==+ %u:%u:%u [%03u]"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Makefile            |   1 +
 kernel/trace/trace_event_types.h | 165 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c      |  12 +--
 kernel/trace/trace_export.c      |  81 +++++++++++++++++++
 kernel/trace/trace_format.h      |   2 +-
 5 files changed, 255 insertions(+), 6 deletions(-)
 create mode 100644 kernel/trace/trace_event_types.h
 create mode 100644 kernel/trace/trace_export.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c931fe0..f44736c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
+obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
new file mode 100644
index 0000000..fb4eba1
--- /dev/null
+++ b/kernel/trace/trace_event_types.h
@@ -0,0 +1,165 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM	ftrace
+
+/*
+ * We cheat and use the proto type field as the ID
+ * and args as the entry type (minus 'struct')
+ */
+TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(unsigned long, parent_ip, parent_ip)
+	),
+	TPRAWFMT(" %lx <-- %lx")
+);
+
+TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
+		   ftrace_graph_ent_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, graph_ent.func, func)
+		TRACE_FIELD(int, graph_ent.depth, depth)
+	),
+	TPRAWFMT("--> %lx (%d)")
+);
+
+TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
+		   ftrace_graph_ret_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ret.func, func)
+		TRACE_FIELD(int, ret.depth, depth)
+	),
+	TPRAWFMT("<-- %lx (%d)")
+);
+
+TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned int, prev_pid, prev_pid)
+		TRACE_FIELD(unsigned char, prev_prio, prev_prio)
+		TRACE_FIELD(unsigned char, prev_state, prev_state)
+		TRACE_FIELD(unsigned int, next_pid, next_pid)
+		TRACE_FIELD(unsigned char, next_prio, next_prio)
+		TRACE_FIELD(unsigned char, next_state, next_state)
+		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
+	),
+	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+);
+
+TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned int, prev_pid, prev_pid)
+		TRACE_FIELD(unsigned char, prev_prio, prev_prio)
+		TRACE_FIELD(unsigned char, prev_state, prev_state)
+		TRACE_FIELD(unsigned int, next_pid, next_pid)
+		TRACE_FIELD(unsigned char, next_prio, next_prio)
+		TRACE_FIELD(unsigned char, next_state, next_state)
+		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
+	),
+	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+);
+
+TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, arg1, arg1)
+		TRACE_FIELD(unsigned long, arg2, arg2)
+		TRACE_FIELD(unsigned long, arg3, arg3)
+	),
+	TPRAWFMT("(%08lx) (%08lx) (%08lx)")
+);
+
+/*
+ * Stack-trace entry:
+ */
+
+/* #define FTRACE_STACK_ENTRIES   8 */
+
+TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, caller[0], stack0)
+		TRACE_FIELD(unsigned long, caller[1], stack1)
+		TRACE_FIELD(unsigned long, caller[2], stack2)
+		TRACE_FIELD(unsigned long, caller[3], stack3)
+		TRACE_FIELD(unsigned long, caller[4], stack4)
+		TRACE_FIELD(unsigned long, caller[5], stack5)
+		TRACE_FIELD(unsigned long, caller[6], stack6)
+		TRACE_FIELD(unsigned long, caller[7], stack7)
+	),
+	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
+);
+
+TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, caller[0], stack0)
+		TRACE_FIELD(unsigned long, caller[1], stack1)
+		TRACE_FIELD(unsigned long, caller[2], stack2)
+		TRACE_FIELD(unsigned long, caller[3], stack3)
+		TRACE_FIELD(unsigned long, caller[4], stack4)
+		TRACE_FIELD(unsigned long, caller[5], stack5)
+		TRACE_FIELD(unsigned long, caller[6], stack6)
+		TRACE_FIELD(unsigned long, caller[7], stack7)
+	),
+	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
+);
+
+TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(unsigned int, depth, depth)
+		TRACE_FIELD_ZERO_CHAR(buf)
+	),
+	TPRAWFMT("%08lx (%d) %s")
+);
+
+TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned int, line, line)
+		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
+		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+		TRACE_FIELD(char, correct, correct)
+	),
+	TPRAWFMT("%u:%s:%s (%u)")
+);
+
+TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(u64, from, from)
+		TRACE_FIELD(u64, to, to)
+	),
+	TPRAWFMT("from: %llx to: %llx")
+);
+
+TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(ktime_t, state_data.stamp, stamp)
+		TRACE_FIELD(ktime_t, state_data.end, end)
+		TRACE_FIELD(int, state_data.type, type)
+		TRACE_FIELD(int, state_data.state, state)
+	),
+	TPRAWFMT("%llx->%llx type:%u state:%u")
+);
+
+TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
+		TRACE_FIELD(unsigned long, call_site, call_site)
+		TRACE_FIELD(const void *, ptr, ptr)
+		TRACE_FIELD(size_t, bytes_req, bytes_req)
+		TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
+		TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
+		TRACE_FIELD(int, node, node)
+	),
+	TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+		 " flags:%x node:%d")
+);
+
+TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
+		TRACE_FIELD(unsigned long, call_site, call_site)
+		TRACE_FIELD(const void *, ptr, ptr)
+	),
+	TPRAWFMT("type:%u call_site:%lx ptr:%p")
+);
+
+#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 210e71f..4488d90 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -656,11 +656,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return -1;
 	}
 
-	entry = debugfs_create_file("enable", 0644, call->dir, call,
-				    &ftrace_enable_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/enable' entry\n", call->name);
+	if (call->regfunc) {
+		entry = debugfs_create_file("enable", 0644, call->dir, call,
+					    &ftrace_enable_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs "
+				   "'%s/enable' entry\n", call->name);
+	}
 
 	/* Only let type be writable, if we can change it */
 	entry = debugfs_create_file("type",
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 0000000..0fb7be7
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,81 @@
+/*
+ * trace_export.c - export basic ftrace utilities to user space
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/stringify.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+
+#include "trace_format.h"
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)				\
+	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"	\
+			       "offset:%lu;\tsize:0;\n",	\
+			       offsetof(typeof(field), item));	\
+	if (!ret)						\
+		return 0;
+
+
+#undef TPRAWFMT
+#define TPRAWFMT(args...) args
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct args field;						\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
+#include "trace_event_types.h"
+
+#undef TRACE_ZERO_CHAR
+#define TRACE_ZERO_CHAR(arg)
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TPCMD
+#define TPCMD(cmd...)	cmd
+
+#undef TRACE_ENTRY
+#define TRACE_ENTRY	entry
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	cmd;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.id			= proto,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.show_format		= ftrace_format_##call,			\
+}
+#include "trace_event_types.h"
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
index 53a6b13..03f9a4c 100644
--- a/kernel/trace/trace_format.h
+++ b/kernel/trace/trace_format.h
@@ -40,7 +40,7 @@
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-int									\
+static int								\
 ftrace_format_##call(struct trace_seq *s)				\
 {									\
 	struct ftrace_raw_##call field;					\
-- 
cgit v1.1


From 422d3c7a577b15e1384c9d4e72a9540896b685fa Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 6 Mar 2009 10:40:53 +0900
Subject: tracing: current tip/master can't enable ftrace

After commit 40ada30f9621fbd831ac2437b9a2a399aad34b00,
"make menuconfig" doesn't display "Tracer" item.

Following modification restores it.
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d733da..058d949 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -61,6 +61,7 @@ config TRACING_SUPPORT
 	bool
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on STACKTRACE_SUPPORT
+	default y
 
 if TRACING_SUPPORT
 
-- 
cgit v1.1


From 10dd3ebe213c31bff14b4dae3c5d32a76b1fad7c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 6 Mar 2009 15:29:04 +0900
Subject: tracing: fix deadlock when setting set_ftrace_pid

Impact: fix deadlock while using set_ftrace_pid

Reproducer:

	# cd /sys/kernel/debug/tracing
	# echo $$ > set_ftrace_pid

	then, console becomes hung.

Details:

when writing set_ftracepid, kernel callstack is following

	ftrace_pid_write()
		mutex_lock(&ftrace_lock);
		ftrace_update_pid_func()
			mutex_lock(&ftrace_lock);
			mutex_unlock(&ftrace_lock);
		mutex_unlock(&ftrace_lock);

then, system always deadlocks when ftrace_pid_write() is called.

In past days, ftrace_pid_write() used ftrace_start_lock, but
commit e6ea44e9b4c12325337cd1c06103cd515a1c02b2 consolidated
ftrace_start_lock to ftrace_lock.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <20090306151155.0778.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d7a06a0..d33d306 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -218,10 +218,8 @@ static void ftrace_update_pid_func(void)
 {
 	ftrace_func_t func;
 
-	mutex_lock(&ftrace_lock);
-
 	if (ftrace_trace_function == ftrace_stub)
-		goto out;
+		return;
 
 	func = ftrace_trace_function;
 
@@ -238,9 +236,6 @@ static void ftrace_update_pid_func(void)
 #else
 	__ftrace_trace_function = func;
 #endif
-
- out:
-	mutex_unlock(&ftrace_lock);
 }
 
 /* set when tracing only a pid */
-- 
cgit v1.1


From 4460fdad85becd569f11501ad5b91814814335ff Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 6 Mar 2009 10:36:38 -0500
Subject: tracing, Text Edit Lock - kprobes architecture independent support

Use the mutual exclusion provided by the text edit lock in the kprobes code. It
allows coherent manipulation of the kernel code by other subsystems.

Changelog:

Move the kernel_text_lock/unlock out of the for loops.
Use text_mutex directly instead of a function.
Remove whitespace modifications.

(note : kprobes_mutex is always taken outside of text_mutex)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
LKML-Reference: <49B14306.2080202@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7ba8cd9..479d4d5 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,6 +43,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/kdebug.h>
+#include <linux/memory.h>
 
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kprobe *p)
 		goto out;
 	}
 
+	mutex_lock(&text_mutex);
 	ret = arch_prepare_kprobe(p);
 	if (ret)
-		goto out;
+		goto out_unlock_text;
 
 	INIT_HLIST_NODE(&p->hlist);
 	hlist_add_head_rcu(&p->hlist,
@@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kprobe *p)
 	if (kprobe_enabled)
 		arch_arm_kprobe(p);
 
+out_unlock_text:
+	mutex_unlock(&text_mutex);
 out:
 	mutex_unlock(&kprobe_mutex);
 
@@ -746,8 +750,11 @@ valid_p:
 		 * enabled and not gone - otherwise, the breakpoint would
 		 * already have been removed. We save on flushing icache.
 		 */
-		if (kprobe_enabled && !kprobe_gone(old_p))
+		if (kprobe_enabled && !kprobe_gone(old_p)) {
+			mutex_lock(&text_mutex);
 			arch_disarm_kprobe(p);
+			mutex_unlock(&text_mutex);
+		}
 		hlist_del_rcu(&old_p->hlist);
 	} else {
 		if (p->break_handler && !kprobe_gone(p))
@@ -1280,12 +1287,14 @@ static void __kprobes enable_all_kprobes(void)
 	if (kprobe_enabled)
 		goto already_enabled;
 
+	mutex_lock(&text_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
 			if (!kprobe_gone(p))
 				arch_arm_kprobe(p);
 	}
+	mutex_unlock(&text_mutex);
 
 	kprobe_enabled = true;
 	printk(KERN_INFO "Kprobes globally enabled\n");
@@ -1310,6 +1319,7 @@ static void __kprobes disable_all_kprobes(void)
 
 	kprobe_enabled = false;
 	printk(KERN_INFO "Kprobes globally disabled\n");
+	mutex_lock(&text_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist) {
@@ -1318,6 +1328,7 @@ static void __kprobes disable_all_kprobes(void)
 		}
 	}
 
+	mutex_unlock(&text_mutex);
 	mutex_unlock(&kprobe_mutex);
 	/* Allow all currently running kprobes to complete */
 	synchronize_sched();
-- 
cgit v1.1


From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 6 Mar 2009 17:21:47 +0100
Subject: tracing: infrastructure for supporting binary record

Impact: save on memory for tracing

Current tracers are typically using a struct(like struct ftrace_entry,
struct ctx_switch_entry, struct special_entr etc...)to record a binary
event. These structs can only record a their own kind of events.
A new kind of tracer need a new struct and a lot of code too handle it.

So we need a generic binary record for events. This infrastructure
is for this purpose.

[fweisbec@gmail.com: rebase against latest -tip, make it safe while sched
tracing as reported by Steven Rostedt]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig         |  6 +++
 kernel/trace/Makefile        |  1 +
 kernel/trace/trace.c         | 56 ++++++++++++++++++++++++++++
 kernel/trace/trace.h         | 12 ++++++
 kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.c  | 75 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 237 insertions(+)
 create mode 100644 kernel/trace/trace_bprintk.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 058d949..ad8d361 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
+config TRACE_BPRINTK
+	bool "Binary printk for tracing"
+	default y
+	depends on TRACING
+	select BINARY_PRINTF
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f44736c..46557ef 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
+obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e6144ac..ff53509 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
+/**
+ * trace_vbprintk - write binary msg to tracing buffer
+ *
+ * Caller must insure @fmt are valid when msg is in tracing buffer.
+ */
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
+{
+	static DEFINE_SPINLOCK(trace_buf_lock);
+	static u32 trace_buf[TRACE_BUF_SIZE];
+
+	struct ring_buffer_event *event;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	struct bprintk_entry *entry;
+	unsigned long flags;
+	int resched;
+	int cpu, len = 0, size, pc;
+
+	if (tracing_disabled || !trace_bprintk_enable)
+		return 0;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(atomic_read(&data->disabled)))
+		goto out;
+
+	spin_lock_irqsave(&trace_buf_lock, flags);
+	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+	if (len > TRACE_BUF_SIZE || len < 0)
+		goto out_unlock;
+
+	size = sizeof(*entry) + sizeof(u32) * len;
+	event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc);
+	if (!event)
+		goto out_unlock;
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->fmt			= fmt;
+
+	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+out_unlock:
+	spin_unlock_irqrestore(&trace_buf_lock, flags);
+
+out:
+	ftrace_preempt_enable(resched);
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8beff03..0f5077f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,6 +20,7 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_PRINT,
+	TRACE_BPRINTK,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -124,6 +125,16 @@ struct print_entry {
 	char			buf[];
 };
 
+struct bprintk_entry {
+	struct trace_entry ent;
+	unsigned long ip;
+	const char *fmt;
+	u32 buf[];
+};
+#ifdef CONFIG_TRACE_BPRINTK
+extern int trace_bprintk_enable;
+#endif
+
 #define TRACE_OLD_SIZE		88
 
 struct trace_field_cont {
@@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
+		IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
new file mode 100644
index 0000000..1f8e532
--- /dev/null
+++ b/kernel/trace/trace_bprintk.c
@@ -0,0 +1,87 @@
+/*
+ * trace binary printk
+ *
+ * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/marker.h>
+#include <linux/uaccess.h>
+
+#include "trace.h"
+
+/* binary printk basic */
+static DEFINE_MUTEX(btrace_mutex);
+static int btrace_metadata_count;
+
+static inline void lock_btrace(void)
+{
+	mutex_lock(&btrace_mutex);
+}
+
+static inline void unlock_btrace(void)
+{
+	mutex_unlock(&btrace_mutex);
+}
+
+static void get_btrace_metadata(void)
+{
+	lock_btrace();
+	btrace_metadata_count++;
+	unlock_btrace();
+}
+
+static void put_btrace_metadata(void)
+{
+	lock_btrace();
+	btrace_metadata_count--;
+	unlock_btrace();
+}
+
+/* events tracer */
+int trace_bprintk_enable;
+
+static void start_bprintk_trace(struct trace_array *tr)
+{
+	get_btrace_metadata();
+	tracing_reset_online_cpus(tr);
+	trace_bprintk_enable = 1;
+}
+
+static void stop_bprintk_trace(struct trace_array *tr)
+{
+	trace_bprintk_enable = 0;
+	tracing_reset_online_cpus(tr);
+	put_btrace_metadata();
+}
+
+static int init_bprintk_trace(struct trace_array *tr)
+{
+	start_bprintk_trace(tr);
+	return 0;
+}
+
+static struct tracer bprintk_trace __read_mostly =
+{
+	.name	     = "events",
+	.init	     = init_bprintk_trace,
+	.reset	     = stop_bprintk_trace,
+	.start	     = start_bprintk_trace,
+	.stop	     = stop_bprintk_trace,
+};
+
+static __init int init_bprintk(void)
+{
+	return register_tracer(&bprintk_trace);
+}
+
+device_initcall(init_bprintk);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 306fef8..4ab7120 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	return len;
 }
 
+static int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+
 /**
  * trace_seq_puts - trace sequence printing of simple string
  * @s: trace sequence descriptor
@@ -855,6 +875,60 @@ static struct trace_event trace_print_event = {
 	.raw		= trace_print_raw,
 };
 
+/* TRACE_BPRINTK */
+static enum print_line_t
+trace_bprintk_print(struct trace_iterator *iter, int flags)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct bprintk_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (!trace_seq_puts(s, ": "))
+		goto partial;
+
+	if (!trace_seq_bprintf(s, field->fmt, field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t
+trace_bprintk_raw(struct trace_iterator *iter, int flags)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct bprintk_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!trace_seq_printf(s, ": %lx : ", field->ip))
+		goto partial;
+
+	if (!trace_seq_bprintf(s, field->fmt, field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_bprintk_event = {
+	.type	 	= TRACE_BPRINTK,
+	.trace		= trace_bprintk_print,
+	.raw		= trace_bprintk_raw,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
 static struct trace_event *events[] __initdata = {
 	&trace_fn_event,
 	&trace_ctx_event,
@@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_stack_event,
 	&trace_user_stack_event,
 	&trace_print_event,
+	&trace_bprintk_event,
 	NULL
 };
 
-- 
cgit v1.1


From 1ba28e02a18cbdbea123836f6c98efb09cbf59ec Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 6 Mar 2009 17:21:48 +0100
Subject: tracing: add trace_bprintk()

Impact: add a generic printk() for tracing, like trace_printk()

trace_bprintk() uses the infrastructure to record events on ring_buffer.

[ fweisbec@gmail.com: ported to latest -tip, made it work if
  !CONFIG_MODULES, never free the format strings from modules
  because we can't keep track of them and conditionnaly create
  the ftrace format strings section (reported by Steven Rostedt) ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c              |  6 +++
 kernel/trace/trace.c         | 15 ++++++++
 kernel/trace/trace_bprintk.c | 87 +++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 98 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 22d7379..2dece10 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2158,6 +2158,12 @@ static noinline struct module *load_module(void __user *umod,
 					&mod->num_tracepoints);
 #endif
 
+#ifdef CONFIG_TRACE_BPRINTK
+	mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings,
+			"__trace_bprintk_fmt", sizeof(char *),
+			&mod->num_trace_bprintk_fmt);
+#endif
+
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff53509..46b3cd7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3848,6 +3848,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(trace_vbprintk);
 
+int __trace_bprintk(unsigned long ip, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	if (!fmt)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_vbprintk(ip, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_bprintk);
+
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
index 1f8e532..f4c245a 100644
--- a/kernel/trace/trace_bprintk.c
+++ b/kernel/trace/trace_bprintk.c
@@ -19,9 +19,21 @@
 
 #include "trace.h"
 
+#ifdef CONFIG_MODULES
+
 /* binary printk basic */
 static DEFINE_MUTEX(btrace_mutex);
-static int btrace_metadata_count;
+/*
+ * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt
+ * which are queued on trace_bprintk_fmt_list.
+ */
+static LIST_HEAD(trace_bprintk_fmt_list);
+
+struct trace_bprintk_fmt {
+	struct list_head list;
+	char fmt[0];
+};
+
 
 static inline void lock_btrace(void)
 {
@@ -33,26 +45,75 @@ static inline void unlock_btrace(void)
 	mutex_unlock(&btrace_mutex);
 }
 
-static void get_btrace_metadata(void)
+
+static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
-	lock_btrace();
-	btrace_metadata_count++;
-	unlock_btrace();
+	struct trace_bprintk_fmt *pos;
+	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
+		if (!strcmp(pos->fmt, fmt))
+			return pos;
+	}
+	return NULL;
 }
 
-static void put_btrace_metadata(void)
+static
+void hold_module_trace_bprintk_format(const char **start, const char **end)
 {
+	const char **iter;
 	lock_btrace();
-	btrace_metadata_count--;
+	for (iter = start; iter < end; iter++) {
+		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
+		if (tb_fmt) {
+			*iter = tb_fmt->fmt;
+			continue;
+		}
+
+		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
+				+ strlen(*iter) + 1, GFP_KERNEL);
+		if (tb_fmt) {
+			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+			strcpy(tb_fmt->fmt, *iter);
+			*iter = tb_fmt->fmt;
+		} else
+			*iter = NULL;
+	}
 	unlock_btrace();
 }
 
+static int module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	struct module *mod = data;
+	if (mod->num_trace_bprintk_fmt) {
+		const char **start = mod->trace_bprintk_fmt_start;
+		const char **end = start + mod->num_trace_bprintk_fmt;
+
+		if (val == MODULE_STATE_COMING)
+			hold_module_trace_bprintk_format(start, end);
+	}
+	return 0;
+}
+
+#else /* !CONFIG_MODULES */
+__init static int
+module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+
+__initdata_or_module static
+struct notifier_block module_trace_bprintk_format_nb = {
+	.notifier_call = module_trace_bprintk_format_notify,
+};
+
 /* events tracer */
 int trace_bprintk_enable;
 
 static void start_bprintk_trace(struct trace_array *tr)
 {
-	get_btrace_metadata();
 	tracing_reset_online_cpus(tr);
 	trace_bprintk_enable = 1;
 }
@@ -61,7 +122,6 @@ static void stop_bprintk_trace(struct trace_array *tr)
 {
 	trace_bprintk_enable = 0;
 	tracing_reset_online_cpus(tr);
-	put_btrace_metadata();
 }
 
 static int init_bprintk_trace(struct trace_array *tr)
@@ -81,7 +141,14 @@ static struct tracer bprintk_trace __read_mostly =
 
 static __init int init_bprintk(void)
 {
-	return register_tracer(&bprintk_trace);
+	int ret = register_module_notifier(&module_trace_bprintk_format_nb);
+	if (ret)
+		return ret;
+
+	ret = register_tracer(&bprintk_trace);
+	if (ret)
+		unregister_module_notifier(&module_trace_bprintk_format_nb);
+	return ret;
 }
 
 device_initcall(init_bprintk);
-- 
cgit v1.1


From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 6 Mar 2009 17:21:49 +0100
Subject: tracing/core: drop the old trace_printk() implementation in favour of
 trace_bprintk()

Impact: faster and lighter tracing

Now that we have trace_bprintk() which is faster and consume lesser
memory than trace_printk() and has the same purpose, we can now drop
the old implementation in favour of the binary one from trace_bprintk(),
which means we move all the implementation of trace_bprintk() to
trace_printk(), so the Api doesn't change except that we must now use
trace_seq_bprintk() to print the TRACE_PRINT entries.

Some changes result of this:

- Previously, trace_bprintk depended of a single tracer and couldn't
  work without. This tracer has been dropped and the whole implementation
  of trace_printk() (like the module formats management) is now integrated
  in the tracing core (comes with CONFIG_TRACING), though we keep the file
  trace_printk (previously trace_bprintk.c) where we can find the module
  management. Thus we don't overflow trace.c

- changes some parts to use trace_seq_bprintk() to print TRACE_PRINT entries.

- change a bit trace_printk/trace_vprintk macros to support non-builtin formats
  constants, and fix 'const' qualifiers warnings. But this is all transparent for
  developers.

- etc...

V2:

- Rebase against last changes
- Fix mispell on the changelog

V3:

- Rebase against last changes (moving trace_printk() to kernel.h)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig                 |   7 +-
 kernel/trace/Makefile                |   2 +-
 kernel/trace/trace.c                 | 212 ++++++++++-------------------------
 kernel/trace/trace.h                 |  14 +--
 kernel/trace/trace_bprintk.c         | 154 -------------------------
 kernel/trace/trace_functions_graph.c |   6 +-
 kernel/trace/trace_mmiotrace.c       |   9 +-
 kernel/trace/trace_output.c          |  70 ++----------
 kernel/trace/trace_output.h          |   2 +
 kernel/trace/trace_printk.c          | 138 +++++++++++++++++++++++
 10 files changed, 230 insertions(+), 384 deletions(-)
 delete mode 100644 kernel/trace/trace_bprintk.c
 create mode 100644 kernel/trace/trace_printk.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ad8d361..8e4a2a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -52,6 +52,7 @@ config TRACING
 	select STACKTRACE if STACKTRACE_SUPPORT
 	select TRACEPOINTS
 	select NOP_TRACER
+	select BINARY_PRINTF
 
 #
 # Minimum requirements an architecture has to meet for us to
@@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
-config TRACE_BPRINTK
-	bool "Binary printk for tracing"
-	default y
-	depends on TRACING
-	select BINARY_PRINTF
-
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 46557ef..c7a2943 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
-obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o
+obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 46b3cd7..cc94f86 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
+
+/**
+ * trace_vprintk - write binary msg to tracing buffer
+ *
+ */
+int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+{
+	static DEFINE_SPINLOCK(trace_buf_lock);
+	static u32 trace_buf[TRACE_BUF_SIZE];
+
+	struct ring_buffer_event *event;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	struct print_entry *entry;
+	unsigned long flags;
+	int resched;
+	int cpu, len = 0, size, pc;
+
+	if (unlikely(tracing_selftest_running || tracing_disabled))
+		return 0;
+
+	/* Don't pollute graph traces with trace_vprintk internals */
+	pause_graph_tracing();
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(atomic_read(&data->disabled)))
+		goto out;
+
+	spin_lock_irqsave(&trace_buf_lock, flags);
+	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+	if (len > TRACE_BUF_SIZE || len < 0)
+		goto out_unlock;
+
+	size = sizeof(*entry) + sizeof(u32) * len;
+	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc);
+	if (!event)
+		goto out_unlock;
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->depth			= depth;
+	entry->fmt			= fmt;
+
+	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+out_unlock:
+	spin_unlock_irqrestore(&trace_buf_lock, flags);
+
+out:
+	ftrace_preempt_enable(resched);
+	unpause_graph_tracing();
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
 enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 	TRACE_FILE_ANNOTATE	= 2,
@@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, "%s", field->buf);
+	ret = trace_seq_bprintf(s, field->fmt, field->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void)
 	return 0;
 }
 
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
-{
-	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
-	static char trace_buf[TRACE_BUF_SIZE];
-
-	struct ring_buffer_event *event;
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	int cpu, len = 0, size, pc;
-	struct print_entry *entry;
-	unsigned long irq_flags;
-
-	if (tracing_disabled || tracing_selftest_running)
-		return 0;
-
-	pc = preempt_count();
-	preempt_disable_notrace();
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-
-	if (unlikely(atomic_read(&data->disabled)))
-		goto out;
-
-	pause_graph_tracing();
-	raw_local_irq_save(irq_flags);
-	__raw_spin_lock(&trace_buf_lock);
-	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-
-	len = min(len, TRACE_BUF_SIZE-1);
-	trace_buf[len] = 0;
-
-	size = sizeof(*entry) + len + 1;
-	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
-	if (!event)
-		goto out_unlock;
-	entry = ring_buffer_event_data(event);
-	entry->ip			= ip;
-	entry->depth			= depth;
-
-	memcpy(&entry->buf, trace_buf, len);
-	entry->buf[len] = 0;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
- out_unlock:
-	__raw_spin_unlock(&trace_buf_lock);
-	raw_local_irq_restore(irq_flags);
-	unpause_graph_tracing();
- out:
-	preempt_enable_notrace();
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(trace_vprintk);
-
-int __trace_printk(unsigned long ip, const char *fmt, ...)
-{
-	int ret;
-	va_list ap;
-
-	if (!(trace_flags & TRACE_ITER_PRINTK))
-		return 0;
-
-	va_start(ap, fmt);
-	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-	va_end(ap);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__trace_printk);
-
-int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
-{
-	if (!(trace_flags & TRACE_ITER_PRINTK))
-		return 0;
-
-	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-}
-EXPORT_SYMBOL_GPL(__ftrace_vprintk);
-
-/**
- * trace_vbprintk - write binary msg to tracing buffer
- *
- * Caller must insure @fmt are valid when msg is in tracing buffer.
- */
-int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
-{
-	static DEFINE_SPINLOCK(trace_buf_lock);
-	static u32 trace_buf[TRACE_BUF_SIZE];
-
-	struct ring_buffer_event *event;
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	struct bprintk_entry *entry;
-	unsigned long flags;
-	int resched;
-	int cpu, len = 0, size, pc;
-
-	if (tracing_disabled || !trace_bprintk_enable)
-		return 0;
-
-	pc = preempt_count();
-	resched = ftrace_preempt_disable();
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-
-	if (unlikely(atomic_read(&data->disabled)))
-		goto out;
-
-	spin_lock_irqsave(&trace_buf_lock, flags);
-	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-
-	if (len > TRACE_BUF_SIZE || len < 0)
-		goto out_unlock;
-
-	size = sizeof(*entry) + sizeof(u32) * len;
-	event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc);
-	if (!event)
-		goto out_unlock;
-	entry = ring_buffer_event_data(event);
-	entry->ip			= ip;
-	entry->fmt			= fmt;
-
-	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, flags);
-
-out:
-	ftrace_preempt_enable(resched);
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(trace_vbprintk);
-
-int __trace_bprintk(unsigned long ip, const char *fmt, ...)
-{
-	int ret;
-	va_list ap;
-
-	if (!fmt)
-		return 0;
-
-	va_start(ap, fmt);
-	ret = trace_vbprintk(ip, fmt, ap);
-	va_end(ap);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__trace_bprintk);
-
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0f5077f..6140922 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,7 +20,6 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_PRINT,
-	TRACE_BPRINTK,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -120,16 +119,10 @@ struct userstack_entry {
  */
 struct print_entry {
 	struct trace_entry	ent;
-	unsigned long		ip;
+	unsigned long 		ip;
 	int			depth;
-	char			buf[];
-};
-
-struct bprintk_entry {
-	struct trace_entry ent;
-	unsigned long ip;
-	const char *fmt;
-	u32 buf[];
+	const char		*fmt;
+	u32 			buf[];
 };
 #ifdef CONFIG_TRACE_BPRINTK
 extern int trace_bprintk_enable;
@@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
-		IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
deleted file mode 100644
index f4c245a..0000000
--- a/kernel/trace/trace_bprintk.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * trace binary printk
- *
- * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/ftrace.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/fs.h>
-#include <linux/marker.h>
-#include <linux/uaccess.h>
-
-#include "trace.h"
-
-#ifdef CONFIG_MODULES
-
-/* binary printk basic */
-static DEFINE_MUTEX(btrace_mutex);
-/*
- * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt
- * which are queued on trace_bprintk_fmt_list.
- */
-static LIST_HEAD(trace_bprintk_fmt_list);
-
-struct trace_bprintk_fmt {
-	struct list_head list;
-	char fmt[0];
-};
-
-
-static inline void lock_btrace(void)
-{
-	mutex_lock(&btrace_mutex);
-}
-
-static inline void unlock_btrace(void)
-{
-	mutex_unlock(&btrace_mutex);
-}
-
-
-static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
-{
-	struct trace_bprintk_fmt *pos;
-	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
-		if (!strcmp(pos->fmt, fmt))
-			return pos;
-	}
-	return NULL;
-}
-
-static
-void hold_module_trace_bprintk_format(const char **start, const char **end)
-{
-	const char **iter;
-	lock_btrace();
-	for (iter = start; iter < end; iter++) {
-		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
-		if (tb_fmt) {
-			*iter = tb_fmt->fmt;
-			continue;
-		}
-
-		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
-				+ strlen(*iter) + 1, GFP_KERNEL);
-		if (tb_fmt) {
-			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
-			strcpy(tb_fmt->fmt, *iter);
-			*iter = tb_fmt->fmt;
-		} else
-			*iter = NULL;
-	}
-	unlock_btrace();
-}
-
-static int module_trace_bprintk_format_notify(struct notifier_block *self,
-		unsigned long val, void *data)
-{
-	struct module *mod = data;
-	if (mod->num_trace_bprintk_fmt) {
-		const char **start = mod->trace_bprintk_fmt_start;
-		const char **end = start + mod->num_trace_bprintk_fmt;
-
-		if (val == MODULE_STATE_COMING)
-			hold_module_trace_bprintk_format(start, end);
-	}
-	return 0;
-}
-
-#else /* !CONFIG_MODULES */
-__init static int
-module_trace_bprintk_format_notify(struct notifier_block *self,
-		unsigned long val, void *data)
-{
-	return 0;
-}
-#endif /* CONFIG_MODULES */
-
-
-__initdata_or_module static
-struct notifier_block module_trace_bprintk_format_nb = {
-	.notifier_call = module_trace_bprintk_format_notify,
-};
-
-/* events tracer */
-int trace_bprintk_enable;
-
-static void start_bprintk_trace(struct trace_array *tr)
-{
-	tracing_reset_online_cpus(tr);
-	trace_bprintk_enable = 1;
-}
-
-static void stop_bprintk_trace(struct trace_array *tr)
-{
-	trace_bprintk_enable = 0;
-	tracing_reset_online_cpus(tr);
-}
-
-static int init_bprintk_trace(struct trace_array *tr)
-{
-	start_bprintk_trace(tr);
-	return 0;
-}
-
-static struct tracer bprintk_trace __read_mostly =
-{
-	.name	     = "events",
-	.init	     = init_bprintk_trace,
-	.reset	     = stop_bprintk_trace,
-	.start	     = start_bprintk_trace,
-	.stop	     = stop_bprintk_trace,
-};
-
-static __init int init_bprintk(void)
-{
-	int ret = register_module_notifier(&module_trace_bprintk_format_nb);
-	if (ret)
-		return ret;
-
-	ret = register_tracer(&bprintk_trace);
-	if (ret)
-		unregister_module_notifier(&module_trace_bprintk_format_nb);
-	return ret;
-}
-
-device_initcall(init_bprintk);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e527f2f..453ebd3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 		}
 
 	/* The comment */
-	ret = trace_seq_printf(s, "/* %s", trace->buf);
+	ret = trace_seq_printf(s, "/* ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_bprintf(s, trace->fmt, trace->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c401b90..23e346a 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 {
 	struct trace_entry *entry = iter->ent;
 	struct print_entry *print = (struct print_entry *)entry;
-	const char *msg		= print->buf;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret;
 
 	/* The trailing newline must be in the message. */
-	ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
+	ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_bprintf(s, print->fmt, print->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 4ab7120..ef8fd66 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	return len;
 }
 
-static int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
 	int ret;
@@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = {
 };
 
 /* TRACE_PRINT */
-static enum print_line_t trace_print_print(struct trace_iterator *iter,
-					   int flags)
-{
-	struct print_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-
-	if (!trace_seq_printf(s, ": %s", field->buf))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
-{
-	struct print_entry *field;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
-static struct trace_event trace_print_event = {
-	.type	 	= TRACE_PRINT,
-	.trace		= trace_print_print,
-	.raw		= trace_print_raw,
-};
-
-/* TRACE_BPRINTK */
 static enum print_line_t
-trace_bprintk_print(struct trace_iterator *iter, int flags)
+trace_print_print(struct trace_iterator *iter, int flags)
 {
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *s = &iter->seq;
-	struct bprintk_entry *field;
+	struct print_entry *field;
 
 	trace_assign_type(field, entry);
 
@@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static enum print_line_t
-trace_bprintk_raw(struct trace_iterator *iter, int flags)
+
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 {
-	struct trace_entry *entry = iter->ent;
+	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
-	struct bprintk_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!trace_seq_printf(s, ": %lx : ", field->ip))
 		goto partial;
@@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static struct trace_event trace_bprintk_event = {
-	.type	 	= TRACE_BPRINTK,
-	.trace		= trace_bprintk_print,
-	.raw		= trace_bprintk_raw,
-	.hex		= trace_nop_print,
-	.binary		= trace_nop_print,
+
+static struct trace_event trace_print_event = {
+	.type	 	= TRACE_PRINT,
+	.trace		= trace_print_print,
+	.raw		= trace_print_raw,
 };
 
 static struct trace_event *events[] __initdata = {
@@ -937,7 +892,6 @@ static struct trace_event *events[] __initdata = {
 	&trace_stack_event,
 	&trace_user_stack_event,
 	&trace_print_event,
-	&trace_bprintk_event,
 	NULL
 };
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 8a34d68..3b90e6a 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -18,6 +18,8 @@ struct trace_event {
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 0000000..a50aea2
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,138 @@
+/*
+ * trace binary printk
+ *
+ * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/marker.h>
+#include <linux/uaccess.h>
+
+#include "trace.h"
+
+#ifdef CONFIG_MODULES
+
+/*
+ * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
+ * which are queued on trace_bprintk_fmt_list.
+ */
+static LIST_HEAD(trace_bprintk_fmt_list);
+
+/* serialize accesses to trace_bprintk_fmt_list */
+static DEFINE_MUTEX(btrace_mutex);
+
+struct trace_bprintk_fmt {
+	struct list_head list;
+	char fmt[0];
+};
+
+static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
+{
+	struct trace_bprintk_fmt *pos;
+	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
+		if (!strcmp(pos->fmt, fmt))
+			return pos;
+	}
+	return NULL;
+}
+
+static
+void hold_module_trace_bprintk_format(const char **start, const char **end)
+{
+	const char **iter;
+
+	mutex_lock(&btrace_mutex);
+	for (iter = start; iter < end; iter++) {
+		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
+		if (tb_fmt) {
+			*iter = tb_fmt->fmt;
+			continue;
+		}
+
+		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
+				+ strlen(*iter) + 1, GFP_KERNEL);
+		if (tb_fmt) {
+			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+			strcpy(tb_fmt->fmt, *iter);
+			*iter = tb_fmt->fmt;
+		} else
+			*iter = NULL;
+	}
+	mutex_unlock(&btrace_mutex);
+}
+
+static int module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	struct module *mod = data;
+	if (mod->num_trace_bprintk_fmt) {
+		const char **start = mod->trace_bprintk_fmt_start;
+		const char **end = start + mod->num_trace_bprintk_fmt;
+
+		if (val == MODULE_STATE_COMING)
+			hold_module_trace_bprintk_format(start, end);
+	}
+	return 0;
+}
+
+#else /* !CONFIG_MODULES */
+__init static int
+module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+
+__initdata_or_module static
+struct notifier_block module_trace_bprintk_format_nb = {
+	.notifier_call = module_trace_bprintk_format_notify,
+};
+
+int __trace_printk(unsigned long ip, const char *fmt, ...)
+ {
+	int ret;
+	va_list ap;
+
+	if (unlikely(!fmt))
+		return 0;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_printk);
+
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+ {
+	if (unlikely(!fmt))
+		return 0;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+
+
+static __init int init_trace_printk(void)
+{
+	return register_module_notifier(&module_trace_bprintk_format_nb);
+}
+
+early_initcall(init_trace_printk);
-- 
cgit v1.1


From 9de36825b321fe9fe9cf73260554251af579f4ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 6 Mar 2009 17:52:03 +0100
Subject: tracing: trace_bprintk() cleanups

Impact: cleanup

Remove a few leftovers and clean up the code a bit.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c      |  6 ------
 kernel/trace/trace.h | 19 ++++++++-----------
 2 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2dece10..22d7379 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2158,12 +2158,6 @@ static noinline struct module *load_module(void __user *umod,
 					&mod->num_tracepoints);
 #endif
 
-#ifdef CONFIG_TRACE_BPRINTK
-	mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings,
-			"__trace_bprintk_fmt", sizeof(char *),
-			&mod->num_trace_bprintk_fmt);
-#endif
-
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6140922..2bfb7d1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -119,14 +119,11 @@ struct userstack_entry {
  */
 struct print_entry {
 	struct trace_entry	ent;
-	unsigned long 		ip;
+	unsigned long		ip;
 	int			depth;
 	const char		*fmt;
-	u32 			buf[];
+	u32			buf[];
 };
-#ifdef CONFIG_TRACE_BPRINTK
-extern int trace_bprintk_enable;
-#endif
 
 #define TRACE_OLD_SIZE		88
 
@@ -199,7 +196,7 @@ struct kmemtrace_free_entry {
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
  *  IRQS_OFF		- interrupts were disabled
- *  IRQS_NOSUPPORT 	- arch does not support irqs_disabled_flags
+ *  IRQS_NOSUPPORT	- arch does not support irqs_disabled_flags
  *  NEED_RESCED		- reschedule is requested
  *  HARDIRQ		- inside an interrupt handler
  *  SOFTIRQ		- inside a softirq handler
@@ -302,7 +299,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
- 		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
@@ -325,8 +322,8 @@ enum print_line_t {
  * flags value in struct tracer_flags.
  */
 struct tracer_opt {
-	const char 	*name; /* Will appear on the trace_options file */
-	u32 		bit; /* Mask assigned in val field in tracer_flags */
+	const char	*name; /* Will appear on the trace_options file */
+	u32		bit; /* Mask assigned in val field in tracer_flags */
 };
 
 /*
@@ -335,7 +332,7 @@ struct tracer_opt {
  */
 struct tracer_flags {
 	u32			val;
-	struct tracer_opt 	*opts;
+	struct tracer_opt	*opts;
 };
 
 /* Makes more easy to define a tracer opt */
@@ -390,7 +387,7 @@ struct tracer {
 	int			(*set_flag)(u32 old_flags, u32 bit, int set);
 	struct tracer		*next;
 	int			print_max;
-	struct tracer_flags 	*flags;
+	struct tracer_flags	*flags;
 	struct tracer_stat	*stats;
 };
 
-- 
cgit v1.1


From 888b55dc314d26239d84c3b187dae555a81c1605 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sun, 8 Mar 2009 13:12:43 +0900
Subject: ftrace: tracing header should put '#' at the beginning of a line

In a recent discussion, Andrew Morton pointed out that tracing header
should put '#' at the beginning of a line.

Then, we can easily filtered the header by following grep usage:

  cat trace | grep -v '^#'

Wakeup trace also has the same header problem.

Comparison of headers displayed:

before this patch:

 # tracer: wakeup
 #
 wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip
 --------------------------------------------------------------------
  latency: 19059 us, #21277/21277, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
     -----------------
     | task: kondemand/1-1644 (uid:0 nice:-5 policy:0 rt_prio:0)
     -----------------

 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /
 #                |||||     delay
 #  cmd     pid   ||||| time  |   caller
 #     \   /      |||||   \   |   /
 irqbalan-1887    1d.s.    0us :   1887:120:R   + [001]  1644:115:S kondemand/1
 irqbalan-1887    1d.s.    1us : default_wake_function <-autoremove_wake_function
 irqbalan-1887    1d.s.    2us : check_preempt_wakeup <-try_to_wake_up

after this patch:

 # tracer: wakeup
 #
 # wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip
 # --------------------------------------------------------------------
 # latency: 529 us, #530/530, CPU#0 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
 #    -----------------
 #    | task: kondemand/0-1641 (uid:0 nice:-5 policy:0 rt_prio:0)
 #    -----------------
 #
 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /
 #                |||||     delay
 #  cmd     pid   ||||| time  |   caller
 #     \   /      |||||   \   |   /
     sshd-2496    0d.s.    0us :   2496:120:R   + [000]  1641:115:S kondemand/0
     sshd-2496    0d.s.    1us : default_wake_function <-autoremove_wake_function
     sshd-2496    0d.s.    1us : check_preempt_wakeup <-try_to_wake_up

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090308124421.23C3.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cc94f86..e5b5619 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1466,11 +1466,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	total = entries +
 		ring_buffer_overruns(iter->tr->buffer);
 
-	seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
 		   name, UTS_RELEASE);
-	seq_puts(m, "-----------------------------------"
+	seq_puts(m, "# -----------------------------------"
 		 "---------------------------------\n");
-	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+	seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
 		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
 		   nsecs_to_usecs(data->saved_latency),
 		   entries,
@@ -1492,24 +1492,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 #else
 	seq_puts(m, ")\n");
 #endif
-	seq_puts(m, "    -----------------\n");
-	seq_printf(m, "    | task: %.16s-%d "
+	seq_puts(m, "#    -----------------\n");
+	seq_printf(m, "#    | task: %.16s-%d "
 		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
 		   data->comm, data->pid, data->uid, data->nice,
 		   data->policy, data->rt_priority);
-	seq_puts(m, "    -----------------\n");
+	seq_puts(m, "#    -----------------\n");
 
 	if (data->critical_start) {
-		seq_puts(m, " => started at: ");
+		seq_puts(m, "#  => started at: ");
 		seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
 		trace_print_seq(m, &iter->seq);
-		seq_puts(m, "\n => ended at:   ");
+		seq_puts(m, "\n#  => ended at:   ");
 		seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
 		trace_print_seq(m, &iter->seq);
-		seq_puts(m, "\n");
+		seq_puts(m, "#\n");
 	}
 
-	seq_puts(m, "\n");
+	seq_puts(m, "#\n");
 }
 
 static void test_cpu_buff_start(struct trace_iterator *iter)
-- 
cgit v1.1


From c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 9 Mar 2009 18:15:34 +0900
Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer

Impact: improve workqueue tracer output

Currently, /sys/kernel/debug/tracing/trace_stat/workqueues can display
wrong and strange thread names.

Why?

Currently, ftrace has tracing_record_cmdline()/trace_find_cmdline()
convenience function that implements a task->comm string cache.

This can avoid unnecessary memcpy overhead and the workqueue tracer
uses it.

However, in general, any trace statistics feature shouldn't use
tracing_record_cmdline() because trace statistics can display
very old process. Then comm cache can return wrong string because
recent process overrides the cache.

Fortunately, workqueue trace guarantees that displayed processes
are live. Thus we can search comm string from PID at display time.

<before>

% cat workqueues
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   7 431913     431913       kondemand/7
   7      0          0       tail
   7     21         21       git
   7      0          0       ls
   7      9          9       cat
   7 832632     832632       unix_chkpwd
   7 236292     236292       ls

Note: tail, git, ls, cat unix_chkpwd are obiously not workqueue thread.

<after>

% cat workqueues
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   7    510        510       kondemand/7
   7      0          0       kmpathd/7
   7     15         15       ata/7
   7      0          0       aio/7
   7     11         11       kblockd/7
   7   1063       1063       work_on_cpu/7
   7    167        167       events/7

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 4664990..46c8dc8 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -99,8 +99,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 		pr_warning("trace_workqueue: not enough memory\n");
 		return;
 	}
-	tracing_record_cmdline(wq_thread);
-
 	INIT_LIST_HEAD(&cws->list);
 	cws->cpu = cpu;
 
@@ -195,11 +193,12 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct cpu_workqueue_stats *cws = p;
 	unsigned long flags;
 	int cpu = cws->cpu;
+	struct task_struct *tsk = find_task_by_vpid(cws->pid);
 
 	seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
 		   atomic_read(&cws->inserted),
 		   cws->executed,
-		   trace_find_cmdline(cws->pid));
+		   tsk ? tsk->comm : "<...>");
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-- 
cgit v1.1


From 156b5f172a64103bcb13b6d26288388b9019caa3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 6 Mar 2009 10:50:53 -0500
Subject: tracing: typecast sizeof and offsetof to unsigned int

Impact: fix compiler warnings

On x86_64 sizeof and offsetof are treated as long, where as on x86_32
they are int. This patch typecasts them to unsigned int to avoid
one arch giving warnings while the other does not.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 15 ++++++++-------
 kernel/trace/trace_export.c | 10 +++++-----
 kernel/trace/trace_format.h | 12 ++++++------
 3 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4488d90..fa32ca3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -448,8 +448,9 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
 }
 
 #undef FIELD
-#define FIELD(type, name) \
-	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
+#define FIELD(type, name)						\
+	#type, #name, (unsigned int)offsetof(typeof(field), name),	\
+		(unsigned int)sizeof(field.name)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -457,11 +458,11 @@ static int trace_write_header(struct trace_seq *s)
 
 	/* struct trace_entry */
 	return trace_seq_printf(s,
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
 				"\n",
 				FIELD(unsigned char, type),
 				FIELD(unsigned char, flags),
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 0fb7be7..7162ab4 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,11 +18,11 @@
 #include "trace_format.h"
 
 #undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)				\
-	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"	\
-			       "offset:%lu;\tsize:0;\n",	\
-			       offsetof(typeof(field), item));	\
-	if (!ret)						\
+#define TRACE_FIELD_ZERO_CHAR(item)					\
+	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"		\
+			       "offset:%u;\tsize:0;\n",			\
+			       (unsigned int)offsetof(typeof(field), item)); \
+	if (!ret)							\
 		return 0;
 
 
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
index 03f9a4c..97e59a9 100644
--- a/kernel/trace/trace_format.h
+++ b/kernel/trace/trace_format.h
@@ -22,9 +22,9 @@
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
 	if (!ret)							\
 		return 0;
 
@@ -32,9 +32,9 @@
 #undef TRACE_FIELD_SPECIAL
 #define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
 	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
 	if (!ret)							\
 		return 0;
 
-- 
cgit v1.1


From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Mar 2009 15:47:18 -0400
Subject: tracing: replace TP<var> with TP_<var>

Impact: clean up

The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit
ugly. This patch adds an underscore to their names.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_event_types.h    | 28 +++++++++----------
 kernel/trace/trace_events_stage_2.h |  6 ++--
 kernel/trace/trace_events_stage_3.h |  8 +++---
 kernel/trace/trace_export.c         |  8 +++---
 kernel/trace/trace_format.h         | 55 -------------------------------------
 5 files changed, 25 insertions(+), 80 deletions(-)
 delete mode 100644 kernel/trace/trace_format.h

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fb4eba1..d94179a 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned long, parent_ip, parent_ip)
 	),
-	TPRAWFMT(" %lx <-- %lx")
+	TP_RAW_FMT(" %lx <-- %lx")
 );
 
 TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
@@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
 		TRACE_FIELD(unsigned long, graph_ent.func, func)
 		TRACE_FIELD(int, graph_ent.depth, depth)
 	),
-	TPRAWFMT("--> %lx (%d)")
+	TP_RAW_FMT("--> %lx (%d)")
 );
 
 TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
@@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
 		TRACE_FIELD(unsigned long, ret.func, func)
 		TRACE_FIELD(int, ret.depth, depth)
 	),
-	TPRAWFMT("<-- %lx (%d)")
+	TP_RAW_FMT("<-- %lx (%d)")
 );
 
 TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
@@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
 		TRACE_FIELD(unsigned char, next_state, next_state)
 		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
 	),
-	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
 TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
@@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
 		TRACE_FIELD(unsigned char, next_state, next_state)
 		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
 	),
-	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
 TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
@@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
 		TRACE_FIELD(unsigned long, arg2, arg2)
 		TRACE_FIELD(unsigned long, arg3, arg3)
 	),
-	TPRAWFMT("(%08lx) (%08lx) (%08lx)")
+	TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
 );
 
 /*
@@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
 		TRACE_FIELD(unsigned long, caller[6], stack6)
 		TRACE_FIELD(unsigned long, caller[7], stack7)
 	),
-	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
@@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 		TRACE_FIELD(unsigned long, caller[6], stack6)
 		TRACE_FIELD(unsigned long, caller[7], stack7)
 	),
-	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
@@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 		TRACE_FIELD(unsigned int, depth, depth)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
-	TPRAWFMT("%08lx (%d) %s")
+	TP_RAW_FMT("%08lx (%d) %s")
 );
 
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
@@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
 		TRACE_FIELD(char, correct, correct)
 	),
-	TPRAWFMT("%u:%s:%s (%u)")
+	TP_RAW_FMT("%u:%s:%s (%u)")
 );
 
 TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
@@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 		TRACE_FIELD(u64, from, from)
 		TRACE_FIELD(u64, to, to)
 	),
-	TPRAWFMT("from: %llx to: %llx")
+	TP_RAW_FMT("from: %llx to: %llx")
 );
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
@@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.state, state)
 	),
-	TPRAWFMT("%llx->%llx type:%u state:%u")
+	TP_RAW_FMT("%llx->%llx type:%u state:%u")
 );
 
 TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
@@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
 		TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
 		TRACE_FIELD(int, node, node)
 	),
-	TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+	TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
 		 " flags:%x node:%d")
 );
 
@@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
 		TRACE_FIELD(unsigned long, call_site, call_site)
 		TRACE_FIELD(const void *, ptr, ptr)
 	),
-	TPRAWFMT("type:%u call_site:%lx ptr:%p")
+	TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
 );
 
 #undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index d24a97e..8e2e0f5 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -20,7 +20,7 @@
  *
  *	field = (typeof(field))entry;
  *
- *	ret = trace_seq_printf(s, <TPRAWFMT> "%s", <ARGS> "\n");
+ *	ret = trace_seq_printf(s, <TP_RAW_FMT> "%s", <ARGS> "\n");
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -44,8 +44,8 @@
 	field->item,
 
 
-#undef TPRAWFMT
-#define TPRAWFMT(args...)	args
+#undef TP_RAW_FMT
+#define TP_RAW_FMT(args...)	args
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 2c8d76c..557ca52 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -106,8 +106,8 @@
  *
  */
 
-#undef TPFMT
-#define TPFMT(fmt, args...)	fmt "\n", ##args
+#undef TP_FMT
+#define TP_FMT(fmt, args...)	fmt "\n", ##args
 
 #define _TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
@@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
-#undef TPCMD
-#define TPCMD(cmd...)	cmd
+#undef TP_CMD
+#define TP_CMD(cmd...)	cmd
 
 #undef TRACE_ENTRY
 #define TRACE_ENTRY	entry
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7162ab4..e62bc10 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -26,8 +26,8 @@
 		return 0;
 
 
-#undef TPRAWFMT
-#define TPRAWFMT(args...) args
+#undef TP_RAW_FMT
+#define TP_RAW_FMT(args...) args
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
@@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
-#undef TPCMD
-#define TPCMD(cmd...)	cmd
+#undef TP_CMD
+#define TP_CMD(cmd...)	cmd
 
 #undef TRACE_ENTRY
 #define TRACE_ENTRY	entry
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
deleted file mode 100644
index 97e59a9..0000000
--- a/kernel/trace/trace_format.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " size:%d; offset:%d;\n",
- *			       sizeof(field.type),
- *			       offsetof(struct ftrace_raw_##call,
- *					item));
- *
- * }
- */
-
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
-	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
-									\
-	return ret;							\
-}
-
-- 
cgit v1.1


From 9cc26a261d43e5898287a1f5808132f8f05ceb1c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Mar 2009 16:00:22 -0400
Subject: tracing: use generic __stringify

Impact: clean up

This removes the custom made STR(x) macros in the tracer and uses
the generic __stringify macro instead.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c               | 4 +---
 kernel/trace/trace_events_stage_3.h | 4 ++--
 kernel/trace/trace_selftest.c       | 6 ++----
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index f2509cb..9fc918d 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -2,9 +2,7 @@
  * This is the place to register all trace points as events.
  */
 
-/* someday this needs to go in a generic header */
-#define __STR(x) #x
-#define STR(x) __STR(x)
+#include <linux/stringify.h>
 
 #include <trace/trace_events.h>
 
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 557ca52..41b82b9 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -139,7 +139,7 @@ static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
-	.system			= STR(TRACE_SYSTEM),			\
+	.system			= __stringify(TRACE_SYSTEM),		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
@@ -225,7 +225,7 @@ static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
-	.system			= STR(TRACE_SYSTEM),			\
+	.system			= __stringify(TRACE_SYSTEM),		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 7238646..f907a2b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
 /* Include in trace.c */
 
+#include <linux/stringify.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
 
@@ -100,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-#define __STR(x) #x
-#define STR(x) __STR(x)
-
 /* Test dynamic code modification and ftrace filters */
 int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 					   struct trace_array *tr,
@@ -130,7 +128,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	 * start of the function names. We simply put a '*' to
 	 * accommodate them.
 	 */
-	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+	func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
 
 	/* filter only on our function */
 	ftrace_set_filter(func_name, strlen(func_name), 1);
-- 
cgit v1.1


From da4d03020c2af32f73e8bfbab0a66620d85bb9bb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Mar 2009 17:14:30 -0400
Subject: tracing: new format for specialized trace points

Impact: clean up and enhancement

The TRACE_EVENT_FORMAT macro looks quite ugly and is limited in its
ability to save data as well as to print the record out. Working with
Ingo Molnar, we came up with a new format that is much more pleasing to
the eye of C developers. This new macro is more C style than the old
macro, and is more obvious to what it does.

Here's the example. The only updated macro in this patch is the
sched_switch trace point.

The old method looked like this:

 TRACE_EVENT_FORMAT(sched_switch,
        TP_PROTO(struct rq *rq, struct task_struct *prev,
                struct task_struct *next),
        TP_ARGS(rq, prev, next),
        TP_FMT("task %s:%d ==> %s:%d",
              prev->comm, prev->pid, next->comm, next->pid),
        TRACE_STRUCT(
                TRACE_FIELD(pid_t, prev_pid, prev->pid)
                TRACE_FIELD(int, prev_prio, prev->prio)
                TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
                                    next_comm,
                                    TP_CMD(memcpy(TRACE_ENTRY->next_comm,
                                                 next->comm,
                                                 TASK_COMM_LEN)))
                TRACE_FIELD(pid_t, next_pid, next->pid)
                TRACE_FIELD(int, next_prio, next->prio)
        ),
        TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d")
        );

The above method is hard to read and requires two format fields.

The new method:

 /*
  * Tracepoint for task switches, performed by the scheduler:
  *
  * (NOTE: the 'rq' argument is not used by generic trace events,
  *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_switch,

	TP_PROTO(struct rq *rq, struct task_struct *prev,
		 struct task_struct *next),

	TP_ARGS(rq, prev, next),

	TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	prev_pid			)
		__field(	int,	prev_prio			)
		__array(	char,	next_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	next_pid			)
		__field(	int,	next_prio			)
	),

	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
		__entry->next_comm, __entry->next_pid, __entry->next_prio),

	TP_fast_assign(
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid	= prev->pid;
		__entry->prev_prio	= prev->prio;
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid	= next->pid;
		__entry->next_prio	= next->prio;
	)
 );

This macro is called TRACE_EVENT, it is broken up into 5 parts:

 TP_PROTO:        the proto type of the trace point
 TP_ARGS:         the arguments of the trace point
 TP_STRUCT_entry: the structure layout of the entry in the ring buffer
 TP_printk:       the printk format
 TP_fast_assign:  the method used to write the entry into the ring buffer

The structure is the definition of how the event will be saved in the
ring buffer. The printk is used by the internal tracing in case of
an oops, and the kernel needs to print out the format of the record
to the console. This the TP_printk gives a means to show the records
in a human readable format. It is also used to print out the data
from the trace file.

The TP_fast_assign is executed directly. It is basically like a C function,
where the __entry is the handle to the record.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h                |   5 --
 kernel/trace/trace_event_types.h    |   3 +-
 kernel/trace/trace_events.c         | 159 +-----------------------------------
 kernel/trace/trace_events_stage_1.h |  28 ++++---
 kernel/trace/trace_events_stage_2.h |  89 ++++++++++++++++----
 kernel/trace/trace_events_stage_3.h |  34 +++-----
 kernel/trace/trace_export.c         |  23 +++++-
 7 files changed, 124 insertions(+), 217 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2bfb7d1..c5e1d88 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -751,12 +751,7 @@ struct ftrace_event_call {
 	int		(*regfunc)(void);
 	void		(*unregfunc)(void);
 	int		id;
-	struct dentry	*raw_dir;
-	int		raw_enabled;
-	int		type;
 	int		(*raw_init)(void);
-	int		(*raw_reg)(void);
-	void		(*raw_unreg)(void);
 	int		(*show_format)(struct trace_seq *s);
 };
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index d94179a..5cca4c9 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -106,9 +106,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned int, depth, depth)
+		TRACE_FIELD(char *, fmt, fmt)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
-	TP_RAW_FMT("%08lx (%d) %s")
+	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
 );
 
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fa32ca3..1880a64 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -59,22 +59,12 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		if (call->raw_enabled) {
-			call->raw_enabled = 0;
-			call->raw_unreg();
-		}
 		break;
 	case 1:
-		if (!call->enabled &&
-		    (call->type & TRACE_EVENT_TYPE_PRINTF)) {
+		if (!call->enabled) {
 			call->enabled = 1;
 			call->regfunc();
 		}
-		if (!call->raw_enabled &&
-		    (call->type & TRACE_EVENT_TYPE_RAW)) {
-			call->raw_enabled = 1;
-			call->raw_reg();
-		}
 		break;
 	}
 }
@@ -300,7 +290,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call = filp->private_data;
 	char *buf;
 
-	if (call->enabled || call->raw_enabled)
+	if (call->enabled)
 		buf = "1\n";
 	else
 		buf = "0\n";
@@ -346,107 +336,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
-static ssize_t
-event_type_read(struct file *filp, char __user *ubuf, size_t cnt,
-		loff_t *ppos)
-{
-	struct ftrace_event_call *call = filp->private_data;
-	char buf[16];
-	int r = 0;
-
-	if (call->type & TRACE_EVENT_TYPE_PRINTF)
-		r += sprintf(buf, "printf\n");
-
-	if (call->type & TRACE_EVENT_TYPE_RAW)
-		r += sprintf(buf+r, "raw\n");
-
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-event_type_write(struct file *filp, const char __user *ubuf, size_t cnt,
-		 loff_t *ppos)
-{
-	struct ftrace_event_call *call = filp->private_data;
-	char buf[64];
-
-	/*
-	 * If there's only one type, we can't change it.
-	 * And currently we always have printf type, and we
-	 * may or may not have raw type.
-	 *
-	 * This is a redundant check, the file should be read
-	 * only if this is the case anyway.
-	 */
-
-	if (!call->raw_init)
-		return -EPERM;
-
-	if (cnt >= sizeof(buf))
-		return -EINVAL;
-
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-
-	buf[cnt] = 0;
-
-	if (!strncmp(buf, "printf", 6) &&
-	    (!buf[6] || isspace(buf[6]))) {
-
-		call->type = TRACE_EVENT_TYPE_PRINTF;
-
-		/*
-		 * If raw enabled, the disable it and enable
-		 * printf type.
-		 */
-		if (call->raw_enabled) {
-			call->raw_enabled = 0;
-			call->raw_unreg();
-
-			call->enabled = 1;
-			call->regfunc();
-		}
-
-	} else if (!strncmp(buf, "raw", 3) &&
-	    (!buf[3] || isspace(buf[3]))) {
-
-		call->type = TRACE_EVENT_TYPE_RAW;
-
-		/*
-		 * If printf enabled, the disable it and enable
-		 * raw type.
-		 */
-		if (call->enabled) {
-			call->enabled = 0;
-			call->unregfunc();
-
-			call->raw_enabled = 1;
-			call->raw_reg();
-		}
-	} else
-		return -EINVAL;
-
-	*ppos += cnt;
-
-	return cnt;
-}
-
-static ssize_t
-event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
-			   loff_t *ppos)
-{
-	struct ftrace_event_call *call = filp->private_data;
-	char buf[16];
-	int r = 0;
-
-	r += sprintf(buf, "printf\n");
-
-	if (call->raw_init)
-		r += sprintf(buf+r, "raw\n");
-
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
 #undef FIELD
 #define FIELD(type, name)						\
 	#type, #name, (unsigned int)offsetof(typeof(field), name),	\
@@ -470,6 +359,7 @@ static int trace_write_header(struct trace_seq *s)
 				FIELD(int, pid),
 				FIELD(int, tgid));
 }
+
 static ssize_t
 event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
@@ -527,13 +417,6 @@ static const struct seq_operations show_set_event_seq_ops = {
 	.stop = t_stop,
 };
 
-static const struct file_operations ftrace_avail_fops = {
-	.open = ftrace_event_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 static const struct file_operations ftrace_set_event_fops = {
 	.open = ftrace_event_seq_open,
 	.read = seq_read,
@@ -548,17 +431,6 @@ static const struct file_operations ftrace_enable_fops = {
 	.write = event_enable_write,
 };
 
-static const struct file_operations ftrace_type_fops = {
-	.open = tracing_open_generic,
-	.read = event_type_read,
-	.write = event_type_write,
-};
-
-static const struct file_operations ftrace_available_types_fops = {
-	.open = tracing_open_generic,
-	.read = event_available_types_read,
-};
-
 static const struct file_operations ftrace_event_format_fops = {
 	.open = tracing_open_generic,
 	.read = event_format_read,
@@ -647,9 +519,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		}
 	}
 
-	/* default the output to printf */
-	call->type = TRACE_EVENT_TYPE_PRINTF;
-
 	call->dir = debugfs_create_dir(call->name, d_events);
 	if (!call->dir) {
 		pr_warning("Could not create debugfs "
@@ -665,21 +534,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   "'%s/enable' entry\n", call->name);
 	}
 
-	/* Only let type be writable, if we can change it */
-	entry = debugfs_create_file("type",
-				    call->raw_init ? 0644 : 0444,
-				    call->dir, call,
-				    &ftrace_type_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/type' entry\n", call->name);
-
-	entry = debugfs_create_file("available_types", 0444, call->dir, call,
-				    &ftrace_available_types_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/available_types' entry\n", call->name);
-
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
@@ -704,13 +558,6 @@ static __init int event_trace_init(void)
 	if (!d_tracer)
 		return 0;
 
-	entry = debugfs_create_file("available_events", 0444, d_tracer,
-				    (void *)&show_event_seq_ops,
-				    &ftrace_avail_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'available_events' entry\n");
-
 	entry = debugfs_create_file("set_event", 0644, d_tracer,
 				    (void *)&show_set_event_seq_ops,
 				    &ftrace_set_event_fops);
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 3830a73..edfcbd3 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -18,19 +18,23 @@
 #define TRACE_FORMAT(call, proto, args, fmt)
 
 #undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)	\
-	struct ftrace_raw_##name {					\
-		struct trace_entry	ent;				\
-		tstruct							\
-	};								\
-	static struct ftrace_event_call event_##name
+#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)
+
+#undef __array
+#define __array(type, item, len)	type	item[len];
 
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
+#undef __field
+#define __field(type, item)		type	item;
 
-#define TRACE_FIELD(type, item, assign) \
-	type item;
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
-	type_item;
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, print, assign)	\
+	struct ftrace_raw_##name {				\
+		struct trace_entry	ent;			\
+		tstruct						\
+	};							\
+	static struct ftrace_event_call event_##name
 
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 8e2e0f5..d91bf4c 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -32,23 +32,14 @@
  * in binary.
  */
 
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
+#undef __entry
+#define __entry field
 
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign) \
-	field->item,
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
 
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
-	field->item,
-
-
-#undef TP_RAW_FMT
-#define TP_RAW_FMT(args...)	args
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
 enum print_line_t							\
 ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 {									\
@@ -66,14 +57,76 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n");		\
+	ret = trace_seq_printf(s, print);				\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
 	return TRACE_TYPE_HANDLED;					\
 }
-
+	
 #include <trace/trace_event_types.h>
 
-#include "trace_format.h"
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " size:%d; offset:%d;\n",
+ *			       sizeof(field.type),
+ *			       offsetof(struct ftrace_raw_##call,
+ *					item));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __entry
+#define __entry "REC"
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, print, func)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 41b82b9..8e398d8 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -144,27 +144,15 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
 
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
-	entry->item = assign;
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
-	entry->item = assign;
-
-#undef TP_CMD
-#define TP_CMD(cmd...)	cmd
-
-#undef TRACE_ENTRY
-#define TRACE_ENTRY	entry
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw)	\
+	TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))
 
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
-	cmd;
+#undef __entry
+#define __entry entry
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
@@ -185,7 +173,7 @@ static void ftrace_raw_event_##call(proto)				\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-	tstruct;							\
+	assign;								\
 									\
 	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
 }									\
@@ -226,10 +214,8 @@ __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
-	.raw_reg		= ftrace_raw_reg_event_##call,		\
-	.raw_unreg		= ftrace_raw_unreg_event_##call,	\
+	.regfunc		= ftrace_raw_reg_event_##call,		\
+	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
 }
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e62bc10..23ae784 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,7 +15,28 @@
 
 #include "trace_output.h"
 
-#include "trace_format.h"
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
 
 #undef TRACE_FIELD_ZERO_CHAR
 #define TRACE_FIELD_ZERO_CHAR(item)					\
-- 
cgit v1.1


From 157587d7ac555458da9f682e3250135e468470a6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 00:15:34 -0400
Subject: tracing: remove obsolete TRACE_EVENT_FORMAT macro

Impact: clean up

The TRACE_EVENT_FORMAT macro is no longer used by trace points
and only the DECLARE_TRACE, TRACE_FORMAT or TRACE_EVENT macros should
be used by them. Although the TRACE_EVENT_FORMAT macro is still used
by the internal tracing utility, it should not be used in core
kernel code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_1.h | 3 ---
 kernel/trace/trace_events_stage_3.h | 6 +-----
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index edfcbd3..15e9bf9 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -17,9 +17,6 @@
 #undef TRACE_FORMAT
 #define TRACE_FORMAT(call, proto, args, fmt)
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)
-
 #undef __array
 #define __array(type, item, len)	type	item[len];
 
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 8e398d8..3ba55d4 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -35,7 +35,7 @@
  * }
  *
  *
- * For those macros defined with TRACE_EVENT_FORMAT:
+ * For those macros defined with TRACE_EVENT:
  *
  * static struct ftrace_event_call event_<call>;
  *
@@ -144,10 +144,6 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw)	\
-	TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))
-
 #undef __entry
 #define __entry entry
 
-- 
cgit v1.1


From bbcd3063597a3824357cd83c501c2a2aa21ef37b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 10 Mar 2009 10:49:53 +0900
Subject: tracing: Don't assume possible cpu list have continuous numbers

"for (++cpu ; cpu < num_possible_cpus(); cpu++)" statement assumes
possible cpus have continuous number - but that's a wrong assumption.

Insted, cpumask_next() should be used.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090310104437.A480.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 46c8dc8..739fdac 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -91,7 +91,7 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 	struct cpu_workqueue_stats *cws;
 	unsigned long flags;
 
-	WARN_ON(cpu < 0 || cpu >= num_possible_cpus());
+	WARN_ON(cpu < 0);
 
 	/* Workqueues are sometimes created in atomic context */
 	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
@@ -175,12 +175,12 @@ static void *workqueue_stat_next(void *prev, int idx)
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
 		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-		for (++cpu ; cpu < num_possible_cpus(); cpu++) {
-			ret = workqueue_stat_start_cpu(cpu);
-			if (ret)
-				return ret;
-		}
-		return NULL;
+		do {
+			cpu = cpumask_next(cpu, cpu_possible_mask);
+			if (cpu >= nr_cpu_ids)
+				return NULL;
+		} while (!(ret = workqueue_stat_start_cpu(cpu)));
+		return ret;
 	}
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
-- 
cgit v1.1


From bbb76b552a9cef86777181c8577acc907b122b41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Am=C3=A9rico=20Wang?= <xiyou.wangcong@gmail.com>
Date: Tue, 10 Mar 2009 17:34:47 +0800
Subject: ptrace: remove a useless goto

Impact: cleanup

Obviously, this goto is useless. Remove it.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Roland McGrath <roland@redhat.com>
LKML-Reference: <20090310093447.GC3179@hack>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/ptrace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b..1379559 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,8 +612,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
 		goto out_put_task_struct;
 
 	ret = arch_ptrace(child, request, addr, data);
-	if (ret < 0)
-		goto out_put_task_struct;
 
  out_put_task_struct:
 	put_task_struct(child);
-- 
cgit v1.1


From ce8eb2bf05042452107e489782105d2e235cbdd0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 10:14:35 -0400
Subject: tracing: fix printk format specifier

Impact: clean up

The offsetof and sizeof are of type size_t, and instead of typecasting
them to unsigned int for printk formatting, one could just use %zu.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1880a64..a0b41cc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -338,8 +338,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 #undef FIELD
 #define FIELD(type, name)						\
-	#type, #name, (unsigned int)offsetof(typeof(field), name),	\
-		(unsigned int)sizeof(field.name)
+	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -347,11 +346,11 @@ static int trace_write_header(struct trace_seq *s)
 
 	/* struct trace_entry */
 	return trace_seq_printf(s,
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				FIELD(unsigned char, type),
 				FIELD(unsigned char, flags),
-- 
cgit v1.1


From 40e26815fafd3b8c4aced17b1f22e68ef33eb8db Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 11:32:40 -0400
Subject: tracing: do not allow modifying the ftrace events via the event files

Impact: fix to prevent crash on calling NULL function pointer

The ftrace internal records have their format exported via the event
system under the ftrace subsystem. These are only for exporting the
format to allow binary readers to be able to parse them in a binary
output.

The ftrace subsystem events can only be enabled via the ftrace tracers
and do not have a registering function. The event files expect the
event record to have registering function and will call it directly.
Passing in a ftrace subsystem event will cause the kernel to crash
because it will execute a NULL pointer.

This patch prevents the ftrace subsystem from being viewable to the
event enabling files.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a0b41cc..85ec10f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -102,7 +102,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	mutex_lock(&event_mutex);
 	events_for_each(call) {
 
-		if (!call->name)
+		if (!call->name || !call->regfunc)
 			continue;
 
 		if (match &&
@@ -207,8 +207,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 	(*pos)++;
 
-	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
-		return NULL;
+	for (;;) {
+		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+			return NULL;
+
+		/*
+		 * The ftrace subsystem is for showing formats only.
+		 * They can not be enabled or disabled via the event files.
+		 */
+		if (call->regfunc)
+			break;
+
+		call++;
+		next = call;
+	}
 
 	m->private = ++next;
 
-- 
cgit v1.1


From 2314c4ae1461c9e8b26cf8b9a851f280bc5769e1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 12:04:02 -0400
Subject: tracing: add back the available_events file

The event directory files type and available_types were no longer
needed with the new TRACE_EVENT_FORMAT macros, they were deleted.
But by accident the available_events file was also removed.
This patch brings it back.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 85ec10f..769dfd0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -428,6 +428,13 @@ static const struct seq_operations show_set_event_seq_ops = {
 	.stop = t_stop,
 };
 
+static const struct file_operations ftrace_avail_fops = {
+	.open = ftrace_event_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
 static const struct file_operations ftrace_set_event_fops = {
 	.open = ftrace_event_seq_open,
 	.read = seq_read,
@@ -569,6 +576,13 @@ static __init int event_trace_init(void)
 	if (!d_tracer)
 		return 0;
 
+	entry = debugfs_create_file("available_events", 0444, d_tracer,
+				    (void *)&show_event_seq_ops,
+				    &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_events' entry\n");
+
 	entry = debugfs_create_file("set_event", 0644, d_tracer,
 				    (void *)&show_set_event_seq_ops,
 				    &ftrace_set_event_fops);
-- 
cgit v1.1


From 30a8fecc2d34f086df34fe2f2b926f080e002600 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 12:41:38 -0400
Subject: tracing: flip the TP_printk and TP_fast_assign in the TRACE_EVENT
 macro

Impact: clean up

In trying to stay consistant with the C style format in the TRACE_EVENT
macro, it makes more sense to do the printk after the assigning of
the variables.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_1.h | 2 +-
 kernel/trace/trace_events_stage_2.h | 4 ++--
 kernel/trace/trace_events_stage_3.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 15e9bf9..82f6844 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -27,7 +27,7 @@
 #define TP_STRUCT__entry(args...) args
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, print, assign)	\
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
 	struct ftrace_raw_##name {				\
 		struct trace_entry	ent;			\
 		tstruct						\
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index d91bf4c..1ad9f8d 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -39,7 +39,7 @@
 #define TP_printk(fmt, args...) fmt "\n", args
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
 ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 {									\
@@ -115,7 +115,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 #define TP_fast_assign(args...) args
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, print, func)		\
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 static int								\
 ftrace_format_##call(struct trace_seq *s)				\
 {									\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 3ba55d4..d6de06b 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -148,7 +148,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define __entry entry
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
-- 
cgit v1.1


From 0e3d0f0566f3fcf664782f597070bbc669d78454 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 13:12:58 -0400
Subject: tracing: update comments to match event code macros

Impact: clean up / comments

The comments that described the ftrace macros to manipulate the
TRACE_EVENT and TRACE_FORMAT macros no longer match the code.
This patch updates them.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_1.h | 6 ++++--
 kernel/trace/trace_events_stage_2.h | 9 ++++-----
 kernel/trace/trace_events_stage_3.h | 8 ++++----
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 82f6844..38985f9 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -6,11 +6,13 @@
  * struct ftrace_raw_<call> {
  *	struct trace_entry		ent;
  *	<type>				<item>;
+ *	<type2>				<item2>[<len>];
  *	[...]
  * };
  *
- * The <type> <item> is created by the TRACE_FIELD(type, item, assign)
- * macro. We simply do "type item;", and that will create the fields
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
  * in the structure.
  */
 
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 1ad9f8d..ca347af 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -20,7 +20,7 @@
  *
  *	field = (typeof(field))entry;
  *
- *	ret = trace_seq_printf(s, <TP_RAW_FMT> "%s", <ARGS> "\n");
+ *	ret = trace_seq_printf(s, <TP_printk> "\n");
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -76,10 +76,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
  *	int ret;
  *
  *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " size:%d; offset:%d;\n",
- *			       sizeof(field.type),
- *			       offsetof(struct ftrace_raw_##call,
- *					item));
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
  *
  * }
  */
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index d6de06b..6ee1de5 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -56,7 +56,8 @@
  * 		return;
  * 	entry	= ring_buffer_event_data(event);
  *
- * 	<tstruct>;  <-- Here we assign the entries by the TRACE_FIELD.
+ * 	<assign>;  <-- Here we assign the entries by the __field and
+ *			__array macros.
  *
  * 	trace_current_buffer_unlock_commit(event, irq_flags, pc);
  * }
@@ -96,11 +97,10 @@
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
  * 	.name 			= "<call>",
+ *	.system			= "<system>",
+ * 	.raw_init		= ftrace_raw_init_event_<call>,
  * 	.regfunc		= ftrace_reg_event_<call>,
  * 	.unregfunc		= ftrace_unreg_event_<call>,
- * 	.raw_init		= ftrace_raw_init_event_<call>,
- * 	.raw_reg		= ftrace_raw_reg_event_<call>,
- * 	.raw_unreg		= ftrace_raw_unreg_event_<call>,
  *	.show_format		= ftrace_format_<call>,
  * }
  *
-- 
cgit v1.1


From ef18012b248b47ec9a12c3a83ca5e99782d39c5d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 14:10:56 -0400
Subject: tracing: remove funky whitespace in the trace code

Impact: clean up

There existed a lot of <space><tab>'s in the tracing code. This
patch removes them.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/blktrace.c              | 10 ++--
 kernel/trace/trace.c                 |  2 +-
 kernel/trace/trace_branch.c          |  2 +-
 kernel/trace/trace_events_stage_3.h  | 98 ++++++++++++++++++------------------
 kernel/trace/trace_export.c          |  2 +-
 kernel/trace/trace_functions_graph.c |  6 +--
 kernel/trace/trace_output.c          | 14 +++---
 kernel/trace/trace_workqueue.c       |  6 +--
 8 files changed, 70 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e39679a..bec69d3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -33,7 +33,7 @@ static struct trace_array *blk_tr;
 static int __read_mostly  blk_tracer_enabled;
 
 /* Select an alternative, minimalistic output than the original one */
-#define TRACE_BLK_OPT_CLASSIC 	0x1
+#define TRACE_BLK_OPT_CLASSIC	0x1
 
 static struct tracer_opt blk_tracer_opts[] = {
 	/* Default disable the minimalistic output */
@@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
 /**
  * blk_trace_ioctl: - handle the ioctls associated with tracing
  * @bdev:	the block device
- * @cmd: 	the ioctl cmd
+ * @cmd:	the ioctl cmd
  * @arg:	the argument data, if any
  *
  **/
@@ -1128,9 +1128,9 @@ static void blk_tracer_reset(struct trace_array *tr)
 
 static struct {
 	const char *act[2];
-	int 	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+	int	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
 } what2act[] __read_mostly = {
-	[__BLK_TA_QUEUE]	= {{  "Q", "queue" }, 	   blk_log_generic },
+	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
 	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
 	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
@@ -1229,7 +1229,7 @@ static struct tracer blk_tracer __read_mostly = {
 };
 
 static struct trace_event trace_blk_event = {
-	.type	 	= TRACE_BLK,
+	.type		= TRACE_BLK,
 	.trace		= blk_trace_event_print,
 	.binary		= blk_trace_event_print_binary,
 };
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cc94f86..8c6a902 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -799,7 +799,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->tgid               	= (tsk) ? tsk->tgid : 0;
+	entry->tgid			= (tsk) ? tsk->tgid : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index aaa0755..ad8c22e 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -157,7 +157,7 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 
 
 static struct trace_event trace_branch_event = {
-	.type	 	= TRACE_BRANCH,
+	.type		= TRACE_BRANCH,
 	.trace		= trace_branch_print,
 };
 
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 6ee1de5..ae2e323 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -5,23 +5,23 @@
  *
  * static void ftrace_event_<call>(proto)
  * {
- * 	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
+ *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
  * static int ftrace_reg_event_<call>(void)
  * {
- * 	int ret;
+ *	int ret;
  *
- * 	ret = register_trace_<call>(ftrace_event_<call>);
- * 	if (!ret)
- * 		pr_info("event trace: Could not activate trace point "
- * 			"probe to  <call>");
- * 	return ret;
+ *	ret = register_trace_<call>(ftrace_event_<call>);
+ *	if (!ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to  <call>");
+ *	return ret;
  * }
  *
  * static void ftrace_unreg_event_<call>(void)
  * {
- * 	unregister_trace_<call>(ftrace_event_<call>);
+ *	unregister_trace_<call>(ftrace_event_<call>);
  * }
  *
  * For those macros defined with TRACE_FORMAT:
@@ -29,9 +29,9 @@
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
- * 	.name 			= "<call>",
- * 	.regfunc		= ftrace_reg_event_<call>,
- * 	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.name			= "<call>",
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
  * }
  *
  *
@@ -41,66 +41,66 @@
  *
  * static void ftrace_raw_event_<call>(proto)
  * {
- * 	struct ring_buffer_event *event;
- * 	struct ftrace_raw_<call> *entry; <-- defined in stage 1
- * 	unsigned long irq_flags;
- * 	int pc;
- *
- * 	local_save_flags(irq_flags);
- * 	pc = preempt_count();
- *
- * 	event = trace_current_buffer_lock_reserve(event_<call>.id,
- * 				  sizeof(struct ftrace_raw_<call>),
- * 				  irq_flags, pc);
- * 	if (!event)
- * 		return;
- * 	entry	= ring_buffer_event_data(event);
- *
- * 	<assign>;  <-- Here we assign the entries by the __field and
+ *	struct ring_buffer_event *event;
+ *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ *	unsigned long irq_flags;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	event = trace_current_buffer_lock_reserve(event_<call>.id,
+ *				  sizeof(struct ftrace_raw_<call>),
+ *				  irq_flags, pc);
+ *	if (!event)
+ *		return;
+ *	entry	= ring_buffer_event_data(event);
+ *
+ *	<assign>;  <-- Here we assign the entries by the __field and
  *			__array macros.
  *
- * 	trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
  * }
  *
  * static int ftrace_raw_reg_event_<call>(void)
  * {
- * 	int ret;
+ *	int ret;
  *
- * 	ret = register_trace_<call>(ftrace_raw_event_<call>);
- * 	if (!ret)
- * 		pr_info("event trace: Could not activate trace point "
- * 			"probe to <call>");
- * 	return ret;
+ *	ret = register_trace_<call>(ftrace_raw_event_<call>);
+ *	if (!ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to <call>");
+ *	return ret;
  * }
  *
  * static void ftrace_unreg_event_<call>(void)
  * {
- * 	unregister_trace_<call>(ftrace_raw_event_<call>);
+ *	unregister_trace_<call>(ftrace_raw_event_<call>);
  * }
  *
  * static struct trace_event ftrace_event_type_<call> = {
- * 	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ *	.trace			= ftrace_raw_output_<call>, <-- stage 2
  * };
  *
  * static int ftrace_raw_init_event_<call>(void)
  * {
- * 	int id;
+ *	int id;
  *
- * 	id = register_ftrace_event(&ftrace_event_type_<call>);
- * 	if (!id)
- * 		return -ENODEV;
- * 	event_<call>.id = id;
- * 	return 0;
+ *	id = register_ftrace_event(&ftrace_event_type_<call>);
+ *	if (!id)
+ *		return -ENODEV;
+ *	event_<call>.id = id;
+ *	return 0;
  * }
  *
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
- * 	.name 			= "<call>",
+ *	.name			= "<call>",
  *	.system			= "<system>",
- * 	.raw_init		= ftrace_raw_init_event_<call>,
- * 	.regfunc		= ftrace_reg_event_<call>,
- * 	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.raw_init		= ftrace_raw_init_event_<call>,
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
  *	.show_format		= ftrace_format_<call>,
  * }
  *
@@ -138,7 +138,7 @@ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
+	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
@@ -163,7 +163,7 @@ static void ftrace_raw_event_##call(proto)				\
 	pc = preempt_count();						\
 									\
 	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call), 	\
+				  sizeof(struct ftrace_raw_##call),	\
 				  irq_flags, pc);			\
 	if (!event)							\
 		return;							\
@@ -208,7 +208,7 @@ static int ftrace_raw_init_event_##call(void)				\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
+	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 23ae784..4d9952d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -94,7 +94,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
+	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.show_format		= ftrace_format_##call,			\
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 453ebd3..d1493b8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -841,12 +841,12 @@ static void graph_trace_close(struct trace_iterator *iter)
 }
 
 static struct tracer graph_trace __read_mostly = {
-	.name	     	= "function_graph",
+	.name		= "function_graph",
 	.open		= graph_trace_open,
 	.close		= graph_trace_close,
 	.wait_pipe	= poll_wait_pipe,
-	.init	     	= graph_trace_init,
-	.reset	     	= graph_trace_reset,
+	.init		= graph_trace_init,
+	.reset		= graph_trace_reset,
 	.print_line	= print_graph_function,
 	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ef8fd66..491832a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -565,7 +565,7 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
 }
 
 static struct trace_event trace_fn_event = {
-	.type	 	= TRACE_FN,
+	.type		= TRACE_FN,
 	.trace		= trace_fn_trace,
 	.raw		= trace_fn_raw,
 	.hex		= trace_fn_hex,
@@ -696,7 +696,7 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_ctx_event = {
-	.type	 	= TRACE_CTX,
+	.type		= TRACE_CTX,
 	.trace		= trace_ctx_print,
 	.raw		= trace_ctx_raw,
 	.hex		= trace_ctx_hex,
@@ -704,7 +704,7 @@ static struct trace_event trace_ctx_event = {
 };
 
 static struct trace_event trace_wake_event = {
-	.type	 	= TRACE_WAKE,
+	.type		= TRACE_WAKE,
 	.trace		= trace_wake_print,
 	.raw		= trace_wake_raw,
 	.hex		= trace_wake_hex,
@@ -759,7 +759,7 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_special_event = {
-	.type	 	= TRACE_SPECIAL,
+	.type		= TRACE_SPECIAL,
 	.trace		= trace_special_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
@@ -796,7 +796,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_stack_event = {
-	.type	 	= TRACE_STACK,
+	.type		= TRACE_STACK,
 	.trace		= trace_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
@@ -825,7 +825,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_user_stack_event = {
-	.type	 	= TRACE_USER_STACK,
+	.type		= TRACE_USER_STACK,
 	.trace		= trace_user_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
@@ -879,7 +879,7 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 
 
 static struct trace_event trace_print_event = {
-	.type	 	= TRACE_PRINT,
+	.type		= TRACE_PRINT,
 	.trace		= trace_print_print,
 	.raw		= trace_print_raw,
 };
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 4664990..e542483 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -19,14 +19,14 @@ struct cpu_workqueue_stats {
 /* Useful to know if we print the cpu headers */
 	bool		            first_entry;
 	int		            cpu;
-	pid_t 			    pid;
+	pid_t			    pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
-	atomic_t 	            inserted;
+	atomic_t	            inserted;
 /*
  *  Don't need to be atomic, works are serialized in a single workqueue thread
  *  on a single CPU.
  */
-	unsigned int 	 	    executed;
+	unsigned int		    executed;
 };
 
 /* List of workqueue threads on one cpu */
-- 
cgit v1.1


From 6cc3c6e12bb039047974ad2e7e2d46d15a1b762f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 10 Mar 2009 19:03:43 +0100
Subject: trace_clock: fix preemption bug

Using the function_graph tracer in recent kernels generates a spew of
preemption BUGs. Fix this by not requiring trace_clock_local() users
to disable preemption themselves.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_clock.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 2d4953f..05b176a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -27,12 +27,19 @@
  */
 u64 notrace trace_clock_local(void)
 {
+	unsigned long flags;
+	u64 clock;
+
 	/*
 	 * sched_clock() is an architecture implemented, fast, scalable,
 	 * lockless clock. It is not guaranteed to be coherent across
 	 * CPUs, nor across CPU idle events.
 	 */
-	return sched_clock();
+	raw_local_irq_save(flags);
+	clock = sched_clock();
+	raw_local_irq_restore(flags);
+
+	return clock;
 }
 
 /*
-- 
cgit v1.1


From 80370cb758e7ca2692cd9fb5e413d970b1f4b2b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 17:16:35 -0400
Subject: tracing: use raw spinlocks for trace_vprintk

Impact: prevent locking up by lockdep tracer

The lockdep tracer uses trace_vprintk and thus trace_vprintk can not
call back into lockdep without locking up.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8c6a902..4c97947 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1176,7 +1176,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
  */
 int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	static DEFINE_SPINLOCK(trace_buf_lock);
+	static raw_spinlock_t trace_buf_lock =
+		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static u32 trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -1201,7 +1202,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	if (unlikely(atomic_read(&data->disabled)))
 		goto out;
 
-	spin_lock_irqsave(&trace_buf_lock, flags);
+	/* Lockdep uses trace_printk for lock tracing */
+	local_irq_save(flags);
+	__raw_spin_lock(&trace_buf_lock);
 	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	if (len > TRACE_BUF_SIZE || len < 0)
@@ -1220,7 +1223,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, flags);
+	__raw_spin_unlock(&trace_buf_lock);
+	local_irq_restore(flags);
 
 out:
 	ftrace_preempt_enable(resched);
-- 
cgit v1.1


From 73c5162aa362a543793f4a957c6c536dcbaa89ce Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 13:42:01 -0400
Subject: tracing: keep ring buffer to minimum size till used

Impact: less memory impact on systems not using tracer

When the kernel boots up that has tracing configured, it allocates
the default size of the ring buffer. This currently happens to be
1.4Megs per possible CPU. This is quite a bit of wasted memory if
the system is never using the tracer.

The current solution is to keep the ring buffers to a minimum size
until the user uses them. Once a tracer is piped into the current_tracer
the ring buffer will be expanded to the default size. If the user
changes the size of the ring buffer, it will take the size given
by the user immediately.

If the user adds a "ftrace=" to the kernel command line, then the ring
buffers will be set to the default size on initialization.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 79 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4c97947..0c1dc18 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -45,6 +45,12 @@ unsigned long __read_mostly	tracing_max_latency;
 unsigned long __read_mostly	tracing_thresh;
 
 /*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+static int ring_buffer_expanded;
+
+/*
  * We need to change this state when a selftest is running.
  * A selftest will lurk into the ring-buffer to count the
  * entries inserted during the selftest although some concurrent
@@ -128,6 +134,8 @@ static int __init set_ftrace(char *str)
 {
 	strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
+	/* We are using ftrace early, expand it */
+	ring_buffer_expanded = 1;
 	return 1;
 }
 __setup("ftrace=", set_ftrace);
@@ -2315,6 +2323,40 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
 	return t->init(tr);
 }
 
+static int tracing_resize_ring_buffer(unsigned long size)
+{
+	int ret;
+
+	/*
+	 * If kernel or user changes the size of the ring buffer
+	 * it get completed.
+	 */
+	ring_buffer_expanded = 1;
+
+	ret = ring_buffer_resize(global_trace.buffer, size);
+	if (ret < 0)
+		return ret;
+
+	ret = ring_buffer_resize(max_tr.buffer, size);
+	if (ret < 0) {
+		int r;
+
+		r = ring_buffer_resize(global_trace.buffer,
+				       global_trace.entries);
+		if (r < 0) {
+			/* AARGH! We are left with different
+			 * size max buffer!!!! */
+			WARN_ON(1);
+			tracing_disabled = 1;
+		}
+		return ret;
+	}
+
+	global_trace.entries = size;
+
+	return ret;
+}
+
 struct trace_option_dentry;
 
 static struct trace_option_dentry *
@@ -2330,6 +2372,13 @@ static int tracing_set_tracer(const char *buf)
 	struct tracer *t;
 	int ret = 0;
 
+	if (!ring_buffer_expanded) {
+		ret = tracing_resize_ring_buffer(trace_buf_size);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+	}
+
 	mutex_lock(&trace_types_lock);
 	for (t = trace_types; t; t = t->next) {
 		if (strcmp(t->name, buf) == 0)
@@ -2903,28 +2952,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	val <<= 10;
 
 	if (val != global_trace.entries) {
-		ret = ring_buffer_resize(global_trace.buffer, val);
+		ret = tracing_resize_ring_buffer(val);
 		if (ret < 0) {
 			cnt = ret;
 			goto out;
 		}
-
-		ret = ring_buffer_resize(max_tr.buffer, val);
-		if (ret < 0) {
-			int r;
-			cnt = ret;
-			r = ring_buffer_resize(global_trace.buffer,
-					       global_trace.entries);
-			if (r < 0) {
-				/* AARGH! We are left with different
-				 * size max buffer!!!! */
-				WARN_ON(1);
-				tracing_disabled = 1;
-			}
-			goto out;
-		}
-
-		global_trace.entries = val;
 	}
 
 	filp->f_pos += cnt;
@@ -3916,6 +3948,7 @@ void ftrace_dump(void)
 __init static int tracer_alloc_buffers(void)
 {
 	struct trace_array_cpu *data;
+	int ring_buf_size;
 	int i;
 	int ret = -ENOMEM;
 
@@ -3928,12 +3961,18 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
 		goto out_free_tracing_cpumask;
 
+	/* To save memory, keep the ring buffer size to its minimum */
+	if (ring_buffer_expanded)
+		ring_buf_size = trace_buf_size;
+	else
+		ring_buf_size = 1;
+
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
 	cpumask_clear(tracing_reader_cpumask);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
-	global_trace.buffer = ring_buffer_alloc(trace_buf_size,
+	global_trace.buffer = ring_buffer_alloc(ring_buf_size,
 						   TRACE_BUFFER_FLAGS);
 	if (!global_trace.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
@@ -3944,7 +3983,7 @@ __init static int tracer_alloc_buffers(void)
 
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	max_tr.buffer = ring_buffer_alloc(trace_buf_size,
+	max_tr.buffer = ring_buffer_alloc(ring_buf_size,
 					     TRACE_BUFFER_FLAGS);
 	if (!max_tr.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
-- 
cgit v1.1


From 1852fcce181faa237c010a3dbedb473cf9d4555f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 14:33:00 -0400
Subject: tracing: expand the ring buffers when an event is activated

To save memory, the tracer ring buffers are set to a minimum.
The activating of a trace expands the ring buffer size. This patch
adds this expanding, when an event is activated.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c        | 20 ++++++++++++++++++++
 kernel/trace/trace.h        |  3 +++
 kernel/trace/trace_events.c |  8 ++++++++
 3 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0c1dc18..35ee63a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2357,6 +2357,26 @@ static int tracing_resize_ring_buffer(unsigned long size)
 	return ret;
 }
 
+/**
+ * tracing_update_buffers - used by tracing facility to expand ring buffers
+ *
+ * To save on memory when the tracing is never used on a system with it
+ * configured in. The ring buffers are set to a minimum size. But once
+ * a user starts to use the tracing facility, then they need to grow
+ * to their default size.
+ *
+ * This function is to be called when a tracer is about to be used.
+ */
+int tracing_update_buffers(void)
+{
+	int ret = 0;
+
+	if (!ring_buffer_expanded)
+		ret = tracing_resize_ring_buffer(trace_buf_size);
+
+	return ret;
+}
+
 struct trace_option_dentry;
 
 static struct trace_option_dentry *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5e1d88..336324d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -737,6 +737,9 @@ static inline void trace_branch_disable(void)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
+/* set ring buffers to default size if not already done so */
+int tracing_update_buffers(void);
+
 /* trace event type bit fields, not numeric */
 enum {
 	TRACE_EVENT_TYPE_PRINTF		= 1,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 769dfd0..ca624df 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -141,6 +141,10 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 	if (!cnt || cnt < 0)
 		return 0;
 
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
 	ret = get_user(ch, ubuf++);
 	if (ret)
 		return ret;
@@ -331,6 +335,10 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret < 0)
 		return ret;
 
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
 	switch (val) {
 	case 0:
 	case 1:
-- 
cgit v1.1


From 9aba60fe6eb20453de53a572143bef22fa929fba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 19:52:30 -0400
Subject: tracing: fix trace_wait to know to wait on all cpus or just one

Impact: fix to task live locking on reading trace_pipe on one CPU

The same code is used for both trace_pipe (all CPUS) and the per_cpu
trace_pipe file. When there is no data to read, it will check for
signals and wait on the trace wait queue.

The problem happens with the per_cpu wait. The trace_wait code checks
all CPUs. Thus, if there's data in another CPU buffer, then it will
exit the wait, without checking for signals or waiting on the wait queue.

It would then try to read the empty buffer, and since that will just
return nothing, then it will try to wait again. Unfortunately, that will
again fail due to there still being data in the other buffers. This
ends up with a live lock for the task.

This patch fixes the trace_wait to be aware that the iterator may only
be waiting on a single buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 35ee63a..e60f4be 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1666,6 +1666,19 @@ static int trace_empty(struct trace_iterator *iter)
 {
 	int cpu;
 
+	/* If we are looking at one CPU buffer, only check that one */
+	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
+		cpu = iter->cpu_file;
+		if (iter->buffer_iter[cpu]) {
+			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+				return 0;
+		} else {
+			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+				return 0;
+		}
+		return 1;
+	}
+
 	for_each_tracing_cpu(cpu) {
 		if (iter->buffer_iter[cpu]) {
 			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
-- 
cgit v1.1


From 554f786e284a6ce859d51f62240d615603944c8e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 22:00:13 -0400
Subject: ring-buffer: only allocate buffers for online cpus

Impact: save on memory

Currently, a ring buffer was allocated for each "possible_cpus". On
some systems, this is the same as NR_CPUS. Thus, if a system defined
NR_CPUS = 64 but it only had 1 CPU, we could have possibly 63 useless
ring buffers taking up space. With a default buffer of 3 megs, this
could be quite drastic.

This patch changes the ring buffer code to only allocate ring buffers
for online CPUs.  If a CPU goes off line, we do not free the buffer.
This is because the user may still have trace data in that buffer
that they would like to look at.

Perhaps in the future we could add code to delete a ring buffer if
the CPU is offline and the ring buffer becomes empty.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++++++++++++++---------
 kernel/trace/trace.c       |   6 -
 2 files changed, 216 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1788584..d07c288 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/list.h>
+#include <linux/cpu.h>
 #include <linux/fs.h>
 
 #include "trace.h"
@@ -301,6 +302,10 @@ struct ring_buffer {
 	struct mutex			mutex;
 
 	struct ring_buffer_per_cpu	**buffers;
+
+#ifdef CONFIG_HOTPLUG
+	struct notifier_block		cpu_notify;
+#endif
 };
 
 struct ring_buffer_iter {
@@ -459,6 +464,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
  */
 extern int ring_buffer_page_too_big(void);
 
+#ifdef CONFIG_HOTPLUG
+static int __cpuinit rb_cpu_notify(struct notifier_block *self,
+				   unsigned long action, void *hcpu);
+#endif
+
 /**
  * ring_buffer_alloc - allocate a new ring_buffer
  * @size: the size in bytes per cpu that is needed.
@@ -496,7 +506,8 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	if (buffer->pages == 1)
 		buffer->pages++;
 
-	cpumask_copy(buffer->cpumask, cpu_possible_mask);
+	get_online_cpus();
+	cpumask_copy(buffer->cpumask, cpu_online_mask);
 	buffer->cpus = nr_cpu_ids;
 
 	bsize = sizeof(void *) * nr_cpu_ids;
@@ -512,6 +523,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 			goto fail_free_buffers;
 	}
 
+#ifdef CONFIG_HOTPLUG
+	buffer->cpu_notify.notifier_call = rb_cpu_notify;
+	buffer->cpu_notify.priority = 0;
+	register_cpu_notifier(&buffer->cpu_notify);
+#endif
+
+	put_online_cpus();
 	mutex_init(&buffer->mutex);
 
 	return buffer;
@@ -525,6 +543,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 
  fail_free_cpumask:
 	free_cpumask_var(buffer->cpumask);
+	put_online_cpus();
 
  fail_free_buffer:
 	kfree(buffer);
@@ -541,9 +560,17 @@ ring_buffer_free(struct ring_buffer *buffer)
 {
 	int cpu;
 
+	get_online_cpus();
+
+#ifdef CONFIG_HOTPLUG
+	unregister_cpu_notifier(&buffer->cpu_notify);
+#endif
+
 	for_each_buffer_cpu(buffer, cpu)
 		rb_free_cpu_buffer(buffer->buffers[cpu]);
 
+	put_online_cpus();
+
 	free_cpumask_var(buffer->cpumask);
 
 	kfree(buffer);
@@ -649,16 +676,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		return size;
 
 	mutex_lock(&buffer->mutex);
+	get_online_cpus();
 
 	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 
 	if (size < buffer_size) {
 
 		/* easy case, just free pages */
-		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
-			mutex_unlock(&buffer->mutex);
-			return -1;
-		}
+		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
+			goto out_fail;
 
 		rm_pages = buffer->pages - nr_pages;
 
@@ -677,10 +703,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	 * add these pages to the cpu_buffers. Otherwise we just free
 	 * them all and return -ENOMEM;
 	 */
-	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
-		mutex_unlock(&buffer->mutex);
-		return -1;
-	}
+	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
+		goto out_fail;
 
 	new_pages = nr_pages - buffer->pages;
 
@@ -705,13 +729,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		rb_insert_pages(cpu_buffer, &pages, new_pages);
 	}
 
-	if (RB_WARN_ON(buffer, !list_empty(&pages))) {
-		mutex_unlock(&buffer->mutex);
-		return -1;
-	}
+	if (RB_WARN_ON(buffer, !list_empty(&pages)))
+		goto out_fail;
 
  out:
 	buffer->pages = nr_pages;
+	put_online_cpus();
 	mutex_unlock(&buffer->mutex);
 
 	return size;
@@ -721,8 +744,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		list_del_init(&bpage->list);
 		free_buffer_page(bpage);
 	}
+	put_online_cpus();
 	mutex_unlock(&buffer->mutex);
 	return -ENOMEM;
+
+	/*
+	 * Something went totally wrong, and we are too paranoid
+	 * to even clean up the mess.
+	 */
+ out_fail:
+	put_online_cpus();
+	mutex_unlock(&buffer->mutex);
+	return -1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_resize);
 
@@ -1528,11 +1561,15 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
+	get_online_cpus();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_inc(&cpu_buffer->record_disabled);
+ out:
+	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 
@@ -1548,11 +1585,15 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
+	get_online_cpus();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_dec(&cpu_buffer->record_disabled);
+ out:
+	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
@@ -1564,12 +1605,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret = 0;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return 0;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	return cpu_buffer->entries;
+	ret = cpu_buffer->entries;
+ out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 
@@ -1581,12 +1629,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret = 0;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return 0;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	return cpu_buffer->overrun;
+	ret = cpu_buffer->overrun;
+ out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
@@ -1603,12 +1658,16 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	unsigned long entries = 0;
 	int cpu;
 
+	get_online_cpus();
+
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		entries += cpu_buffer->entries;
 	}
 
+	put_online_cpus();
+
 	return entries;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries);
@@ -1626,12 +1685,16 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 	unsigned long overruns = 0;
 	int cpu;
 
+	get_online_cpus();
+
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		overruns += cpu_buffer->overrun;
 	}
 
+	put_online_cpus();
+
 	return overruns;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overruns);
@@ -1663,9 +1726,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
  */
 void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 {
-	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned long flags;
 
+	if (!iter)
+		return;
+
+	cpu_buffer = iter->cpu_buffer;
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	rb_iter_reset(iter);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -1900,9 +1968,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct buffer_page *reader;
 	int nr_loops = 0;
 
-	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return NULL;
-
 	cpu_buffer = buffer->buffers[cpu];
 
  again:
@@ -2028,13 +2093,21 @@ struct ring_buffer_event *
 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	struct ring_buffer_event *event;
+	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
 
+	get_online_cpus();
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		goto out;
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	put_online_cpus();
+
 	return event;
 }
 
@@ -2071,24 +2144,31 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 struct ring_buffer_event *
 ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
-	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	struct ring_buffer_event *event;
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
 
+	/* might be called in atomic */
+	preempt_disable();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return NULL;
+		goto out;
 
+	cpu_buffer = buffer->buffers[cpu];
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	event = rb_buffer_peek(buffer, cpu, ts);
 	if (!event)
-		goto out;
+		goto out_unlock;
 
 	rb_advance_reader(cpu_buffer);
 
- out:
+ out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	preempt_enable();
+
 	return event;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2109,15 +2189,17 @@ struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct ring_buffer_iter *iter;
+	struct ring_buffer_iter *iter = NULL;
 	unsigned long flags;
 
+	get_online_cpus();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return NULL;
+		goto out;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
-		return NULL;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 
@@ -2132,6 +2214,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	put_online_cpus();
+
 	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -2224,9 +2309,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	unsigned long flags;
+	int resched;
+
+	/* Can't use get_online_cpus because this can be in atomic */
+	resched = ftrace_preempt_disable();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return;
+		goto out;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
@@ -2237,6 +2326,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ out:
+	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
@@ -2246,10 +2337,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  */
 void ring_buffer_reset(struct ring_buffer *buffer)
 {
+	int resched;
 	int cpu;
 
+	/* Can't use get_online_cpus because this can be in atomic */
+	resched = ftrace_preempt_disable();
+
 	for_each_buffer_cpu(buffer, cpu)
 		ring_buffer_reset_cpu(buffer, cpu);
+
+	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset);
 
@@ -2262,12 +2359,17 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu;
 
+	get_online_cpus();
+
 	/* yes this is racy, but if you don't like the race, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		if (!rb_per_cpu_empty(cpu_buffer))
 			return 0;
 	}
+
+	put_online_cpus();
+
 	return 1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2280,12 +2382,20 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	int ret = 1;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return 1;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	return rb_per_cpu_empty(cpu_buffer);
+	ret = rb_per_cpu_empty(cpu_buffer);
+
+ out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
 
@@ -2304,32 +2414,37 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 {
 	struct ring_buffer_per_cpu *cpu_buffer_a;
 	struct ring_buffer_per_cpu *cpu_buffer_b;
+	int ret = -EINVAL;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
 	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
-		return -EINVAL;
+		goto out;
 
 	/* At least make sure the two buffers are somewhat the same */
 	if (buffer_a->pages != buffer_b->pages)
-		return -EINVAL;
+		goto out;
+
+	ret = -EAGAIN;
 
 	if (ring_buffer_flags != RB_BUFFERS_ON)
-		return -EAGAIN;
+		goto out;
 
 	if (atomic_read(&buffer_a->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	if (atomic_read(&buffer_b->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
 	if (atomic_read(&cpu_buffer_a->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	if (atomic_read(&cpu_buffer_b->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	/*
 	 * We can't do a synchronize_sched here because this
@@ -2349,7 +2464,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	atomic_dec(&cpu_buffer_a->record_disabled);
 	atomic_dec(&cpu_buffer_b->record_disabled);
 
-	return 0;
+	ret = 0;
+out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
@@ -2464,27 +2583,32 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	u64 save_timestamp;
 	int ret = -1;
 
+	get_online_cpus();
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		goto out;
+
 	/*
 	 * If len is not big enough to hold the page header, then
 	 * we can not copy anything.
 	 */
 	if (len <= BUF_PAGE_HDR_SIZE)
-		return -1;
+		goto out;
 
 	len -= BUF_PAGE_HDR_SIZE;
 
 	if (!data_page)
-		return -1;
+		goto out;
 
 	bpage = *data_page;
 	if (!bpage)
-		return -1;
+		goto out;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	reader = rb_get_reader_page(cpu_buffer);
 	if (!reader)
-		goto out;
+		goto out_unlock;
 
 	event = rb_reader_event(cpu_buffer);
 
@@ -2506,7 +2630,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		unsigned int size;
 
 		if (full)
-			goto out;
+			goto out_unlock;
 
 		if (len > (commit - read))
 			len = (commit - read);
@@ -2514,7 +2638,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		size = rb_event_length(event);
 
 		if (len < size)
-			goto out;
+			goto out_unlock;
 
 		/* save the current timestamp, since the user will need it */
 		save_timestamp = cpu_buffer->read_stamp;
@@ -2553,9 +2677,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	}
 	ret = read;
 
- out:
+ out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	put_online_cpus();
+
 	return ret;
 }
 
@@ -2629,3 +2756,42 @@ static __init int rb_init_debugfs(void)
 }
 
 fs_initcall(rb_init_debugfs);
+
+#ifdef CONFIG_HOTPLUG
+static int __cpuinit rb_cpu_notify(struct notifier_block *self,
+				   unsigned long action, void *hcpu)
+{
+	struct ring_buffer *buffer =
+		container_of(self, struct ring_buffer, cpu_notify);
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (cpu_isset(cpu, *buffer->cpumask))
+			return NOTIFY_OK;
+
+		buffer->buffers[cpu] =
+			rb_allocate_cpu_buffer(buffer, cpu);
+		if (!buffer->buffers[cpu]) {
+			WARN(1, "failed to allocate ring buffer on CPU %ld\n",
+			     cpu);
+			return NOTIFY_OK;
+		}
+		smp_wmb();
+		cpu_set(cpu, *buffer->cpumask);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		/*
+		 * Do nothing.
+		 *  If we were to free the buffer, then the user would
+		 *  lose any trace that was in the buffer.
+		 */
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+#endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e60f4be..14c98f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1805,17 +1805,11 @@ __tracing_open(struct inode *inode, struct file *file)
 
 			iter->buffer_iter[cpu] =
 				ring_buffer_read_start(iter->tr->buffer, cpu);
-
-			if (!iter->buffer_iter[cpu])
-				goto fail_buffer;
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
 				ring_buffer_read_start(iter->tr->buffer, cpu);
-
-		if (!iter->buffer_iter[cpu])
-			goto fail;
 	}
 
 	/* TODO stop tracer */
-- 
cgit v1.1


From d820ac4c2fa881079e6b689d2098adce337558ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 01:30:40 +0100
Subject: locking: rename trace_softirq_[enter|exit] =>
 lockdep_softirq_[enter|exit]

Impact: cleanup

The naming clashes with upcoming softirq tracepoints, so rename the
APIs to lockdep_*().

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9041ea7..08a030f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void)
 	account_system_vtime(current);
 
 	__local_bh_disable((unsigned long)__builtin_return_address(0));
-	trace_softirq_enter();
+	lockdep_softirq_enter();
 
 	cpu = smp_processor_id();
 restart:
@@ -220,7 +220,7 @@ restart:
 	if (pending)
 		wakeup_softirqd();
 
-	trace_softirq_exit();
+	lockdep_softirq_exit();
 
 	account_system_vtime(current);
 	_local_bh_enable();
-- 
cgit v1.1


From a123c52b46a1f84bcec3dc963351896c6d6afaf7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 11:21:08 -0400
Subject: tracing: fix comments about trace buffer resizing

Impact: cleanup

Some of the comments about the trace buffer resizing is gobbledygook.
And I wonder why people question if I'm a native English speaker.

This patch makes the comments make a bit more sense.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c3946a6..c61ee85 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2336,7 +2336,8 @@ static int tracing_resize_ring_buffer(unsigned long size)
 
 	/*
 	 * If kernel or user changes the size of the ring buffer
-	 * it get completed.
+	 * we use the size that was given, and we can forget about
+	 * expanding it later.
 	 */
 	ring_buffer_expanded = 1;
 
@@ -2351,8 +2352,20 @@ static int tracing_resize_ring_buffer(unsigned long size)
 		r = ring_buffer_resize(global_trace.buffer,
 				       global_trace.entries);
 		if (r < 0) {
-			/* AARGH! We are left with different
-			 * size max buffer!!!! */
+			/*
+			 * AARGH! We are left with different
+			 * size max buffer!!!!
+			 * The max buffer is our "snapshot" buffer.
+			 * When a tracer needs a snapshot (one of the
+			 * latency tracers), it swaps the max buffer
+			 * with the saved snap shot. We succeeded to
+			 * update the size of the main buffer, but failed to
+			 * update the size of the max buffer. But when we tried
+			 * to reset the main buffer to the original size, we
+			 * failed there too. This is very unlikely to
+			 * happen, but if it does, warn and kill all
+			 * tracing.
+			 */
 			WARN_ON(1);
 			tracing_disabled = 1;
 		}
-- 
cgit v1.1


From 1027fcb206a0fb8348e63aff078c74bdee1c2698 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 11:33:20 -0400
Subject: tracing: protect ring_buffer_expanded with trace_types_lock

Impact: prevent races with ring_buffer_expanded

This patch places the expanding of the tracing buffer under the
protection of the trace_types_lock mutex. It is highly unlikely
that there would be any contention, but better safe than sorry.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c61ee85..04ab824 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2391,8 +2391,10 @@ int tracing_update_buffers(void)
 {
 	int ret = 0;
 
+	mutex_lock(&trace_types_lock);
 	if (!ring_buffer_expanded)
 		ret = tracing_resize_ring_buffer(trace_buf_size);
+	mutex_unlock(&trace_types_lock);
 
 	return ret;
 }
@@ -2412,6 +2414,8 @@ static int tracing_set_tracer(const char *buf)
 	struct tracer *t;
 	int ret = 0;
 
+	mutex_lock(&trace_types_lock);
+
 	if (!ring_buffer_expanded) {
 		ret = tracing_resize_ring_buffer(trace_buf_size);
 		if (ret < 0)
@@ -2419,7 +2423,6 @@ static int tracing_set_tracer(const char *buf)
 		ret = 0;
 	}
 
-	mutex_lock(&trace_types_lock);
 	for (t = trace_types; t; t = t->next) {
 		if (strcmp(t->name, buf) == 0)
 			break;
-- 
cgit v1.1


From 59222efe2d184956464abe5b637bc842ff053b93 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 11:46:03 -0400
Subject: ring-buffer: use CONFIG_HOTPLUG_CPU not CONFIG_HOTPLUG

The hotplug code in the ring buffers is for use with CPU hotplug,
not generic hotplug.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d07c288..035b56c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -303,7 +303,7 @@ struct ring_buffer {
 
 	struct ring_buffer_per_cpu	**buffers;
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block		cpu_notify;
 #endif
 };
@@ -464,7 +464,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
  */
 extern int ring_buffer_page_too_big(void);
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit rb_cpu_notify(struct notifier_block *self,
 				   unsigned long action, void *hcpu);
 #endif
@@ -523,7 +523,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 			goto fail_free_buffers;
 	}
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 	buffer->cpu_notify.notifier_call = rb_cpu_notify;
 	buffer->cpu_notify.priority = 0;
 	register_cpu_notifier(&buffer->cpu_notify);
@@ -562,7 +562,7 @@ ring_buffer_free(struct ring_buffer *buffer)
 
 	get_online_cpus();
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 	unregister_cpu_notifier(&buffer->cpu_notify);
 #endif
 
@@ -2757,7 +2757,7 @@ static __init int rb_init_debugfs(void)
 
 fs_initcall(rb_init_debugfs);
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit rb_cpu_notify(struct notifier_block *self,
 				   unsigned long action, void *hcpu)
 {
-- 
cgit v1.1


From 8aabee573dff131a085c63de7667eacd94ba4ccb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 13:13:49 -0400
Subject: ring-buffer: remove unneeded get_online_cpus

Impact: speed up and remove possible races

The get_online_cpus was added to the ring buffer because the original
design would free the ring buffer on a CPU that was being taken
off line. The final design kept the ring buffer around even when the
CPU was taken off line. This is to allow a user to still read the
information on that ring buffer.

Most of the get_online_cpus are no longer needed since the ring buffer will
not disappear from the use cases.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 90 ++++++++--------------------------------------
 1 file changed, 14 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 035b56c..2c36be9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1561,15 +1561,11 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_inc(&cpu_buffer->record_disabled);
- out:
-	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 
@@ -1585,15 +1581,11 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_dec(&cpu_buffer->record_disabled);
- out:
-	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
@@ -1605,17 +1597,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long ret = 0;
-
-	get_online_cpus();
+	unsigned long ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
 	ret = cpu_buffer->entries;
- out:
-	put_online_cpus();
 
 	return ret;
 }
@@ -1629,17 +1617,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long ret = 0;
-
-	get_online_cpus();
+	unsigned long ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
 	ret = cpu_buffer->overrun;
- out:
-	put_online_cpus();
 
 	return ret;
 }
@@ -1658,16 +1642,12 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	unsigned long entries = 0;
 	int cpu;
 
-	get_online_cpus();
-
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		entries += cpu_buffer->entries;
 	}
 
-	put_online_cpus();
-
 	return entries;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries);
@@ -1685,16 +1665,12 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 	unsigned long overruns = 0;
 	int cpu;
 
-	get_online_cpus();
-
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		overruns += cpu_buffer->overrun;
 	}
 
-	put_online_cpus();
-
 	return overruns;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overruns);
@@ -2093,21 +2069,16 @@ struct ring_buffer_event *
 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	struct ring_buffer_event *event = NULL;
+	struct ring_buffer_event *event;
 	unsigned long flags;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return NULL;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
- out:
-	put_online_cpus();
-
 	return event;
 }
 
@@ -2189,17 +2160,15 @@ struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct ring_buffer_iter *iter = NULL;
+	struct ring_buffer_iter *iter;
 	unsigned long flags;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return NULL;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
-		goto out;
+		return NULL;
 
 	cpu_buffer = buffer->buffers[cpu];
 
@@ -2214,9 +2183,6 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
- out:
-	put_online_cpus();
-
 	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -2309,13 +2275,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	unsigned long flags;
-	int resched;
-
-	/* Can't use get_online_cpus because this can be in atomic */
-	resched = ftrace_preempt_disable();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
@@ -2326,8 +2288,6 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- out:
-	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
@@ -2337,16 +2297,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  */
 void ring_buffer_reset(struct ring_buffer *buffer)
 {
-	int resched;
 	int cpu;
 
-	/* Can't use get_online_cpus because this can be in atomic */
-	resched = ftrace_preempt_disable();
-
 	for_each_buffer_cpu(buffer, cpu)
 		ring_buffer_reset_cpu(buffer, cpu);
-
-	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset);
 
@@ -2359,8 +2313,6 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu;
 
-	get_online_cpus();
-
 	/* yes this is racy, but if you don't like the race, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
@@ -2368,8 +2320,6 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 			return 0;
 	}
 
-	put_online_cpus();
-
 	return 1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2382,18 +2332,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	int ret = 1;
-
-	get_online_cpus();
+	int ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return 1;
 
 	cpu_buffer = buffer->buffers[cpu];
 	ret = rb_per_cpu_empty(cpu_buffer);
 
- out:
-	put_online_cpus();
 
 	return ret;
 }
@@ -2416,8 +2362,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	struct ring_buffer_per_cpu *cpu_buffer_b;
 	int ret = -EINVAL;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
 	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
 		goto out;
@@ -2466,8 +2410,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 
 	ret = 0;
 out:
-	put_online_cpus();
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -2583,8 +2525,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	u64 save_timestamp;
 	int ret = -1;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		goto out;
 
@@ -2681,8 +2621,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
  out:
-	put_online_cpus();
-
 	return ret;
 }
 
-- 
cgit v1.1


From db526ca329f855510e8ce672332eba3304aed590 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 13:53:25 -0400
Subject: tracing: show that buffer size is not expanded

Impact: do not confuse user on small trace buffer sizes

When the system boots up, the trace buffer is small to conserve memory.
It is only two pages per online CPU. When the tracer is used, it expands
to the default value.

This can confuse the user if they look at the buffer size and see only
7, but then later they see 1408.

 # cat /debug/tracing/buffer_size_kb
7

 # echo sched_switch > /debug/tracing/current_tracer

 # cat /debug/tracing/buffer_size_kb
1408

This patch tries to help remove this confustion by showing that the
buffer has not been expanded.

 # cat /debug/tracing/buffer_size_kb
7 (expanded: 1408)

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 04ab824..62a63b2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2948,10 +2948,18 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
-	char buf[64];
+	char buf[96];
 	int r;
 
-	r = sprintf(buf, "%lu\n", tr->entries >> 10);
+	mutex_lock(&trace_types_lock);
+	if (!ring_buffer_expanded)
+		r = sprintf(buf, "%lu (expanded: %lu)\n",
+			    tr->entries >> 10,
+			    trace_buf_size >> 10);
+	else
+		r = sprintf(buf, "%lu\n", tr->entries >> 10);
+	mutex_unlock(&trace_types_lock);
+
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-- 
cgit v1.1


From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 12 Mar 2009 18:24:49 +0100
Subject: tracing/core: bring back raw trace_printk for dynamic formats strings

Impact: fix callsites with dynamic format strings

Since its new binary implementation, trace_printk() internally uses static
containers for the format strings on each callsites. But the value is
assigned once at build time, which means that it can't take dynamic
formats.

So this patch unearthes the raw trace_printk implementation for the callers
that will need trace_printk to be able to carry these dynamic format
strings. The trace_printk() macro will use the appropriate implementation
for each callsite. Most of the time however, the binary implementation will
still be used.

The other impact of this patch is that mmiotrace_printk() will use the old
implementation because it calls the low level trace_vprintk and we can't
guess here whether the format passed in it is dynamic or not.

Some parts of this patch have been written by Steven Rostedt (most notably
the part that chooses the appropriate implementation for each callsites).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c                 | 85 +++++++++++++++++++++++++++++++++---
 kernel/trace/trace.h                 | 13 +++++-
 kernel/trace/trace_event_types.h     | 11 ++++-
 kernel/trace/trace_functions_graph.c |  6 +--
 kernel/trace/trace_mmiotrace.c       |  7 +--
 kernel/trace/trace_output.c          | 57 +++++++++++++++++++++---
 kernel/trace/trace_printk.c          | 33 +++++++++++---
 7 files changed, 186 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62a63b2..dbb077d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 
 
 /**
- * trace_vprintk - write binary msg to tracing buffer
+ * trace_vbprintk - write binary msg to tracing buffer
  *
  */
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
 	static raw_spinlock_t trace_buf_lock =
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
-	struct print_entry *entry;
+	struct bprint_entry *entry;
 	unsigned long flags;
 	int resched;
 	int cpu, len = 0, size, pc;
@@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out_unlock;
 
 	size = sizeof(*entry) + sizeof(u32) * len;
-	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc);
+	event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc);
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
@@ -1240,6 +1240,60 @@ out:
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
+int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+{
+	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
+	static char trace_buf[TRACE_BUF_SIZE];
+
+	struct ring_buffer_event *event;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	int cpu, len = 0, size, pc;
+	struct print_entry *entry;
+	unsigned long irq_flags;
+
+	if (tracing_disabled || tracing_selftest_running)
+		return 0;
+
+	pc = preempt_count();
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(atomic_read(&data->disabled)))
+		goto out;
+
+	pause_graph_tracing();
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&trace_buf_lock);
+	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+	len = min(len, TRACE_BUF_SIZE-1);
+	trace_buf[len] = 0;
+
+	size = sizeof(*entry) + len + 1;
+	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
+	if (!event)
+		goto out_unlock;
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->depth			= depth;
+
+	memcpy(&entry->buf, trace_buf, len);
+	entry->buf[len] = 0;
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+ out_unlock:
+	__raw_spin_unlock(&trace_buf_lock);
+	raw_local_irq_restore(irq_flags);
+	unpause_graph_tracing();
+ out:
+	preempt_enable_notrace();
+
+	return len;
+}
 EXPORT_SYMBOL_GPL(trace_vprintk);
 
 enum trace_file_type {
@@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct bprint_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_bprintf(s, field->fmt, field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_bprintf(s, field->fmt, field->buf);
+	ret = trace_seq_printf(s, "%s", field->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 			return ret;
 	}
 
+	if (iter->ent->type == TRACE_BPRINT &&
+			trace_flags & TRACE_ITER_PRINTK &&
+			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+		return print_bprintk_msg_only(iter);
+
 	if (iter->ent->type == TRACE_PRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 336324d..cede1ab 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,6 +20,7 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_PRINT,
+	TRACE_BPRINT,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -117,7 +118,7 @@ struct userstack_entry {
 /*
  * trace_printk entry:
  */
-struct print_entry {
+struct bprint_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
 	int			depth;
@@ -125,6 +126,13 @@ struct print_entry {
 	u32			buf[];
 };
 
+struct print_entry {
+	struct trace_entry	ent;
+	unsigned long		ip;
+	int			depth;
+	char			buf[];
+};
+
 #define TRACE_OLD_SIZE		88
 
 struct trace_field_cont {
@@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
+		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
@@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 extern void *head_page(struct trace_array_cpu *data);
 extern long ns2usecs(cycle_t nsec);
 extern int
+trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+extern int
 trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5cca4c9..d0907d7 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
-TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned int, depth, depth)
@@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
 );
 
+TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(unsigned int, depth, depth)
+		TRACE_FIELD_ZERO_CHAR(buf)
+	),
+	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
+);
+
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned int, line, line)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8566c14..4c38860 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 }
 
 static enum print_line_t
-print_graph_comment(struct print_entry *trace, struct trace_seq *s,
+print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 		   struct trace_entry *ent, struct trace_iterator *iter)
 {
 	int i;
@@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 		return print_graph_return(&field->ret, s, entry, iter);
 	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
+	case TRACE_BPRINT: {
+		struct bprint_entry *field;
 		trace_assign_type(field, entry);
 		return print_graph_comment(field, s, entry, iter);
 	}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 23e346a..f095916 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 {
 	struct trace_entry *entry = iter->ent;
 	struct print_entry *print = (struct print_entry *)entry;
+	const char *msg		= print->buf;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
 	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
@@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 	int ret;
 
 	/* The trailing newline must be in the message. */
-	ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ret = trace_seq_bprintf(s, print->fmt, print->buf);
+	ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 491832a..ea9d3b4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = {
 	.binary		= trace_special_bin,
 };
 
-/* TRACE_PRINT */
+/* TRACE_BPRINT */
 static enum print_line_t
-trace_print_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags)
 {
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *s = &iter->seq;
-	struct print_entry *field;
+	struct bprint_entry *field;
 
 	trace_assign_type(field, entry);
 
@@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags)
 }
 
 
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t
+trace_bprint_raw(struct trace_iterator *iter, int flags)
 {
-	struct print_entry *field;
+	struct bprint_entry *field;
 	struct trace_seq *s = &iter->seq;
 
 	trace_assign_type(field, iter->ent);
@@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 }
 
 
+static struct trace_event trace_bprint_event = {
+	.type		= TRACE_BPRINT,
+	.trace		= trace_bprint_print,
+	.raw		= trace_bprint_raw,
+};
+
+/* TRACE_PRINT */
+static enum print_line_t trace_print_print(struct trace_iterator *iter,
+					   int flags)
+{
+	struct print_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (!trace_seq_printf(s, ": %s", field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
 static struct trace_event trace_print_event = {
-	.type		= TRACE_PRINT,
+	.type	 	= TRACE_PRINT,
 	.trace		= trace_print_print,
 	.raw		= trace_print_raw,
 };
 
+
 static struct trace_event *events[] __initdata = {
 	&trace_fn_event,
 	&trace_ctx_event,
@@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_special_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
+	&trace_bprint_event,
 	&trace_print_event,
 	NULL
 };
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a50aea2..f307a11e 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
 	.notifier_call = module_trace_bprintk_format_notify,
 };
 
-int __trace_printk(unsigned long ip, const char *fmt, ...)
+int __trace_bprintk(unsigned long ip, const char *fmt, ...)
  {
 	int ret;
 	va_list ap;
@@ -111,13 +111,13 @@ int __trace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
 	va_end(ap);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__trace_printk);
+EXPORT_SYMBOL_GPL(__trace_bprintk);
 
-int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
  {
 	if (unlikely(!fmt))
 		return 0;
@@ -125,11 +125,34 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 	if (!(trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
+	return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
+
+int __trace_printk(unsigned long ip, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_printk);
+
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+{
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
 	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
-
 static __init int init_trace_printk(void)
 {
 	return register_module_notifier(&module_trace_bprintk_format_nb);
-- 
cgit v1.1


From 828275574e0161bdddb5817d4bd76a0265ef0470 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 14:14:31 -0400
Subject: tracing: make bprint event use the proper event id

The bprint record is using TRACE_PRINT when it should be TRACE_BPRINT.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_event_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index d0907d7..0199150 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
-TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore,
+TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned int, depth, depth)
-- 
cgit v1.1


From e9fb2b6d5845e24f104713591286b6f39761c027 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 14:19:25 -0400
Subject: tracing: have event_trace_printk use static tracer

Impact: speed up on event tracing

The event_trace_printk is currently a wrapper function that calls
trace_vprintk. Because it uses a variable for the fmt it misses out
on the optimization of using the binary printk.

This patch makes event_trace_printk into a macro wrapper to use the
fmt as the same as the trace_printks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h        | 17 +++++++++++++++++
 kernel/trace/trace_events.c | 10 ----------
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cede1ab..35cfa7b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -773,4 +773,21 @@ void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
+extern const char *__start___trace_bprintk_fmt[];
+extern const char *__stop___trace_bprintk_fmt[];
+
+#define event_trace_printk(ip, fmt, args...)				\
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ca624df..238ea95 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,16 +24,6 @@ static DEFINE_MUTEX(event_mutex);
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-void event_trace_printk(unsigned long ip, const char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	tracing_record_cmdline(current);
-	trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-	va_end(ap);
-}
-
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;
-- 
cgit v1.1


From 7975a2be16dd42df2cab80c80cb6ece382edb6ec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 14:23:17 -0400
Subject: tracing: export trace formats to user space

The binary printk saves a pointer to the format string in the ring buffer.
On output, the format is processed. But if the user is reading the
ring buffer through a binary interface, the pointer is meaningless.

This patch creates a file called printk_formats that maps the pointers
to the formats.

 # cat /debug/tracing/printk_formats
0xffffffff80713d40 : "irq_handler_entry: irq=%d handler=%s\n"
0xffffffff80713d48 : "lock_acquire: %s%s%s\n"
0xffffffff80713d50 : "lock_release: %s\n"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_printk.c | 119 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f307a11e..4867852 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -4,18 +4,19 @@
  * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
  *
  */
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
 #include <linux/string.h>
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/mutex.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
-#include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
 #include <linux/fs.h>
-#include <linux/marker.h>
-#include <linux/uaccess.h>
 
 #include "trace.h"
 
@@ -153,6 +154,114 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	const char **fmt = m->private;
+	const char **next = fmt;
+
+	(*pos)++;
+
+	if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
+		return NULL;
+
+	next = fmt;
+	m->private = ++next;
+
+	return fmt;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	return t_next(m, NULL, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	const char **fmt = v;
+	const char *str = *fmt;
+	int i;
+
+	seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
+
+	/*
+	 * Tabs and new lines need to be converted.
+	 */
+	for (i = 0; str[i]; i++) {
+		switch (str[i]) {
+		case '\n':
+			seq_puts(m, "\\n");
+			break;
+		case '\t':
+			seq_puts(m, "\\t");
+			break;
+		case '\\':
+			seq_puts(m, "\\");
+			break;
+		case '"':
+			seq_puts(m, "\\\"");
+			break;
+		default:
+			seq_putc(m, str[i]);
+		}
+	}
+	seq_puts(m, "\"\n");
+
+	return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations show_format_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static int
+ftrace_formats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &show_format_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = __start___trace_bprintk_fmt;
+	}
+	return ret;
+}
+
+static const struct file_operations ftrace_formats_fops = {
+	.open = ftrace_formats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static __init int init_trace_printk_function_export(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+				    NULL, &ftrace_formats_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'printk_formats' entry\n");
+
+	return 0;
+}
+
+fs_initcall(init_trace_printk_function_export);
+
 static __init int init_trace_printk(void)
 {
 	return register_module_notifier(&module_trace_bprintk_format_nb);
-- 
cgit v1.1


From 2da03ecee6308ea174e8a02b92a3c4ec92e886c8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 18:57:51 -0400
Subject: tracing: fix stack tracer header

The stack tracer use to look like this:

 # cat /debug/tracing/stack_trace
         Depth  Size      Location    (57 entries)
         -----  ----      --------
  0)     5088      16   mempool_alloc_slab+0x16/0x18
  1)     5072     144   mempool_alloc+0x4d/0xfe
  2)     4928      16   scsi_sg_alloc+0x48/0x4a [scsi_mod]

Now it looks like this:

 # cat /debug/tracing/stack_trace

        Depth    Size      Location    (57 entries)
        -----    ----      --------
  0)     5088      16   mempool_alloc_slab+0x16/0x18
  1)     5072     144   mempool_alloc+0x4d/0xfe
  2)     4928      16   scsi_sg_alloc+0x48/0x4a [scsi_mod]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc..4564fd9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -251,9 +251,9 @@ static int t_show(struct seq_file *m, void *v)
 	int size;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(m, "        Depth   Size      Location"
+		seq_printf(m, "        Depth    Size      Location"
 			   "    (%d entries)\n"
-			   "        -----   ----      --------\n",
+			   "        -----    ----      --------\n",
 			   max_stack_trace.nr_entries);
 		return 0;
 	}
-- 
cgit v1.1


From e447e1df2e568cd43d1918963c9f09fae85aea57 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 19:42:29 -0400
Subject: tracing: explain why stack tracer is empty

If the stack tracing is disabled (by default) the stack_trace file
will only contain the header:

 # cat /debug/tracing/stack_trace
        Depth    Size      Location    (0 entries)
        -----    ----      --------

This can be frustrating to a developer that does not realize that the
stack tracer is disabled. This patch adds the following text:

  # cat /debug/tracing/stack_trace
        Depth    Size      Location    (0 entries)
        -----    ----      --------
 #
 #  Stack tracer disabled
 #
 # To enable the stack tracer, either add 'stacktrace' to the
 # kernel command line
 # or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'
 #

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stack.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4564fd9..91ccbf3 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -245,6 +245,17 @@ static int trace_lookup_stack(struct seq_file *m, long i)
 #endif
 }
 
+static void print_disabled(struct seq_file *m)
+{
+	seq_puts(m, "#\n"
+		 "#  Stack tracer disabled\n"
+		 "#\n"
+		 "# To enable the stack tracer, either add 'stacktrace' to the\n"
+		 "# kernel command line\n"
+		 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
+		 "#\n");
+}
+
 static int t_show(struct seq_file *m, void *v)
 {
 	long i;
@@ -255,6 +266,10 @@ static int t_show(struct seq_file *m, void *v)
 			   "    (%d entries)\n"
 			   "        -----    ----      --------\n",
 			   max_stack_trace.nr_entries);
+
+		if (!stack_tracer_enabled && !max_stack_size)
+			print_disabled(m);
+
 		return 0;
 	}
 
-- 
cgit v1.1


From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 12 Mar 2009 14:33:36 -0400
Subject: tracing: tracepoints for softirq entry/exit - add softirq-to-name
 array

Create a 'softirq_to_name' array, which is indexed by softirq #, so
that we can easily convert between the softirq index # and its name, in
order to get more meaningful output messages.

LKML-Reference: <20090312183336.GB3352@redhat.com>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/softirq.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7571bcb..9f90fdc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
+char *softirq_to_name[NR_SOFTIRQS] = {
+	"HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ",
+	"BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ",
+	"RCU_SOFTIRQ"
+};
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -209,9 +215,10 @@ restart:
 			h->action(h);
 
 			if (unlikely(prev_count != preempt_count())) {
-				printk(KERN_ERR "huh, entered softirq %td %p"
+				printk(KERN_ERR "huh, entered softirq %td %s %p"
 				       "with preempt_count %08x,"
 				       " exited with %08x?\n", h - softirq_vec,
+				       softirq_to_name[h - softirq_vec],
 				       h->action, prev_count, preempt_count());
 				preempt_count() = prev_count;
 			}
-- 
cgit v1.1


From 39842323ceb368d2ea36ab7696aedbe296e13b61 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 12 Mar 2009 14:36:03 -0400
Subject: tracing: tracepoints for softirq entry/exit - tracepoints

Introduce softirq entry/exit tracepoints. These are useful for
augmenting existing tracers, and to figure out softirq frequencies and
timings.

[
  s/irq_softirq_/softirq_/ for trace point names and
  Fixed printf format in TRACE_FORMAT macro
   - Steven Rostedt
]

LKML-Reference: <20090312183603.GC3352@redhat.com>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/softirq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f90fdc..a5e8123 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,7 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
+#include <trace/irq.h>
 
 #include <asm/irq.h>
 /*
@@ -186,6 +187,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
+DEFINE_TRACE(softirq_entry);
+DEFINE_TRACE(softirq_exit);
+
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;
@@ -212,8 +216,9 @@ restart:
 		if (pending & 1) {
 			int prev_count = preempt_count();
 
+			trace_softirq_entry(h, softirq_vec);
 			h->action(h);
-
+			trace_softirq_exit(h, softirq_vec);
 			if (unlikely(prev_count != preempt_count())) {
 				printk(KERN_ERR "huh, entered softirq %td %s %p"
 				       "with preempt_count %08x,"
-- 
cgit v1.1


From 889a6c367283709a80dad9413488472596a1a1d2 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 13 Mar 2009 09:03:04 +0900
Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer fix

commit c3ffc7a40b7e94b094efe1c8ab4e24370a782b65
"Don't use tracing_record_cmdline() in workqueue tracer"
has a race window.

find_task_by_vpid() requires task_list_lock().

LKML-Reference: <20090313090042.43CD.A69D9226@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_workqueue.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index fb5ccac..9ab035b 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -193,12 +193,20 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct cpu_workqueue_stats *cws = p;
 	unsigned long flags;
 	int cpu = cws->cpu;
-	struct task_struct *tsk = find_task_by_vpid(cws->pid);
-
-	seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
-		   atomic_read(&cws->inserted),
-		   cws->executed,
-		   tsk ? tsk->comm : "<...>");
+	struct pid *pid;
+	struct task_struct *tsk;
+
+	pid = find_get_pid(cws->pid);
+	if (pid) {
+		tsk = get_pid_task(pid, PIDTYPE_PID);
+		if (tsk) {
+			seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
+				   atomic_read(&cws->inserted), cws->executed,
+				   tsk->comm);
+			put_task_struct(tsk);
+		}
+		put_pid(pid);
+	}
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-- 
cgit v1.1


From f28e55765e40450c127e44d00ae65d0cd1a4efec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 22:00:19 -0400
Subject: tracing: show event name in trace for TRACE_EVENT created events

Unlike TRACE_FORMAT() macros, the TRACE_EVENT() macros do not show
the event name in the trace file. Knowing the event type in the trace
output is very useful.

Instead of:

   task swapper:0 [140] ==> ntpd:3308 [120]

We now have:

   sched_switch: task swapper:0 [140] ==> ntpd:3308 [120]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index ca347af..5117c43 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -57,7 +57,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	ret = trace_seq_printf(s, print);				\
+	ret = trace_seq_printf(s, #call ": " print);			\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
-- 
cgit v1.1


From 5cc985488845ec7227a2c5cfd2fd62cf57fb411a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 22:24:17 -0400
Subject: ring-buffer: document reader page design

In a private email conversation I explained how the ring buffer
page worked by using silly ASCII art. Ingo suggested that I add
that to the comments of the code.

Here it is.

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2c36be9..58128ad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,74 @@
 #include "trace.h"
 
 /*
+ * The ring buffer is made up of a list of pages. A separate list of pages is
+ * allocated for each CPU. A writer may only write to a buffer that is
+ * associated with the CPU it is currently executing on.  A reader may read
+ * from any per cpu buffer.
+ *
+ * The reader is special. For each per cpu buffer, the reader has its own
+ * reader page. When a reader has read the entire reader page, this reader
+ * page is swapped with another page in the ring buffer.
+ *
+ * Now, as long as the writer is off the reader page, the reader can do what
+ * ever it wants with that page. The writer will never write to that page
+ * again (as long as it is out of the ring buffer).
+ *
+ * Here's some silly ASCII art.
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |
+ *   +------+        +---+   +---+   +---+
+ *                   |   |-->|   |-->|   |
+ *                   +---+   +---+   +---+
+ *                     ^               |
+ *                     |               |
+ *                     +---------------+
+ *
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *                   |   |-->|   |-->|   |
+ *                   +---+   +---+   +---+
+ *                     ^               |
+ *                     |               |
+ *                     +---------------+
+ *
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *      ^            |   |-->|   |-->|   |
+ *      |            +---+   +---+   +---+
+ *      |                              |
+ *      |                              |
+ *      +------------------------------+
+ *
+ *
+ *   +------+
+ *   |buffer|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *      ^            |   |   |   |-->|   |
+ *      |   New      +---+   +---+   +---+
+ *      |  Reader------^               |
+ *      |   page                       |
+ *      +------------------------------+
+ *
+ *
+ * After we make this swap, the reader can hand this page off to the splice
+ * code and be done with it. It can even allocate a new page if it needs to
+ * and swap that into the ring buffer.
+ *
+ * We will be using cmpxchg soon to make all this lockless.
+ *
+ */
+
+/*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
  * prevents all ring buffers from being recorded to.
-- 
cgit v1.1


From eb1871f34358024acfa3523ef375ef14b7527173 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:00:58 -0400
Subject: tracing: left align location header in stack_trace

Ingo Molnar suggested, instead of:

        Depth    Size      Location    (27 entries)
        -----    ----      --------
  0)     2880      48   lock_timer_base+0x2b/0x4f
  1)     2832      80   __mod_timer+0x33/0xe0
  2)     2752      16   __ide_set_handler+0x63/0x65

To have it be:

        Depth    Size   Location    (27 entries)
        -----    ----   --------
  0)     2880      48   lock_timer_base+0x2b/0x4f
  1)     2832      80   __mod_timer+0x33/0xe0
  2)     2752      16   __ide_set_handler+0x63/0x65

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 91ccbf3..c750f65 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -262,9 +262,9 @@ static int t_show(struct seq_file *m, void *v)
 	int size;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(m, "        Depth    Size      Location"
+		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
-			   "        -----    ----      --------\n",
+			   "        -----    ----   --------\n",
 			   max_stack_trace.nr_entries);
 
 		if (!stack_tracer_enabled && !max_stack_size)
-- 
cgit v1.1


From bdc067582b8b71c7771bab076bbc51569c594fb4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:12:52 -0400
Subject: tracing: add comment for use of double __builtin_consant_p

Impact: documentation

The use of the double __builtin_contant_p checks in the event_trace_printk
can be confusing to developers and reviewers. This patch adds a comment
to explain why it is there.

Requested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
LKML-Reference: <20090313122235.43EB.A69D9226@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 35cfa7b..67595b8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -776,6 +776,11 @@ extern struct ftrace_event_call __stop_ftrace_events[];
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
 #define event_trace_printk(ip, fmt, args...)				\
 do {									\
 	__trace_printk_check_format(fmt, ##args);			\
-- 
cgit v1.1


From c69fc56de1df5769f2ec69c915c7ad5afe63804c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Mar 2009 14:49:46 +1030
Subject: cpumask: use topology_core_cpumask/topology_thread_cpumask instead of
 cpu_core_map/cpu_sibling_map

Impact: cleanup

This is presumably what those definitions are for, and while all archs
define cpu_core_map/cpu_sibling map, that's changing (eg. x86 wants to
change it to a pointer).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0a76d0b..5dabd80 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7249,7 +7249,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 {
 	int group;
 
-	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
@@ -7278,7 +7278,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #else
 	group = cpu;
@@ -7621,7 +7621,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		cpumask_and(sched_domain_span(sd),
-			    &per_cpu(cpu_sibling_map, i), cpu_map);
+			    topology_thread_cpumask(i), cpu_map);
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7632,7 +7632,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
 		cpumask_and(this_sibling_map,
-			    &per_cpu(cpu_sibling_map, i), cpu_map);
+			    topology_thread_cpumask(i), cpu_map);
 		if (i != cpumask_first(this_sibling_map))
 			continue;
 
-- 
cgit v1.1


From 7f96f93f02b7637491a1637dee12dcdcd40b9802 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:37:42 -0400
Subject: tracing: move binary buffers into per cpu directory

The binary_buffers directory in /debugfs/tracing held the files
to read the trace buffers in a binary format. This held one file
per CPU buffer. But we also have a per_cpu directory that holds
a way to read the pretty-print formats.

This patch moves the binary buffers into the per_cpu_directory:

 # ls /debug/tracing/per_cpu/cpu1/
trace  trace_pipe  trace_pipe_raw

The new name is called "trace_pipe_raw". The binary buffers always
acted similar to trace_pipe, except that they produce raw data.

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dbb077d..efe3202 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3543,6 +3543,11 @@ static void tracing_init_debugfs_percpu(long cpu)
 				(void *) cpu, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
+				    (void *) cpu, &tracing_buffers_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3826,7 +3831,6 @@ static __init void create_trace_options_dir(void)
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *buffers;
 	struct dentry *entry;
 	int cpu;
 
@@ -3899,26 +3903,6 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'trace_marker' entry\n");
 
-	buffers = debugfs_create_dir("binary_buffers", d_tracer);
-
-	if (!buffers)
-		pr_warning("Could not create buffers directory\n");
-	else {
-		int cpu;
-		char buf[64];
-
-		for_each_tracing_cpu(cpu) {
-			sprintf(buf, "%d", cpu);
-
-			entry = debugfs_create_file(buf, 0444, buffers,
-						    (void *)(long)cpu,
-						    &tracing_buffers_fops);
-			if (!entry)
-				pr_warning("Could not create debugfs buffers "
-					   "'%s' entry\n", buf);
-		}
-	}
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 				    &ftrace_update_tot_cnt,
-- 
cgit v1.1


From 899039e8746bb9a09b6487ddb8ab2275ce9d0256 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:43:33 -0400
Subject: softirq: no need to have SOFTIRQ in softirq name

Impact: clean up

It is redundant to have 'SOFTIRQ' in the softirq names.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/softirq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index a5e8123..65ff3e3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -55,9 +55,8 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
-	"HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ",
-	"BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ",
-	"RCU_SOFTIRQ"
+	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
+	"TASKLET", "SCHED", "HRTIMER",	"RCU"
 };
 
 /*
-- 
cgit v1.1


From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Mar 2009 05:52:59 +0100
Subject: tracing/ftrace: syscall tracing infrastructure, basics

Provide basic callbacks to do syscall tracing.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com>
[ simplified it to a trace_printk() for now. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig          |  10 ++++
 kernel/trace/Makefile         |   1 +
 kernel/trace/trace.h          |   2 +
 kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 126 insertions(+)
 create mode 100644 kernel/trace/trace_syscalls.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8e4a2a6..95a0ad1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD
 config HAVE_HW_BRANCH_TRACER
 	bool
 
+config HAVE_FTRACE_SYSCALLS
+	bool
+
 config TRACER_MAX_TRACE
 	bool
 
@@ -175,6 +178,13 @@ config EVENT_TRACER
 	  allowing the user to pick and choose which trace point they
 	  want to trace.
 
+config FTRACE_SYSCALLS
+	bool "Trace syscalls"
+	depends on HAVE_FTRACE_SYSCALLS
+	select TRACING
+	help
+	  Basic tracer to catch the syscall entry and exit events.
+
 config BOOT_TRACER
 	bool "Trace boot initcalls"
 	select TRACING
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c7a2943..c3feea0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5e1d88..3d49daa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -30,6 +30,8 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_HW_BRANCHES,
+	TRACE_SYSCALL_ENTER,
+	TRACE_SYSCALL_EXIT,
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 0000000..66cf974
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,113 @@
+#include <linux/ftrace.h>
+#include <linux/kernel.h>
+
+#include <asm/syscall.h>
+
+#include "trace_output.h"
+#include "trace.h"
+
+static atomic_t refcount;
+
+void start_ftrace_syscalls(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	if (atomic_inc_return(&refcount) != 1)
+		goto out;
+
+	read_lock_irqsave(&tasklist_lock, flags);
+
+	do_each_thread(g, t) {
+		set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
+	} while_each_thread(g, t);
+
+	read_unlock_irqrestore(&tasklist_lock, flags);
+out:
+	atomic_dec(&refcount);
+}
+
+void stop_ftrace_syscalls(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	if (atomic_dec_return(&refcount))
+		goto out;
+
+	read_lock_irqsave(&tasklist_lock, flags);
+
+	do_each_thread(g, t) {
+		clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
+	} while_each_thread(g, t);
+
+	read_unlock_irqrestore(&tasklist_lock, flags);
+out:
+	atomic_inc(&refcount);
+}
+
+void ftrace_syscall_enter(struct pt_regs *regs)
+{
+	int syscall_nr;
+
+	syscall_nr = syscall_get_nr(current, regs);
+
+	trace_printk("syscall %d enter\n", syscall_nr);
+}
+
+void ftrace_syscall_exit(struct pt_regs *regs)
+{
+	int syscall_nr;
+
+	syscall_nr = syscall_get_nr(current, regs);
+
+	trace_printk("syscall %d exit\n", syscall_nr);
+}
+
+static int init_syscall_tracer(struct trace_array *tr)
+{
+	start_ftrace_syscalls();
+
+	return 0;
+}
+
+static void reset_syscall_tracer(struct trace_array *tr)
+{
+	stop_ftrace_syscalls();
+}
+
+static struct trace_event syscall_enter_event = {
+	.type		= TRACE_SYSCALL_ENTER,
+};
+
+static struct trace_event syscall_exit_event = {
+	.type		= TRACE_SYSCALL_EXIT,
+};
+
+static struct tracer syscall_tracer __read_mostly = {
+	.name		= "syscall",
+	.init		= init_syscall_tracer,
+	.reset		= reset_syscall_tracer
+};
+
+__init int register_ftrace_syscalls(void)
+{
+	int ret;
+
+	ret = register_ftrace_event(&syscall_enter_event);
+	if (!ret) {
+		printk(KERN_WARNING "event %d failed to register\n",
+		       syscall_enter_event.type);
+		WARN_ON_ONCE(1);
+	}
+
+	ret = register_ftrace_event(&syscall_exit_event);
+	if (!ret) {
+		printk(KERN_WARNING "event %d failed to register\n",
+		       syscall_exit_event.type);
+		WARN_ON_ONCE(1);
+	}
+
+	return register_tracer(&syscall_tracer);
+}
+device_initcall(register_ftrace_syscalls);
-- 
cgit v1.1


From b00f0b6dc1773b4c8f538503247da050b5ea631b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:14:01 +0800
Subject: ftrace: avoid double-free of dyn_ftrace

If dyn_ftrace is freed before ftrace_release(), ftrace_release()
will free it again and make ftrace_free_records wrong.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: "Steven Rostedt ;" <rostedt@goodmis.org>
LKML-Reference: <49BA23D9.1050900@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d33d306..26c45aa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -356,7 +356,8 @@ void ftrace_release(void *start, unsigned long size)
 
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e))
+		if ((rec->ip >= s) && (rec->ip < e) &&
+		    !(rec->flags & FTRACE_FL_FREE))
 			ftrace_free_rec(rec);
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
-- 
cgit v1.1


From fa9d13cf135efbd454453a53b6299976bea245a9 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:16:34 +0800
Subject: ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED
 rec

Do __ftrace_replace_code for !FTRACE_FL_CONVERTED rec will always
fail, we should ignore this rec.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: "Steven Rostedt ;" <rostedt@goodmis.org>
LKML-Reference: <49BA2472.4060206@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 26c45aa..08f4a62 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -532,11 +532,12 @@ static void ftrace_replace_code(int enable)
 
 	do_for_each_ftrace_rec(pg, rec) {
 		/*
-		 * Skip over free records and records that have
-		 * failed.
+		 * Skip over free records, records that have
+		 * failed and not converted.
 		 */
 		if (rec->flags & FTRACE_FL_FREE ||
-		    rec->flags & FTRACE_FL_FAILED)
+		    rec->flags & FTRACE_FL_FAILED ||
+		    rec->flags & FTRACE_FL_CONVERTED)
 			continue;
 
 		/* ignore updates to this record's mcount site */
@@ -548,7 +549,7 @@ static void ftrace_replace_code(int enable)
 		}
 
 		failed = __ftrace_replace_code(rec, enable);
-		if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+		if (failed) {
 			rec->flags |= FTRACE_FL_FAILED;
 			if ((system_state == SYSTEM_BOOTING) ||
 			    !core_kernel_text(rec->ip)) {
-- 
cgit v1.1


From 641cd4cfcdc71ce01535b31cc4d57d59a1fae1fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 10:47:34 +0100
Subject: generic-ipi: eliminate WARN_ON()s during oops/panic

Do not output smp-call related warnings in the oops/panic codepath.

Reported-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 7ad2262..858baac 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -5,6 +5,7 @@
  */
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
@@ -285,7 +286,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 	this_cpu = get_cpu();
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
 
 	if (cpu == this_cpu) {
 		local_irq_save(flags);
@@ -329,7 +330,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 	csd_lock(data);
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(wait && irqs_disabled());
+	WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
 
 	generic_exec_single(cpu, data, wait);
 }
@@ -365,7 +366,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
 
 	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-- 
cgit v1.1


From ffd71da4e3f323b7673b061e6f7e0d0c12dc2b49 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 10:54:24 +0100
Subject: panic: decrease oops_in_progress only after having done the panic

Impact: eliminate secondary warnings during panic()

We can panic() in a number of difficult, atomic contexts, hence
we use bust_spinlocks(1) in panic() to increase oops_in_progress,
which prevents various debug checks we have in place.

But in practice this protection only covers the first few printk's
done by panic() - it does not cover the later attempt to stop all
other CPUs and kexec(). If a secondary warning triggers in one of
those facilities that can make the panic message scroll off.

So do bust_spinlocks(0) only much later in panic(). (which code
is only reached if panic policy is relaxed that it can return
after a warning message)

Reported-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 32fe4ef..57fb005 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -77,7 +77,6 @@ NORET_TYPE void panic(const char * fmt, ...)
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 	dump_stack();
 #endif
-	bust_spinlocks(0);
 
 	/*
 	 * If we have crashed and we have a crash kernel loaded let it handle
@@ -136,6 +135,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 		mdelay(1);
 		i++;
 	}
+	bust_spinlocks(0);
 }
 
 EXPORT_SYMBOL(panic);
-- 
cgit v1.1


From d1dedb52acd98bd5e13e1ff4c4d045d58bbd16fe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:14:06 +0100
Subject: panic, smp: provide smp_send_stop() wrapper on UP too

Impact: cleanup, no code changed

Remove an ugly #ifdef CONFIG_SMP from panic(), by providing
an smp_send_stop() wrapper on UP too.

LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 57fb005..ca75e81 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -85,14 +85,12 @@ NORET_TYPE void panic(const char * fmt, ...)
 	 */
 	crash_kexec(NULL);
 
-#ifdef CONFIG_SMP
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
 	 * unfortunately means it may not be hardened to work in a panic
 	 * situation.
 	 */
 	smp_send_stop();
-#endif
 
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
-- 
cgit v1.1


From c95dbf27e201587b2a6cf63f8501a0313d9b4801 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:14:06 +0100
Subject: panic: clean up kernel/panic.c

Impact: cleanup, no code changed

Clean up kernel/panic.c some more and make it more consistent.

LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 111 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 59 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index ca75e81..3fd8c5b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
  * This function is used through-out the kernel (including mm and fs)
  * to indicate a major problem.
  */
+#include <linux/debug_locks.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
 #include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/random.h>
 #include <linux/reboot.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
 #include <linux/sysrq.h>
-#include <linux/interrupt.h>
+#include <linux/init.h>
 #include <linux/nmi.h>
-#include <linux/kexec.h>
-#include <linux/debug_locks.h>
-#include <linux/random.h>
-#include <linux/kallsyms.h>
 #include <linux/dmi.h>
 
 int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
  *
  *	This function never returns.
  */
-
 NORET_TYPE void panic(const char * fmt, ...)
 {
-	long i;
 	static char buf[1024];
 	va_list args;
-#if defined(CONFIG_S390)
-	unsigned long caller = (unsigned long) __builtin_return_address(0);
-#endif
+	long i;
 
 	/*
-	 * It's possible to come here directly from a panic-assertion and not
-	 * have preempt disabled. Some functions called from here want
+	 * It's possible to come here directly from a panic-assertion and
+	 * not have preempt disabled. Some functions called from here want
 	 * preempt to be disabled. No point enabling it later though...
 	 */
 	preempt_disable();
@@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
 
 	if (panic_timeout > 0) {
 		/*
-	 	 * Delay timeout seconds before rebooting the machine. 
-		 * We can't use the "normal" timers since we just panicked..
-	 	 */
-		printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+		 * Delay timeout seconds before rebooting the machine.
+		 * We can't use the "normal" timers since we just panicked.
+		 */
+		printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+
 		for (i = 0; i < panic_timeout*1000; ) {
 			touch_nmi_watchdog();
 			i += panic_blink(i);
 			mdelay(1);
 			i++;
 		}
-		/*	This will not be a clean reboot, with everything
-		 *	shutting down.  But if there is a chance of
-		 *	rebooting the system it will be rebooted.
+		/*
+		 * This will not be a clean reboot, with everything
+		 * shutting down.  But if there is a chance of
+		 * rebooting the system it will be rebooted.
 		 */
 		emergency_restart();
 	}
@@ -124,10 +122,15 @@ NORET_TYPE void panic(const char * fmt, ...)
 	}
 #endif
 #if defined(CONFIG_S390)
-	disabled_wait(caller);
+	{
+		unsigned long caller;
+
+		caller = (unsigned long)__builtin_return_address(0);
+		disabled_wait(caller);
+	}
 #endif
 	local_irq_enable();
-	for (i = 0;;) {
+	for (i = 0; ; ) {
 		touch_softlockup_watchdog();
 		i += panic_blink(i);
 		mdelay(1);
@@ -140,23 +143,23 @@ EXPORT_SYMBOL(panic);
 
 
 struct tnt {
-	u8 bit;
-	char true;
-	char false;
+	u8	bit;
+	char	true;
+	char	false;
 };
 
 static const struct tnt tnts[] = {
-	{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
-	{ TAINT_FORCED_MODULE, 'F', ' ' },
-	{ TAINT_UNSAFE_SMP, 'S', ' ' },
-	{ TAINT_FORCED_RMMOD, 'R', ' ' },
-	{ TAINT_MACHINE_CHECK, 'M', ' ' },
-	{ TAINT_BAD_PAGE, 'B', ' ' },
-	{ TAINT_USER, 'U', ' ' },
-	{ TAINT_DIE, 'D', ' ' },
-	{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
-	{ TAINT_WARN, 'W', ' ' },
-	{ TAINT_CRAP, 'C', ' ' },
+	{ TAINT_PROPRIETARY_MODULE,	'P', 'G' },
+	{ TAINT_FORCED_MODULE,		'F', ' ' },
+	{ TAINT_UNSAFE_SMP,		'S', ' ' },
+	{ TAINT_FORCED_RMMOD,		'R', ' ' },
+	{ TAINT_MACHINE_CHECK,		'M', ' ' },
+	{ TAINT_BAD_PAGE,		'B', ' ' },
+	{ TAINT_USER,			'U', ' ' },
+	{ TAINT_DIE,			'D', ' ' },
+	{ TAINT_OVERRIDDEN_ACPI_TABLE,	'A', ' ' },
+	{ TAINT_WARN,			'W', ' ' },
+	{ TAINT_CRAP,			'C', ' ' },
 };
 
 /**
@@ -193,7 +196,8 @@ const char *print_tainted(void)
 		*s = 0;
 	} else
 		snprintf(buf, sizeof(buf), "Not tainted");
-	return(buf);
+
+	return buf;
 }
 
 int test_taint(unsigned flag)
@@ -209,7 +213,8 @@ unsigned long get_taint(void)
 
 void add_taint(unsigned flag)
 {
-	debug_locks = 0; /* can't trust the integrity of the kernel anymore */
+	/* can't trust the integrity of the kernel anymore: */
+	debug_locks = 0;
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
@@ -264,8 +269,8 @@ static void do_oops_enter_exit(void)
 }
 
 /*
- * Return true if the calling CPU is allowed to print oops-related info.  This
- * is a bit racy..
+ * Return true if the calling CPU is allowed to print oops-related info.
+ * This is a bit racy..
  */
 int oops_may_print(void)
 {
@@ -274,20 +279,22 @@ int oops_may_print(void)
 
 /*
  * Called when the architecture enters its oops handler, before it prints
- * anything.  If this is the first CPU to oops, and it's oopsing the first time
- * then let it proceed.
+ * anything.  If this is the first CPU to oops, and it's oopsing the first
+ * time then let it proceed.
  *
- * This is all enabled by the pause_on_oops kernel boot option.  We do all this
- * to ensure that oopses don't scroll off the screen.  It has the side-effect
- * of preventing later-oopsing CPUs from mucking up the display, too.
+ * This is all enabled by the pause_on_oops kernel boot option.  We do all
+ * this to ensure that oopses don't scroll off the screen.  It has the
+ * side-effect of preventing later-oopsing CPUs from mucking up the display,
+ * too.
  *
- * It turns out that the CPU which is allowed to print ends up pausing for the
- * right duration, whereas all the other CPUs pause for twice as long: once in
- * oops_enter(), once in oops_exit().
+ * It turns out that the CPU which is allowed to print ends up pausing for
+ * the right duration, whereas all the other CPUs pause for twice as long:
+ * once in oops_enter(), once in oops_exit().
  */
 void oops_enter(void)
 {
-	debug_locks_off(); /* can't trust the integrity of the kernel anymore */
+	/* can't trust the integrity of the kernel anymore: */
+	debug_locks_off();
 	do_oops_enter_exit();
 }
 
-- 
cgit v1.1


From 850a80cfaa5aec3e626eb3736eff890a80e4fa77 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:47:23 +0800
Subject: ftrace: use seq_read

Impact: cleanup

VFS layer has tested the file mode, we do not need test it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49BA2BAB.6010608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 08f4a62..bf78a4c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1120,16 +1120,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 	return ftrace_regex_open(inode, file, 0);
 }
 
-static ssize_t
-ftrace_regex_read(struct file *file, char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
-{
-	if (file->f_mode & FMODE_READ)
-		return seq_read(file, ubuf, cnt, ppos);
-	else
-		return -EPERM;
-}
-
 static loff_t
 ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
 {
@@ -1882,7 +1872,7 @@ static const struct file_operations ftrace_failures_fops = {
 
 static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
-	.read = ftrace_regex_read,
+	.read = seq_read,
 	.write = ftrace_filter_write,
 	.llseek = ftrace_regex_lseek,
 	.release = ftrace_filter_release,
@@ -1890,7 +1880,7 @@ static const struct file_operations ftrace_filter_fops = {
 
 static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
-	.read = ftrace_regex_read,
+	.read = seq_read,
 	.write = ftrace_notrace_write,
 	.llseek = ftrace_regex_lseek,
 	.release = ftrace_notrace_release,
@@ -1992,16 +1982,6 @@ ftrace_graph_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static ssize_t
-ftrace_graph_read(struct file *file, char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
-{
-	if (file->f_mode & FMODE_READ)
-		return seq_read(file, ubuf, cnt, ppos);
-	else
-		return -EPERM;
-}
-
 static int
 ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 {
@@ -2132,7 +2112,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
 
 static const struct file_operations ftrace_graph_fops = {
 	.open = ftrace_graph_open,
-	.read = ftrace_graph_read,
+	.read = seq_read,
 	.write = ftrace_graph_write,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-- 
cgit v1.1


From e94142a67f8bad494c593f0a07c9fc2fbec98c0e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:51:27 +0800
Subject: ftrace: remove struct list_head from struct dyn_ftrace

Impact: save memory

The struct dyn_ftrace table is very large, this patch will save
about 50%.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49BA2C9F.8020009@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bf78a4c..90d5729 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -272,7 +272,7 @@ enum {
 
 static int ftrace_filtered;
 
-static LIST_HEAD(ftrace_new_addrs);
+static struct dyn_ftrace *ftrace_new_addrs;
 
 static DEFINE_MUTEX(ftrace_regex_lock);
 
@@ -409,8 +409,8 @@ ftrace_record_ip(unsigned long ip)
 		return NULL;
 
 	rec->ip = ip;
-
-	list_add(&rec->list, &ftrace_new_addrs);
+	rec->flags = (unsigned long)ftrace_new_addrs;
+	ftrace_new_addrs = rec;
 
 	return rec;
 }
@@ -716,19 +716,21 @@ unsigned long		ftrace_update_tot_cnt;
 
 static int ftrace_update_code(struct module *mod)
 {
-	struct dyn_ftrace *p, *t;
+	struct dyn_ftrace *p;
 	cycle_t start, stop;
 
 	start = ftrace_now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
 
-	list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
+	while (ftrace_new_addrs) {
 
 		/* If something went wrong, bail without enabling anything */
 		if (unlikely(ftrace_disabled))
 			return -1;
 
-		list_del_init(&p->list);
+		p = ftrace_new_addrs;
+		ftrace_new_addrs = (struct dyn_ftrace *)p->flags;
+		p->flags = 0L;
 
 		/* convert record (i.e, patch mcount-call with NOP) */
 		if (ftrace_code_disable(mod, p)) {
-- 
cgit v1.1


From bed1ffca022cc876fb83161d26670e9b5d3cf36b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 13 Mar 2009 15:42:11 +0100
Subject: tracing/syscalls: core infrastructure for syscalls tracing,
 enhancements

Impact: new feature

This adds the generic support for syscalls tracing. This is
currently exploited through a devoted tracer but other tracing
engines can use it. (They just have to play with
{start,stop}_ftrace_syscalls() and use the display callbacks
unless they want to override them.)

The syscalls prototypes definitions are abused here to steal
some metadata informations:

- syscall name, param types, param names, number of params

The syscall addr is not directly saved during this definition
because we don't know if its prototype is available in the
namespace. But we don't really need it. The arch has just to
build a function able to resolve the syscall number to its
metadata struct.

The current tracer prints the syscall names, parameters names
and values (and their types optionally). Currently the value is
a raw hex but higher level values diplaying is on my TODO list.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1236955332-10133-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h          |  17 +++++
 kernel/trace/trace_syscalls.c | 146 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 155 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3d49daa..d80ca0d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -194,6 +194,19 @@ struct kmemtrace_free_entry {
 	const void *ptr;
 };
 
+struct syscall_trace_enter {
+	struct trace_entry	ent;
+	int			nr;
+	unsigned long		args[];
+};
+
+struct syscall_trace_exit {
+	struct trace_entry	ent;
+	int			nr;
+	unsigned long		ret;
+};
+
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -306,6 +319,10 @@ extern void __ftrace_bad_type(void);
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
 			  TRACE_KMEM_FREE);	\
+		IF_ASSIGN(var, ent, struct syscall_trace_enter,		\
+			  TRACE_SYSCALL_ENTER);				\
+		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
+			  TRACE_SYSCALL_EXIT);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 66cf974..c72e599 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,6 +1,5 @@
-#include <linux/ftrace.h>
 #include <linux/kernel.h>
-
+#include <linux/ftrace.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
@@ -8,6 +7,90 @@
 
 static atomic_t refcount;
 
+/* Our two options */
+enum {
+	TRACE_SYSCALLS_OPT_TYPES = 0x1,
+};
+
+static struct tracer_opt syscalls_opts[] = {
+	{ TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
+	{ }
+};
+
+static struct tracer_flags syscalls_flags = {
+	.val = 0, /* By default: no args types */
+	.opts = syscalls_opts
+};
+
+enum print_line_t
+print_syscall_enter(struct trace_iterator *iter, int flags)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *ent = iter->ent;
+	struct syscall_trace_enter *trace;
+	struct syscall_metadata *entry;
+	int i, ret, syscall;
+
+	trace_assign_type(trace, ent);
+
+	syscall = trace->nr;
+
+	entry = syscall_nr_to_meta(syscall);
+	if (!entry)
+		goto end;
+
+	ret = trace_seq_printf(s, "%s(", entry->name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	for (i = 0; i < entry->nb_args; i++) {
+		/* parameter types */
+		if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
+			ret = trace_seq_printf(s, "%s ", entry->types[i]);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+		/* parameter values */
+		ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
+				       trace->args[i],
+				       i == entry->nb_args - 1 ? ")" : ",");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+end:
+	trace_seq_printf(s, "\n");
+	return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t
+print_syscall_exit(struct trace_iterator *iter, int flags)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *ent = iter->ent;
+	struct syscall_trace_exit *trace;
+	int syscall;
+	struct syscall_metadata *entry;
+	int ret;
+
+	trace_assign_type(trace, ent);
+
+	syscall = trace->nr;
+
+	entry = syscall_nr_to_meta(syscall);
+	if (!entry) {
+		trace_seq_printf(s, "\n");
+		return TRACE_TYPE_HANDLED;
+	}
+
+	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+				trace->ret);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 void start_ftrace_syscalls(void)
 {
 	unsigned long flags;
@@ -16,6 +99,7 @@ void start_ftrace_syscalls(void)
 	if (atomic_inc_return(&refcount) != 1)
 		goto out;
 
+	arch_init_ftrace_syscalls();
 	read_lock_irqsave(&tasklist_lock, flags);
 
 	do_each_thread(g, t) {
@@ -48,20 +132,63 @@ out:
 
 void ftrace_syscall_enter(struct pt_regs *regs)
 {
+	struct syscall_trace_enter *entry;
+	struct syscall_metadata *sys_data;
+	struct ring_buffer_event *event;
+	int size;
 	int syscall_nr;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	trace_printk("syscall %d enter\n", syscall_nr);
+	cpu = raw_smp_processor_id();
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+
+	event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
+							0, 0);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+
+	trace_current_buffer_unlock_commit(event, 0, 0);
+	trace_wake_up();
 }
 
 void ftrace_syscall_exit(struct pt_regs *regs)
 {
+	struct syscall_trace_exit *entry;
+	struct syscall_metadata *sys_data;
+	struct ring_buffer_event *event;
 	int syscall_nr;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	trace_printk("syscall %d exit\n", syscall_nr);
+	cpu = raw_smp_processor_id();
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
+				sizeof(*entry), 0, 0);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->nr = syscall_nr;
+	entry->ret = syscall_get_return_value(current, regs);
+
+	trace_current_buffer_unlock_commit(event, 0, 0);
+	trace_wake_up();
 }
 
 static int init_syscall_tracer(struct trace_array *tr)
@@ -77,17 +204,20 @@ static void reset_syscall_tracer(struct trace_array *tr)
 }
 
 static struct trace_event syscall_enter_event = {
-	.type		= TRACE_SYSCALL_ENTER,
+	.type	 	= TRACE_SYSCALL_ENTER,
+	.trace		= print_syscall_enter,
 };
 
 static struct trace_event syscall_exit_event = {
-	.type		= TRACE_SYSCALL_EXIT,
+	.type	 	= TRACE_SYSCALL_EXIT,
+	.trace		= print_syscall_exit,
 };
 
 static struct tracer syscall_tracer __read_mostly = {
-	.name		= "syscall",
+	.name	     	= "syscall",
 	.init		= init_syscall_tracer,
-	.reset		= reset_syscall_tracer
+	.reset		= reset_syscall_tracer,
+	.flags		= &syscalls_flags,
 };
 
 __init int register_ftrace_syscalls(void)
-- 
cgit v1.1


From ac99c58c9e56967037382e31f865b72b10127965 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:35 +0100
Subject: tracing/syscalls: fix missing release of tracing

Impact: fix 'stuck' syscall tracer

The syscall tracer uses a refcounter to enable several users
simultaneously.

But the refcounter did not behave correctly and always restored
its value to 0 after calling start_syscall_tracing(). Therefore,
stop_syscall_tracing() couldn't release correctly the tasks from
tracing.

Also the tracer forgot to reset the buffer when it is released.

Drop the pointless refcount decrement on start_syscall_tracing()
and reset the buffer when we release the tracer.

This fixes two reported issue:

- when we switch from syscall tracer to another tracer, syscall
  tracing continued.

- incorrect use of the refcount.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c72e599..c5fc1d8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -96,8 +96,9 @@ void start_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	/* Don't enable the flag on the tasks twice */
 	if (atomic_inc_return(&refcount) != 1)
-		goto out;
+		return;
 
 	arch_init_ftrace_syscalls();
 	read_lock_irqsave(&tasklist_lock, flags);
@@ -107,8 +108,6 @@ void start_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
-out:
-	atomic_dec(&refcount);
 }
 
 void stop_ftrace_syscalls(void)
@@ -116,8 +115,9 @@ void stop_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	/* There are perhaps still some users */
 	if (atomic_dec_return(&refcount))
-		goto out;
+		return;
 
 	read_lock_irqsave(&tasklist_lock, flags);
 
@@ -126,8 +126,6 @@ void stop_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
-out:
-	atomic_inc(&refcount);
 }
 
 void ftrace_syscall_enter(struct pt_regs *regs)
@@ -201,6 +199,7 @@ static int init_syscall_tracer(struct trace_array *tr)
 static void reset_syscall_tracer(struct trace_array *tr)
 {
 	stop_ftrace_syscalls();
+	tracing_reset_online_cpus(tr);
 }
 
 static struct trace_event syscall_enter_event = {
-- 
cgit v1.1


From 6404434525bb9f8f2239998f30fd7c93f2efa5b3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:36 +0100
Subject: tracing/syscalls: various cleanups

Impact: cleanup

- Drop unused cpu variable
- Fix some errors on comments

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c5fc1d8..26f9a86 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,7 +7,7 @@
 
 static atomic_t refcount;
 
-/* Our two options */
+/* Option to display the parameters types */
 enum {
 	TRACE_SYSCALLS_OPT_TYPES = 0x1,
 };
@@ -18,7 +18,7 @@ static struct tracer_opt syscalls_opts[] = {
 };
 
 static struct tracer_flags syscalls_flags = {
-	.val = 0, /* By default: no args types */
+	.val = 0, /* By default: no parameters types */
 	.opts = syscalls_opts
 };
 
@@ -135,12 +135,9 @@ void ftrace_syscall_enter(struct pt_regs *regs)
 	struct ring_buffer_event *event;
 	int size;
 	int syscall_nr;
-	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	cpu = raw_smp_processor_id();
-
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
@@ -166,12 +163,9 @@ void ftrace_syscall_exit(struct pt_regs *regs)
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
 	int syscall_nr;
-	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	cpu = raw_smp_processor_id();
-
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
-- 
cgit v1.1


From 5be71b61f17b0e3bc8ad0b1a1b7b53ab7d574ebb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:37 +0100
Subject: tracing/syscalls: protect thread flag toggling from races

Impact: fix syscall tracer enable/disable race

The current thread flag toggling is racy as shown in the following
scenario:

- task A is the last user of syscall tracing, it releases the
  TIF_SYSCALL_FTRACE on each tasks

- at the same time task B start syscall tracing. refcount == 0 so
  it sets up TIF_SYSCALL_FTRACE on each tasks.

The effect of the mixup is unpredictable.
So this fix adds a mutex on {start,stop}_syscall_tracing().

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1237151439-6755-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 26f9a86..a2a3af2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -5,7 +5,11 @@
 #include "trace_output.h"
 #include "trace.h"
 
-static atomic_t refcount;
+/* Keep a counter of the syscall tracing users */
+static int refcount;
+
+/* Prevent from races on thread flags toggling */
+static DEFINE_MUTEX(syscall_trace_lock);
 
 /* Option to display the parameters types */
 enum {
@@ -96,9 +100,11 @@ void start_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	mutex_lock(&syscall_trace_lock);
+
 	/* Don't enable the flag on the tasks twice */
-	if (atomic_inc_return(&refcount) != 1)
-		return;
+	if (++refcount != 1)
+		goto unlock;
 
 	arch_init_ftrace_syscalls();
 	read_lock_irqsave(&tasklist_lock, flags);
@@ -108,6 +114,9 @@ void start_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
+
+unlock:
+	mutex_unlock(&syscall_trace_lock);
 }
 
 void stop_ftrace_syscalls(void)
@@ -115,9 +124,11 @@ void stop_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	mutex_lock(&syscall_trace_lock);
+
 	/* There are perhaps still some users */
-	if (atomic_dec_return(&refcount))
-		return;
+	if (--refcount)
+		goto unlock;
 
 	read_lock_irqsave(&tasklist_lock, flags);
 
@@ -126,6 +137,9 @@ void stop_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
+
+unlock:
+	mutex_unlock(&syscall_trace_lock);
 }
 
 void ftrace_syscall_enter(struct pt_regs *regs)
-- 
cgit v1.1


From 0ea1c4156bf9e2eb370cc5c6fa6eb112bd844dec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:38 +0100
Subject: tracing/syscalls: select kallsysms

Syscall tracing must select kallsysms.

The arch code builds a table to find the syscall metadata by syscall
number. It needs the syscalls names resolution from the symbol table
to know which name found on the syscalls metadatas match a function
pointer from the arch sys_call_table.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 95a0ad1..b0a46f8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -182,6 +182,7 @@ config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
 	select TRACING
+	select KALLSYMS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
-- 
cgit v1.1


From 59f586db98919d7d9c43527b26c8de1cdf9ed912 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:39 +0100
Subject: tracing/core: fix missing mutex unlock on tracing_set_tracer()

Impact: fix possible locking imbalance

In case of ring buffer resize failure, tracing_set_tracer forgot to
release trace_types_lock. Fix it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index efe3202..c0cf946 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2494,7 +2494,7 @@ static int tracing_set_tracer(const char *buf)
 	if (!ring_buffer_expanded) {
 		ret = tracing_resize_ring_buffer(trace_buf_size);
 		if (ret < 0)
-			return ret;
+			goto out;
 		ret = 0;
 	}
 
-- 
cgit v1.1


From ac1d52d0b85854958c7e78c8006e39aadb6ce4b8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 16 Mar 2009 00:32:41 +0100
Subject: tracing/ftrace: fix double calls to tracing_start()

Impact: fix a warning during preemptirqsoff selftests

When the preemptirqsoff selftest fails, we see the following
warning:

[    6.050000] Testing tracer preemptirqsoff: .. no entries found ..
------------[ cut here ]------------
[    6.060000] WARNING: at kernel/trace/trace.c:688 tracing_start+0x67/0xd3()
[    6.060000] Modules linked in:
[    6.060000] Pid: 1, comm: swapper Tainted: G
[    6.060000] Call Trace:
[    6.060000]  [<ffffffff802460ff>] warn_slowpath+0xb1/0x100
[    6.060000]  [<ffffffff802a8f5b>] ? trace_preempt_on+0x35/0x4b
[    6.060000]  [<ffffffff802a37fb>] ? tracing_start+0x31/0xd3
[    6.060000]  [<ffffffff802a37fb>] ? tracing_start+0x31/0xd3
[    6.060000]  [<ffffffff80271e0b>] ? __lock_acquired+0xe6/0x1f2
[    6.060000]  [<ffffffff802a37fb>] ? tracing_start+0x31/0xd3
[    6.060000]  [<ffffffff802a3831>] tracing_start+0x67/0xd3
[    6.060000]  [<ffffffff802a8ace>] ? irqsoff_tracer_reset+0x2d/0x57
[    6.060000]  [<ffffffff802a4d1c>] trace_selftest_startup_preemptirqsoff+0x1c8/0x1f1
[    6.060000]  [<ffffffff802a4798>] register_tracer+0x12f/0x241
[    6.060000]  [<ffffffff810250d0>] ? init_irqsoff_tracer+0x0/0x53
[    6.060000]  [<ffffffff8102510b>] init_irqsoff_tracer+0x3b/0x53

This is because in fail case, the preemptirqsoff tracer selftest calls twice
the tracing_start() function:

int
trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
{
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
                ret = -1;
                tracing_start(); <-----
                goto out;
        }
        [...]
out:
        trace->reset(tr);
        tracing_start(); <------
        tracing_max_latency = save_max;

        return ret;
}

Since it is well handled in the out path, we don't need the conditional one.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237159961-7447-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f907a2b..a2ca6f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -414,7 +414,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
-		goto out;
+		goto out_no_start;
 	}
 
 	/* reset the max latency */
@@ -432,21 +432,16 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
-	if (ret) {
-		tracing_start();
+	if (ret)
 		goto out;
-	}
 
 	ret = trace_test_buffer(&max_tr, &count);
-	if (ret) {
-		tracing_start();
+	if (ret)
 		goto out;
-	}
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
 		ret = -1;
-		tracing_start();
 		goto out;
 	}
 
@@ -475,9 +470,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 		goto out;
 	}
 
- out:
-	trace->reset(tr);
+out:
 	tracing_start();
+out_no_start:
+	trace->reset(tr);
 	tracing_max_latency = save_max;
 
 	return ret;
-- 
cgit v1.1


From 2fc1dfbe17e7705c55b7a99da995fa565e26f151 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 16 Mar 2009 01:45:03 +0100
Subject: tracing/core: fix early free of cpumasks

Impact: fix crashes when tracing cpumasks

While ring-buffer allocation, the cpumasks are allocated too,
including the tracing cpumask and the per-cpu file mask handler.
But these cpumasks are freed accidentally just after.
Fix it.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237164303-11476-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0cf946..ae32d3b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4125,7 +4125,8 @@ __init static int tracer_alloc_buffers(void)
 				       &trace_panic_notifier);
 
 	register_die_notifier(&trace_die_notifier);
-	ret = 0;
+
+	return 0;
 
 out_free_cpumask:
 	free_cpumask_var(tracing_reader_cpumask);
-- 
cgit v1.1


From 03303549b1695dc024d4a653cc16bd79f78f9750 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 16 Mar 2009 22:41:00 +0100
Subject: tracing/ftrace: fix the check on nopped sites

Impact: fix a dynamic tracing failure

Recently, the function and function graph tracers failed to use dynamic
tracing after the following commit:

fa9d13cf135efbd454453a53b6299976bea245a9
(ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED rec)

The patch is right except a mistake on the check for the FTRACE_FL_CONVERTED
flag. The code patching is aborted in case of successfully nopped sites.
What we want is the opposite: ignore the callsites that haven't been nopped.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 90d5729..7847806 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -537,7 +537,7 @@ static void ftrace_replace_code(int enable)
 		 */
 		if (rec->flags & FTRACE_FL_FREE ||
 		    rec->flags & FTRACE_FL_FAILED ||
-		    rec->flags & FTRACE_FL_CONVERTED)
+		    !(rec->flags & FTRACE_FL_CONVERTED))
 			continue;
 
 		/* ignore updates to this record's mcount site */
-- 
cgit v1.1


From 4ca530852346be239b7c19e7bec5d2b78855bebe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Mar 2009 19:20:15 -0400
Subject: tracing: protect reader of cmdline output

Impact: fix to one cause of incorrect comm outputs in trace

The spinlock only protected the creation of a comm <=> pid pair.
But it was possible that a reader could look up a pid, and get the
wrong comm because it had no locking.

This also required changing trace_find_cmdline to copy the comm cache
and not just send back a pointer to it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/blktrace.c              | 23 ++++++++++++++++++-----
 kernel/trace/trace.c                 | 20 ++++++++++++--------
 kernel/trace/trace.h                 |  2 +-
 kernel/trace/trace_functions_graph.c | 12 ++++++------
 kernel/trace/trace_output.c          | 18 ++++++++++++------
 5 files changed, 49 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1f32e4e..b171778 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1027,7 +1027,9 @@ static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
 
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
-	const char *cmd = trace_find_cmdline(ent->pid);
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
 
 	if (t_sec(ent))
 		return trace_seq_printf(s, "%llu + %u [%s]\n",
@@ -1057,19 +1059,30 @@ static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
 {
-	return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
+
+	return trace_seq_printf(s, "[%s]\n", cmd);
 }
 
 static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
 {
-	return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
-				get_pdu_int(ent));
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
+
+	return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
 }
 
 static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
 {
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
+
 	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
-				get_pdu_int(ent), trace_find_cmdline(ent->pid));
+				get_pdu_int(ent), cmd);
 }
 
 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index efe3202..2796bd2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -770,25 +770,29 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
-char *trace_find_cmdline(int pid)
+void trace_find_cmdline(int pid, char comm[])
 {
-	char *cmdline = "<...>";
 	unsigned map;
 
-	if (!pid)
-		return "<idle>";
+	if (!pid) {
+		strcpy(comm, "<idle>");
+		return;
+	}
 
-	if (pid > PID_MAX_DEFAULT)
-		goto out;
+	if (pid > PID_MAX_DEFAULT) {
+		strcpy(comm, "<...>");
+		return;
+	}
 
+	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
 	if (map >= SAVED_CMDLINES)
 		goto out;
 
-	cmdline = saved_cmdlines[map];
+	strcpy(comm, saved_cmdlines[map]);
 
  out:
-	return cmdline;
+	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
 void tracing_record_cmdline(struct task_struct *tsk)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 56ce34d..b0ecad8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -547,7 +547,7 @@ struct tracer_switch_ops {
 };
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
-extern char *trace_find_cmdline(int pid);
+extern void trace_find_cmdline(int pid, char comm[]);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4c38860..6004cca 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -190,15 +190,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 static enum print_line_t
 print_graph_proc(struct trace_seq *s, pid_t pid)
 {
-	int i;
-	int ret;
-	int len;
-	char comm[8];
-	int spaces = 0;
+	char comm[TASK_COMM_LEN];
 	/* sign + log10(MAX_INT) + '\0' */
 	char pid_str[11];
+	int spaces = 0;
+	int ret;
+	int len;
+	int i;
 
-	strncpy(comm, trace_find_cmdline(pid), 7);
+	trace_find_cmdline(pid, comm);
 	comm[7] = '\0';
 	sprintf(pid_str, "%d", pid);
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ea9d3b4..6a4c9de 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -309,9 +309,9 @@ static int
 lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
-	char *comm;
+	char comm[TASK_COMM_LEN];
 
-	comm = trace_find_cmdline(entry->pid);
+	trace_find_cmdline(entry->pid, comm);
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
 
@@ -346,10 +346,12 @@ int trace_print_context(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent;
-	char *comm = trace_find_cmdline(entry->pid);
 	unsigned long long t = ns2usecs(iter->ts);
 	unsigned long usec_rem = do_div(t, USEC_PER_SEC);
 	unsigned long secs = (unsigned long)t;
+	char comm[TASK_COMM_LEN];
+
+	trace_find_cmdline(entry->pid, comm);
 
 	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
 				comm, entry->pid, iter->cpu, secs, usec_rem);
@@ -372,7 +374,10 @@ int trace_print_lat_context(struct trace_iterator *iter)
 	rel_usecs = ns2usecs(next_ts - iter->ts);
 
 	if (verbose) {
-		char *comm = trace_find_cmdline(entry->pid);
+		char comm[TASK_COMM_LEN];
+
+		trace_find_cmdline(entry->pid, comm);
+
 		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
 				       entry->pid, iter->cpu, entry->flags,
@@ -577,14 +582,15 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 					     char *delim)
 {
 	struct ctx_switch_entry *field;
-	char *comm;
+	char comm[TASK_COMM_LEN];
 	int S, T;
 
+
 	trace_assign_type(field, iter->ent);
 
 	T = task_state_char(field->next_state);
 	S = task_state_char(field->prev_state);
-	comm = trace_find_cmdline(field->next_pid);
+	trace_find_cmdline(field->next_pid, comm);
 	if (!trace_seq_printf(&iter->seq,
 			      " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
 			      field->prev_pid,
-- 
cgit v1.1


From 6adaad14d7d4d3ef31b4e2dc992b18b5da7c4eb3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Mar 2009 21:57:17 -0400
Subject: tracing: stop comm recording on tracing off

Impact: fix for losing comms in trace

The command lines of tasks are cached at sched switch to not need
to record them at every trace point.  Disabling the tracing on stops
the recording of traces, but does not stop the caching of command lines.
When the tracing is off the cache may overflow and cause the tracing
to show incorrect tasks matching the PIDs.

This patch disables prevents updates to the comm cache when the ring buffer
is off.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2796bd2..8f89690 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -797,7 +797,7 @@ void trace_find_cmdline(int pid, char comm[])
 
 void tracing_record_cmdline(struct task_struct *tsk)
 {
-	if (atomic_read(&trace_record_cmdline_disabled))
+	if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
 		return;
 
 	trace_save_cmdline(tsk);
-- 
cgit v1.1


From c269fc8c537d761f36cb98e637ae934d9331a9d5 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 17 Mar 2009 01:20:59 -0500
Subject: tracing: fix leak in event_format_read()

Impact: fix memory leak

If event_format_read() exits early due to nonzero ppos, the
previous kmalloc doesn't get freed - might as well do the
check before the kmalloc and avoid the problem.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237270859.8033.141.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 238ea95..c88227b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -378,15 +378,15 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	char *buf;
 	int r;
 
+	if (*ppos)
+		return 0;
+
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return -ENOMEM;
 
 	trace_seq_init(s);
 
-	if (*ppos)
-		return 0;
-
 	/* If any of the first writes fail, so will the show_format. */
 
 	trace_seq_printf(s, "name: %s\n", call->name);
-- 
cgit v1.1


From f2d28a2ebcb525a6ec7e2152106ddb385ef52b73 Mon Sep 17 00:00:00 2001
From: Guillaume Knispel <gknispel@proformatique.com>
Date: Tue, 17 Mar 2009 16:18:42 +0100
Subject: printk: correct the behavior of printk_timed_ratelimit()

Impact: fix jiffies-comparison sign-wrap behavior

The behavior provided by printk_timed_ratelimit() is, in some
situations, probably not what a caller would reasonably expect:

bool printk_timed_ratelimit(unsigned long *caller_jiffies,
			unsigned int interval_msecs)
{
	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
		return true;
	}
	return false;
}

On a 32 bit computer, if printk_timed_ratelimit() is initially called at
time jiffies == Ja, *caller_jiffies is set to
Ja + msecs_to_jiffies(interval_msecs): let's say Ja + 42 for this
example.

If this caller then doesn't call printk_timed_ratelimit() until
jiffies == Ja + (1 << 31) + 42 (which can happen as soon as ~ 25 days
later on a 1000 HZ system), printk_timed_ratelimit() will then always
return false to this caller until jiffies loops completely (1 << 31 more
ticks).

Ths change makes it only return false if jiffies is in the small
time window starting at the previous call when true was returned and
ending interval_msecs later.  Note that if jiffies loops completely
between two calls to printk_timed_ratelimit(), it will obviously still
wrongly return false, but this is something with a low probability.

If something completely reliable is needed I guess jiffies_64 must be
used (which this change does not do).

Signed-off-by: Guillaume Knispel <gknispel@proformatique.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20090317161842.0059096b@xilun.lan.proformatique.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0..2be7199 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1292,8 +1292,11 @@ EXPORT_SYMBOL(printk_ratelimit);
 bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 			unsigned int interval_msecs)
 {
-	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
-		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+	if (*caller_jiffies == 0
+			|| !time_in_range(jiffies, *caller_jiffies,
+					*caller_jiffies
+					+ msecs_to_jiffies(interval_msecs))) {
+		*caller_jiffies = jiffies;
 		return true;
 	}
 	return false;
-- 
cgit v1.1


From 37886f6a9f62d22530ffee8d3f9215c8345b6969 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 17:22:06 -0400
Subject: ring-buffer: add api to allow a tracer to change clock source

This patch adds a new function called ring_buffer_set_clock that
allows a tracer to assign its own clock source to the buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 65 +++++++++++++++++++++++++++-------------------
 kernel/trace/trace.c       | 21 ++++++++++-----
 2 files changed, 52 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 58128ad..bbf5192 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -180,29 +180,6 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #include "trace.h"
 
-/* Up this if you want to test the TIME_EXTENTS and normalization */
-#define DEBUG_SHIFT 0
-
-u64 ring_buffer_time_stamp(int cpu)
-{
-	u64 time;
-
-	preempt_disable_notrace();
-	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = trace_clock_local() << DEBUG_SHIFT;
-	preempt_enable_no_resched_notrace();
-
-	return time;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
-
-void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
-{
-	/* Just stupid testing the normalize function and deltas */
-	*ts >>= DEBUG_SHIFT;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
-
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	28
@@ -374,6 +351,7 @@ struct ring_buffer {
 #ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block		cpu_notify;
 #endif
+	u64				(*clock)(void);
 };
 
 struct ring_buffer_iter {
@@ -394,6 +372,30 @@ struct ring_buffer_iter {
 		_____ret;					\
 	})
 
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+	u64 time;
+
+	preempt_disable_notrace();
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	time = buffer->clock() << DEBUG_SHIFT;
+	preempt_enable_no_resched_notrace();
+
+	return time;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
+
+void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+				      int cpu, u64 *ts)
+{
+	/* Just stupid testing the normalize function and deltas */
+	*ts >>= DEBUG_SHIFT;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
+
 /**
  * check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
@@ -569,6 +571,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
+	buffer->clock = trace_clock_local;
 
 	/* need at least two pages */
 	if (buffer->pages == 1)
@@ -645,6 +648,12 @@ ring_buffer_free(struct ring_buffer *buffer)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free);
 
+void ring_buffer_set_clock(struct ring_buffer *buffer,
+			   u64 (*clock)(void))
+{
+	buffer->clock = clock;
+}
+
 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
 
 static void
@@ -1191,7 +1200,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			cpu_buffer->tail_page = next_page;
 
 			/* reread the time stamp */
-			*ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
 			cpu_buffer->tail_page->page->time_stamp = *ts;
 		}
 
@@ -1334,7 +1343,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		return NULL;
 
-	ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
 	/*
 	 * Only the first commit can update the timestamp.
@@ -2051,7 +2060,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	case RINGBUF_TYPE_DATA:
 		if (ts) {
 			*ts = cpu_buffer->read_stamp + event->time_delta;
-			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+			ring_buffer_normalize_time_stamp(buffer,
+							 cpu_buffer->cpu, ts);
 		}
 		return event;
 
@@ -2112,7 +2122,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	case RINGBUF_TYPE_DATA:
 		if (ts) {
 			*ts = iter->read_stamp + event->time_delta;
-			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+			ring_buffer_normalize_time_stamp(buffer,
+							 cpu_buffer->cpu, ts);
 		}
 		return event;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f89690..3be2f78 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec)
 	return nsec;
 }
 
-cycle_t ftrace_now(int cpu)
-{
-	u64 ts = ring_buffer_time_stamp(cpu);
-	ring_buffer_normalize_time_stamp(cpu, &ts);
-	return ts;
-}
-
 /*
  * The global_trace is the descriptor that holds the tracing
  * buffers for the live tracing. For each CPU, it contains
@@ -178,6 +171,20 @@ static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+cycle_t ftrace_now(int cpu)
+{
+	u64 ts;
+
+	/* Early boot up does not have a buffer yet */
+	if (!global_trace.buffer)
+		return trace_clock_local();
+
+	ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
+	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
+
+	return ts;
+}
+
 /*
  * The max_tr is used to snapshot the global_trace when a maximum
  * latency is reached. Some tracers will use this to store a maximum
-- 
cgit v1.1


From af4617bdba34aa556272b34c3986b0a4d588f568 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 18:09:55 -0400
Subject: tracing: add global-clock option to provide cross CPU clock to traces

Impact: feature to allow better serialized clock

This patch adds an option called "global-clock" that will allow
the tracer to switch to a slower but more accurate (across CPUs)
clock.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 34 ++++++++++++++++++++++++++++++----
 kernel/trace/trace.h |  1 +
 2 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3be2f78..2f994ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -315,6 +315,7 @@ static const char *trace_options[] = {
 	"printk-msg-only",
 	"context-info",
 	"latency-format",
+	"global-clock",
 	NULL
 };
 
@@ -2251,6 +2252,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
 	return 0;
 }
 
+static void set_tracer_flags(unsigned int mask, int enabled)
+{
+	/* do nothing if flag is already set */
+	if (!!(trace_flags & mask) == !!enabled)
+		return;
+
+	if (enabled)
+		trace_flags |= mask;
+	else
+		trace_flags &= ~mask;
+
+	if (mask == TRACE_ITER_GLOBAL_CLK) {
+		u64 (*func)(void);
+
+		if (enabled)
+			func = trace_clock_global;
+		else
+			func = trace_clock_local;
+
+		mutex_lock(&trace_types_lock);
+		ring_buffer_set_clock(global_trace.buffer, func);
+
+		if (max_tr.buffer)
+			ring_buffer_set_clock(max_tr.buffer, func);
+		mutex_unlock(&trace_types_lock);
+	}
+}
+
 static ssize_t
 tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			size_t cnt, loff_t *ppos)
@@ -2278,10 +2307,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 		int len = strlen(trace_options[i]);
 
 		if (strncmp(cmp, trace_options[i], len) == 0) {
-			if (neg)
-				trace_flags &= ~(1 << i);
-			else
-				trace_flags |= (1 << i);
+			set_tracer_flags(1 << i, !neg);
 			break;
 		}
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0ecad8..26a7a28 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -667,6 +667,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_PRINTK_MSGONLY	= 0x10000,
 	TRACE_ITER_CONTEXT_INFO		= 0x20000, /* Print pid/cpu/time */
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
+	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 };
 
 /*
-- 
cgit v1.1


From 5fec6ddcb43a91aa9a254c8ecf174c803de6f07e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 19:59:53 -0400
Subject: tracing: make sched_switch stop/start light weight

The stopping and starting of a tracer should be light weight and
be able to be called in all contexts. The sched_switch grabbed
mutexes in the start/stop functions. This patch changes it to a
simple variable, on/off.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_sched_switch.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 77132c2..de35f20 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,6 +18,7 @@ static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 static int			sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
+static int			sched_stopped;
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!sched_ref)
+	if (!sched_ref || sched_stopped)
 		return;
 
 	tracing_record_cmdline(prev);
@@ -193,6 +194,7 @@ static void stop_sched_trace(struct trace_array *tr)
 static int sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
+	tracing_reset_online_cpus(tr);
 	tracing_start_sched_switch_record();
 	return 0;
 }
@@ -205,13 +207,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
 
 static void sched_switch_trace_start(struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
-	tracing_start_sched_switch();
+	sched_stopped = 0;
 }
 
 static void sched_switch_trace_stop(struct trace_array *tr)
 {
-	tracing_stop_sched_switch();
+	sched_stopped = 1;
 }
 
 static struct tracer sched_switch_trace __read_mostly =
-- 
cgit v1.1


From 62524d55e5b9ffe36e3bf3dd7a594114f150b449 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 20:58:00 -0400
Subject: tracing: make power tracer start/stop methods lighter weight

The start/stop methods of a tracer should be able to be executed
in all contexts. This patch converts the power tracer to do so.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_power.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 91ce672..bae791e 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -122,12 +122,16 @@ fail_start:
 static void start_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 1;
-	tracing_power_register();
 }
 
 static void stop_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 0;
+}
+
+static void power_trace_reset(struct trace_array *tr)
+{
+	trace_power_enabled = 0;
 	unregister_trace_power_start(probe_power_start);
 	unregister_trace_power_end(probe_power_end);
 	unregister_trace_power_mark(probe_power_mark);
@@ -188,7 +192,7 @@ static struct tracer power_tracer __read_mostly =
 	.init		= power_trace_init,
 	.start		= start_power_trace,
 	.stop		= stop_power_trace,
-	.reset		= stop_power_trace,
+	.reset		= power_trace_reset,
 	.print_line	= power_print_line,
 };
 
-- 
cgit v1.1


From 18aecd362a1c991fbf5f7919ae051a77532ba2f8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Mar 2009 08:56:58 +0100
Subject: tracing: stop command line recording when tracing is disabled

Impact: prevent overwrite of command line entries

When the tracer is stopped the command line recording continues to
record. The check for tracing_is_on() is not sufficient here as the
ringbuffer status is not affected by setting
debug/tracing/tracing_enabled to 0. On a non idle system this can
result in the loss of the command line information for the stopped
trace, which makes the trace harder to read and analyse.

Check tracer_enabled to allow further recording.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ce6208..7b6043e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -797,7 +797,8 @@ void trace_find_cmdline(int pid, char comm[])
 
 void tracing_record_cmdline(struct task_struct *tsk)
 {
-	if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+	if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
+	    !tracing_is_on())
 		return;
 
 	trace_save_cmdline(tsk);
-- 
cgit v1.1


From 2c7eea4c62ba090b7f4583c3d7337ea0019be900 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Mar 2009 09:03:19 +0100
Subject: tracing: replace the crude (unsigned) -1 hackery

Impact: cleanup

The command line recorder uses (unsigned) -1 to mark non mapped
entries in the pid to command line maps. The validity check is
completely unintuitive: idx >= SAVED_CMDLINES

There is no need for such casting games. Use a constant to mark
unmapped entries and check for that constant to make the code readable
and understandable.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7b6043e..ca673c4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -633,6 +633,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 }
 
 #define SAVED_CMDLINES 128
+#define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
 static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
@@ -644,8 +645,8 @@ static atomic_t trace_record_cmdline_disabled __read_mostly;
 
 static void trace_init_cmdlines(void)
 {
-	memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
-	memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+	memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
+	memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
 	cmdline_idx = 0;
 }
 
@@ -753,12 +754,12 @@ static void trace_save_cmdline(struct task_struct *tsk)
 		return;
 
 	idx = map_pid_to_cmdline[tsk->pid];
-	if (idx >= SAVED_CMDLINES) {
+	if (idx == NO_CMDLINE_MAP) {
 		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
 
 		map = map_cmdline_to_pid[idx];
-		if (map <= PID_MAX_DEFAULT)
-			map_pid_to_cmdline[map] = (unsigned)-1;
+		if (map != NO_CMDLINE_MAP)
+			map_pid_to_cmdline[map] = NO_CMDLINE_MAP;
 
 		map_pid_to_cmdline[tsk->pid] = idx;
 
@@ -786,7 +787,7 @@ void trace_find_cmdline(int pid, char comm[])
 
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
-	if (map >= SAVED_CMDLINES)
+	if (map == NO_CMDLINE_MAP)
 		goto out;
 
 	strcpy(comm, saved_cmdlines[map]);
-- 
cgit v1.1


From 50d88758a3f9787cbdbdbc030560b815721eab4b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Mar 2009 08:58:44 +0100
Subject: tracing: fix trace_find_cmdline()

Impact: prevent stale command line output

In case there is no valid command line mapping for a pid
trace_find_cmdline() returns without updating the comm buffer. The
trace dump keeps the previous entry which results in confusing trace
output:

     <idle>-0     [000]   280.702056 ....
     <idle>-23456 [000]   280.702080 ....

Update the comm buffer with "<...>" when no mapping is found.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca673c4..06c69a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -787,12 +787,11 @@ void trace_find_cmdline(int pid, char comm[])
 
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
-	if (map == NO_CMDLINE_MAP)
-		goto out;
-
-	strcpy(comm, saved_cmdlines[map]);
+	if (map != NO_CMDLINE_MAP)
+		strcpy(comm, saved_cmdlines[map]);
+	else
+		strcpy(comm, "<...>");
 
- out:
 	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
-- 
cgit v1.1


From a635cf0497342978d417cae19d4a4823932977ff Mon Sep 17 00:00:00 2001
From: Carsten Emde <Carsten.Emde@osadl.org>
Date: Wed, 18 Mar 2009 09:00:41 +0100
Subject: tracing: fix command line to pid reverse map

Impact: fix command line to pid mapping

map_cmdline_to_pid[] is checked in trace_save_cmdline(), but never
updated. This results in stale pid to command line mappings and the
tracer output will associate the wrong comm string.

Signed-off-by: Carsten Emde <Carsten.Emde@osadl.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06c69a2..305c562 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -738,8 +738,7 @@ void trace_stop_cmdline_recording(void);
 
 static void trace_save_cmdline(struct task_struct *tsk)
 {
-	unsigned map;
-	unsigned idx;
+	unsigned pid, idx;
 
 	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
 		return;
@@ -757,10 +756,17 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	if (idx == NO_CMDLINE_MAP) {
 		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
 
-		map = map_cmdline_to_pid[idx];
-		if (map != NO_CMDLINE_MAP)
-			map_pid_to_cmdline[map] = NO_CMDLINE_MAP;
+		/*
+		 * Check whether the cmdline buffer at idx has a pid
+		 * mapped. We are going to overwrite that entry so we
+		 * need to clear the map_pid_to_cmdline. Otherwise we
+		 * would read the new comm for the old pid.
+		 */
+		pid = map_cmdline_to_pid[idx];
+		if (pid != NO_CMDLINE_MAP)
+			map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
 
+		map_cmdline_to_pid[idx] = tsk->pid;
 		map_pid_to_cmdline[tsk->pid] = idx;
 
 		cmdline_idx = idx;
-- 
cgit v1.1


From 490362003457f8d387f6f6e73e3a7efbf56c3314 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 17 Mar 2009 22:38:58 +0100
Subject: tracing/ftrace: stop {irqs, preempt}soff tracers when tracing is
 stopped

Impact: fix a selftest warning

In some cases, it's possible to see the following warning on irqsoff
tracer selftest:

[    4.640003] Testing tracer irqsoff: <4>------------[ cut here ]------------
[    4.653562] WARNING: at kernel/trace/trace.c:458 update_max_tr_single+0x9a/0xc4()
[    4.660000] Hardware name: System Product Name
[    4.660000] Modules linked in:
[    4.660000] Pid: 301, comm: kstop/1 Not tainted 2.6.29-rc8-tip #35837
[    4.660000] Call Trace:
[    4.660000]  [<4014b588>] warn_slowpath+0x79/0x8f
[    4.660000]  [<402d6949>] ? put_dec+0x64/0x6b
[    4.660000]  [<40162b56>] ? getnstimeofday+0x58/0xdd
[    4.660000]  [<40162210>] ? clocksource_read+0x3/0xf
[    4.660000]  [<4015eb44>] ? ktime_set+0x8/0x34
[    4.660000]  [<4014101a>] ? balance_runtime+0x8/0x56
[    4.660000]  [<405f6f11>] ? _spin_lock+0x3/0x10
[    4.660000]  [<4011f643>] ? ftrace_call+0x5/0x8
[    4.660000]  [<4015d0f1>] ? task_cputime_zero+0x3/0x27
[    4.660000]  [<40190ee7>] ? cpupri_set+0x90/0xcb
[    4.660000]  [<405f7208>] ? _spin_lock_irqsave+0x22/0x34
[    4.660000]  [<40190f12>] ? cpupri_set+0xbb/0xcb
[    4.660000]  [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35
[    4.660000]  [<4018493f>] ? ring_buffer_reset_cpu+0x27/0x51
[    4.660000]  [<405f7208>] ? _spin_lock_irqsave+0x22/0x34
[    4.660000]  [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51
[    4.660000]  [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35
[    4.660000]  [<4018cc29>] ? trace_hardirqs_off+0x1a/0x1c
[    4.660000]  [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35
[    4.660000]  [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51
[    4.660000]  [<401850f3>] ? cpumask_next+0x15/0x18
[    4.660000]  [<4018a41f>] update_max_tr_single+0x9a/0xc4
[    4.660000]  [<4014e5fe>] ? exit_notify+0x16/0xf2
[    4.660000]  [<4018cd13>] check_critical_timing+0xcc/0x11e
[    4.660000]  [<4014e5fe>] ? exit_notify+0x16/0xf2
[    4.660000]  [<4014e5fe>] ? exit_notify+0x16/0xf2
[    4.660000]  [<4018cdf1>] stop_critical_timing+0x8c/0x9f
[    4.660000]  [<4014e5c4>] ? forget_original_parent+0xac/0xd0
[    4.660000]  [<4018ce3a>] trace_hardirqs_on+0x1a/0x1c
[    4.660000]  [<4014e5c4>] forget_original_parent+0xac/0xd0
[    4.660000]  [<4014e5fe>] exit_notify+0x16/0xf2
[    4.660000]  [<4014e8a5>] do_exit+0x1cb/0x225
[    4.660000]  [<4015c72b>] ? kthread+0x0/0x69
[    4.660000]  [<4011f61d>] kernel_thread_helper+0xd/0x10
[    4.660000] ---[ end trace a7919e7f17c0a725 ]---
[    4.660164] .. no entries found ..FAILED!

During the selftest of irqsoff tracer, we do that:

	/* disable interrupts for a bit */
	local_irq_disable();
	udelay(100);
	local_irq_enable();
	/* stop the tracing. */
	tracing_stop();
	/* check both trace buffers */
	ret = trace_test_buffer(tr, NULL);

If a callsite performs a new max delay with irqs off just after
tracing_stop, update_max_tr_single() -> ring_buffer_swap_cpu()
will be called with the buffers disabled by tracing_stop(), hence
the warning, then ring_buffer_swap_cpu() return -EAGAIN and
update_max_tr_single() complains.

Fix it by also stopping the tracer before stopping the tracing globally.
A similar situation can happen with preemptoff and preemptirqsoff tracers
where we apply the same fix.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237325938-5240-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a2ca6f0..38856ba 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -315,6 +315,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	local_irq_disable();
 	udelay(100);
 	local_irq_enable();
+
+	/*
+	 * Stop the tracer to avoid a warning subsequent
+	 * to buffer flipping failure because tracing_stop()
+	 * disables the tr and max buffers, making flipping impossible
+	 * in case of parallels max irqs off latencies.
+	 */
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
@@ -369,6 +377,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	preempt_disable();
 	udelay(100);
 	preempt_enable();
+
+	/*
+	 * Stop the tracer to avoid a warning subsequent
+	 * to buffer flipping failure because tracing_stop()
+	 * disables the tr and max buffers, making flipping impossible
+	 * in case of parallels max preempt off latencies.
+	 */
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
@@ -428,6 +444,13 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* reverse the order of preempt vs irqs */
 	local_irq_enable();
 
+	/*
+	 * Stop the tracer to avoid a warning subsequent
+	 * to buffer flipping failure because tracing_stop()
+	 * disables the tr and max buffers, making flipping impossible
+	 * in case of parallels max irqs/preempt off latencies.
+	 */
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
@@ -448,6 +471,8 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* do the test by disabling interrupts first this time */
 	tracing_max_latency = 0;
 	tracing_start();
+	trace->start(tr);
+
 	preempt_disable();
 	local_irq_disable();
 	udelay(100);
@@ -455,6 +480,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* reverse the order of preempt vs irqs */
 	local_irq_enable();
 
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-- 
cgit v1.1


From f02b8624fedca39886b0eef770dca70c2f0749b3 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Wed, 18 Mar 2009 17:06:21 +0530
Subject: kprobes: Fix locking imbalance in kretprobes

Fix locking imbalance in kretprobes:

=====================================
[ BUG: bad unlock balance detected! ]
-------------------------------------
kthreadd/2 is trying to release lock (&rp->lock) at:
[<c06b3080>] pre_handler_kretprobe+0xea/0xf4
but there are no more locks to release!

other info that might help us debug this:
1 lock held by kthreadd/2:
 #0:  (rcu_read_lock){..--}, at: [<c06b2b24>] __atomic_notifier_call_chain+0x0/0x5a

stack backtrace:
Pid: 2, comm: kthreadd Not tainted 2.6.29-rc8 #1
Call Trace:
 [<c06ae498>] ? printk+0xf/0x17
 [<c06b3080>] ? pre_handler_kretprobe+0xea/0xf4
 [<c044ce6c>] print_unlock_inbalance_bug+0xc3/0xce
 [<c0444d4b>] ? clocksource_read+0x7/0xa
 [<c04450a4>] ? getnstimeofday+0x5f/0xf6
 [<c044a9ca>] ? register_lock_class+0x17/0x293
 [<c044b72c>] ? mark_lock+0x1e/0x30b
 [<c0448956>] ? tick_dev_program_event+0x4a/0xbc
 [<c0498100>] ? __slab_alloc+0xa5/0x415
 [<c06b2fbe>] ? pre_handler_kretprobe+0x28/0xf4
 [<c06b3080>] ? pre_handler_kretprobe+0xea/0xf4
 [<c044cf1b>] lock_release_non_nested+0xa4/0x1a5
 [<c06b3080>] ? pre_handler_kretprobe+0xea/0xf4
 [<c044d15d>] lock_release+0x141/0x166
 [<c06b07dd>] _spin_unlock_irqrestore+0x19/0x50
 [<c06b3080>] pre_handler_kretprobe+0xea/0xf4
 [<c06b20b5>] kprobe_exceptions_notify+0x1c9/0x43e
 [<c06b2b02>] notifier_call_chain+0x26/0x48
 [<c06b2b5b>] __atomic_notifier_call_chain+0x37/0x5a
 [<c06b2b24>] ? __atomic_notifier_call_chain+0x0/0x5a
 [<c06b2b8a>] atomic_notifier_call_chain+0xc/0xe
 [<c0442d0d>] notify_die+0x2d/0x2f
 [<c06b0f9c>] do_int3+0x1f/0x71
 [<c06b0e84>] int3+0x2c/0x34
 [<c042d476>] ? do_fork+0x1/0x288
 [<c040221b>] ? kernel_thread+0x71/0x79
 [<c043ed1b>] ? kthread+0x0/0x60
 [<c043ed1b>] ? kthread+0x0/0x60
 [<c04040b8>] ? kernel_thread_helper+0x0/0x10
 [<c043ec7f>] kthreadd+0xac/0x148
 [<c043ebd3>] ? kthreadd+0x0/0x148
 [<c04040bf>] kernel_thread_helper+0x7/0x10

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Tested-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@kernel.org> [2.6.29.x, 2.6.28.x, 2.6.27.x]
LKML-Reference: <20090318113621.GB4129@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 479d4d5..5016bfb 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -919,10 +919,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 		ri->rp = rp;
 		ri->task = current;
 
-		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-			spin_unlock_irqrestore(&rp->lock, flags);
+		if (rp->entry_handler && rp->entry_handler(ri, regs))
 			return 0;
-		}
 
 		arch_prepare_kretprobe(ri, regs);
 
-- 
cgit v1.1


From 4acd4d00f716873e27e7b60ae292cbdbfae674dd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Mar 2009 10:40:24 -0400
Subject: tracing: give easy way to clear trace buffer

There is currently no easy way to clear the trace buffer. Currently
the only way is to change the current tracer.

This patch lets the user clear the trace buffer by simply writing
into the trace files.

 echo > /debug/tracing/trace

or to clear a single cpu (i.e. for CPU 1):

 echo > /debug/tracing/per_cpu/cpu1/trace

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2d13e8..8d981ab 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1941,9 +1941,14 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
-	struct trace_iterator *iter = m->private;
+	struct trace_iterator *iter;
 	int cpu;
 
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	iter = m->private;
+
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
 		if (iter->buffer_iter[cpu])
@@ -1969,12 +1974,24 @@ static int tracing_open(struct inode *inode, struct file *file)
 	struct trace_iterator *iter;
 	int ret = 0;
 
-	iter = __tracing_open(inode, file);
-	if (IS_ERR(iter))
-		ret = PTR_ERR(iter);
-	else if (trace_flags & TRACE_ITER_LATENCY_FMT)
-		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+	/* If this file was open for write, then erase contents */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND)) {
+		long cpu = (long) inode->i_private;
+
+		if (cpu == TRACE_PIPE_ALL_CPU)
+			tracing_reset_online_cpus(&global_trace);
+		else
+			tracing_reset(&global_trace, cpu);
+	}
 
+	if (file->f_mode & FMODE_READ) {
+		iter = __tracing_open(inode, file);
+		if (IS_ERR(iter))
+			ret = PTR_ERR(iter);
+		else if (trace_flags & TRACE_ITER_LATENCY_FMT)
+			iter->iter_flags |= TRACE_FILE_LAT_FMT;
+	}
 	return ret;
 }
 
@@ -2049,9 +2066,17 @@ static int show_traces_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static ssize_t
+tracing_write_stub(struct file *filp, const char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	return count;
+}
+
 static const struct file_operations tracing_fops = {
 	.open		= tracing_open,
 	.read		= seq_read,
+	.write		= tracing_write_stub,
 	.llseek		= seq_lseek,
 	.release	= tracing_release,
 };
@@ -3576,7 +3601,7 @@ static void tracing_init_debugfs_percpu(long cpu)
 		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
 
 	/* per cpu trace */
-	entry = debugfs_create_file("trace", 0444, d_cpu,
+	entry = debugfs_create_file("trace", 0644, d_cpu,
 				(void *) cpu, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
@@ -3890,7 +3915,7 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
 
-	entry = debugfs_create_file("trace", 0444, d_tracer,
+	entry = debugfs_create_file("trace", 0644, d_tracer,
 				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
-- 
cgit v1.1


From 09933a108e6730a464a1ab676c9decc11aee0edc Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 18 Mar 2009 22:18:56 +0530
Subject: tracing: fix oops in tracepoint_update_probe_range()

Change this crash:

 BUG: unable to handle kernel NULL pointer dereference at (null)
 IP: [<ffffffff8107d4de>] tracepoint_update_probe_range+0x1f/0x9b
 PGD 13d5fb067 PUD 13d688067 PMD 0
 Oops: 0000 [#1] SMP

To a more debuggable WARN_ONCE().

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain>
[ moved the check outside the lock and added a WARN_ON(). ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 7960274..adf2873 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -272,12 +272,17 @@ static void disable_tracepoint(struct tracepoint *elem)
  *
  * Updates the probe callback corresponding to a range of tracepoints.
  */
-void tracepoint_update_probe_range(struct tracepoint *begin,
-	struct tracepoint *end)
+void
+tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 {
 	struct tracepoint *iter;
 	struct tracepoint_entry *mark_entry;
 
+	if (!begin) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_tracepoint(iter->name);
-- 
cgit v1.1


From ec625cb29e66824f7ce41082617aeb93fa4e42e2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Mar 2009 19:54:04 +0100
Subject: tracepoints: dont update zero-sized tracepoint sections

Zero-sized tracepoint sections can occur if tracing is enabled but
no tracepoint is defined. Do not emit a warning in that case.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index adf2873..1ef5d3a 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -278,10 +278,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 	struct tracepoint *iter;
 	struct tracepoint_entry *mark_entry;
 
-	if (!begin) {
-		WARN_ON_ONCE(1);
+	if (!begin)
 		return;
-	}
 
 	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
-- 
cgit v1.1


From df7c8e845e8e2030e8ae947e0ace56d184d0e9a0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Mar 2009 15:22:20 +1030
Subject: cpumask: remove cpumask allocation from idle_balance

Impact: fix circular locking

Steven reports a circular locking from alloc_cpumask_var doing
a wakeup. We get rid of this using the tried-and-true technique
of using a per-cpu cpumask_var_t rather than doing an alloc
every time.

Simpler and more robust than a rare, implicit allocation within
an atomic codepath.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <alpine.DEB.2.00.0903181729360.31583@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5dabd80..48862d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3448,19 +3448,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
  */
 #define MAX_PINNED_INTERVAL	512
 
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance, struct cpumask *cpus)
+			int *balance)
 {
 	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
 	cpumask_setall(cpus);
 
@@ -3615,8 +3619,7 @@ out:
  * this_rq is locked.
  */
 static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-			struct cpumask *cpus)
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 {
 	struct sched_group *group;
 	struct rq *busiest = NULL;
@@ -3624,6 +3627,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 	int ld_moved = 0;
 	int sd_idle = 0;
 	int all_pinned = 0;
+	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
 	cpumask_setall(cpus);
 
@@ -3764,10 +3768,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
-	cpumask_var_t tmpmask;
-
-	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
-		return;
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -3778,7 +3778,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE)
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance_newidle(this_cpu, this_rq,
-							   sd, tmpmask);
+							   sd);
 
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
@@ -3793,7 +3793,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		 */
 		this_rq->next_balance = next_balance;
 	}
-	free_cpumask_var(tmpmask);
 }
 
 /*
@@ -3943,11 +3942,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
 	int need_serialize;
-	cpumask_var_t tmp;
-
-	/* Fails alloc?  Rebalancing probably not a priority right now. */
-	if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
-		return;
 
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3972,7 +3966,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -4006,8 +4000,6 @@ out:
 	 */
 	if (likely(update_next_balance))
 		rq->next_balance = next_balance;
-
-	free_cpumask_var(tmp);
 }
 
 /*
@@ -8304,6 +8296,9 @@ void __init sched_init(void)
 #ifdef CONFIG_USER_SCHED
 	alloc_size *= 2;
 #endif
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	alloc_size *= num_possible_cpus() * cpumask_size();
+#endif
 	/*
 	 * As sched_init() is called before page_alloc is setup,
 	 * we use alloc_bootmem().
@@ -8341,6 +8336,12 @@ void __init sched_init(void)
 		ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+		for_each_possible_cpu(i) {
+			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+			ptr += cpumask_size();
+		}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
 #ifdef CONFIG_SMP
-- 
cgit v1.1


From 4a44bac1f98223ed77e47bf3b42fcfd10cddd85f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 19 Mar 2009 13:21:44 +0100
Subject: symbols, stacktrace: look up init symbols after module symbols

Impact: fix incomplete stacktraces

I noticed such weird stacktrace entries in lockdep dumps:

[    0.285956] {HARDIRQ-ON-W} state was registered at:
[    0.285956]   [<ffffffff802bce90>] mark_irqflags+0xbe/0x125
[    0.285956]   [<ffffffff802bf2fd>] __lock_acquire+0x674/0x82d
[    0.285956]   [<ffffffff802bf5b2>] lock_acquire+0xfc/0x128
[    0.285956]   [<ffffffff8135b636>] rt_spin_lock+0xc8/0xd0
[    0.285956]   [<ffffffffffffffff>] 0xffffffffffffffff

The stacktrace entry is cut off after rt_spin_lock.

After much debugging i found out that stacktrace entries that
belong to init symbols dont get printed out, due to commit:

  a2da405: module: Don't report discarded init pages as kernel text.

The reason is this check added to core_kernel_text():

-       if (addr >= (unsigned long)_sinittext &&
+       if (system_state == SYSTEM_BOOTING &&
+           addr >= (unsigned long)_sinittext &&
            addr <= (unsigned long)_einittext)
                return 1;

This will discard inittext symbols even though their symbol table
is still present and even though stacktraces done while the system
was booting up might still be relevant.

To not reintroduce the (not well-specified) bug addressed in that
commit, first do a module symbols lookup, then a final init-symbols
lookup.

This will work fine on architectures that have separate address
spaces for modules (such as x86) - and should not crash any other
architectures either.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <new-discussion>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8..c46da6a 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
+static inline int init_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_sinittext &&
+	    addr <= (unsigned long)_einittext)
+		return 1;
+	return 0;
+}
+
 __notrace_funcgraph int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
@@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
 		return 1;
 
 	if (system_state == SYSTEM_BOOTING &&
-	    addr >= (unsigned long)_sinittext &&
-	    addr <= (unsigned long)_einittext)
+	    init_kernel_text(addr))
 		return 1;
 	return 0;
 }
@@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
-	return __module_text_address(addr) != NULL;
+	if (__module_text_address(addr))
+		return 1;
+	/*
+	 * There might be init symbols in saved stacktraces.
+	 * Give those symbols a chance to be printed in
+	 * backtraces (such as lockdep traces).
+	 *
+	 * Since we are after the module-symbols check, there's
+	 * no danger of address overlap:
+	 */
+	if (init_kernel_text(addr))
+		return 1;
+	return 0;
 }
 
 int kernel_text_address(unsigned long addr)
-- 
cgit v1.1


From 8c083f081d0014057901c68a0a3e0f8ca7ac8d23 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Mar 2009 15:22:20 +1030
Subject: cpumask: remove cpumask allocation from idle_balance, fix

Impact: fix boot crash

Fix typo in the size calculation.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <alpine.DEB.2.00.0903181729360.31583@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 48862d4..11dd527 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,7 +8297,7 @@ void __init sched_init(void)
 	alloc_size *= 2;
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
-	alloc_size *= num_possible_cpus() * cpumask_size();
+	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
 	/*
 	 * As sched_init() is called before page_alloc is setup,
-- 
cgit v1.1


From ac5f6c96859e9a664ac05b04bc96ed1caad5fe29 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 11:29:23 -0400
Subject: function-graph: consolidate prologues for output

Impact: clean up

The prologue of the function graph entry, return and comments all
start out pretty much the same. Each of these duplicate code and
do so slightly differently.

This patch consolidates the printing of the pid, absolute time,
cpu and proc (and for entry, the interrupt).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 98 +++++++++++-------------------------
 1 file changed, 29 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6004cca..2d4d185 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -554,24 +554,24 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 }
 
 static enum print_line_t
-print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
-			struct trace_iterator *iter)
+print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
+		     int type, unsigned long addr)
 {
-	int ret;
-	int cpu = iter->cpu;
-	pid_t *last_entry = iter->private;
 	struct trace_entry *ent = iter->ent;
-	struct ftrace_graph_ent *call = &field->graph_ent;
-	struct ftrace_graph_ret_entry *leaf_ret;
+	pid_t *last_pid = iter->private;
+	int cpu = iter->cpu;
+	int ret;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Interrupt */
-	ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
-	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (type) {
+		/* Interrupt */
+		ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Absolute time */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
@@ -598,6 +598,20 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
+	return 0;
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+			struct trace_iterator *iter)
+{
+	int cpu = iter->cpu;
+	struct ftrace_graph_ent *call = &field->graph_ent;
+	struct ftrace_graph_ret_entry *leaf_ret;
+
+	if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	leaf_ret = get_return_for_leaf(iter, field);
 	if (leaf_ret)
 		return print_graph_entry_leaf(iter, field, leaf_ret, s);
@@ -613,38 +627,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int i;
 	int ret;
 	int cpu = iter->cpu;
-	pid_t *last_pid = iter->private, pid = ent->pid;
+	pid_t pid = ent->pid;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
-	/* Pid */
-	if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Absolute time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
-		ret = print_graph_abs_time(iter->ts, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Cpu */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-		ret = print_graph_cpu(s, cpu);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Proc */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, ent->pid);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, " | ");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
 	/* Overhead */
 	ret = print_graph_overhead(duration, s);
 	if (!ret)
@@ -689,38 +677,10 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 {
 	int i;
 	int ret;
-	int cpu = iter->cpu;
-	pid_t *last_pid = iter->private;
 
-	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Absolute time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
-		ret = print_graph_abs_time(iter->ts, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Cpu */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-		ret = print_graph_cpu(s, cpu);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Proc */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, ent->pid);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, " | ");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
 	/* No overhead */
 	ret = print_graph_overhead(-1, s);
 	if (!ret)
-- 
cgit v1.1


From 3bf832ce1fe6988148d392599f34ca0c6a34427d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 19 Mar 2009 14:47:33 +0100
Subject: tracing/ring-buffer: fix non cpu hotplug case

Impact: fix warning with irqsoff tracer

The ring buffer allocates its buffers on pre-smp time (early_initcall).
It means that, at first, only the boot cpu buffer is allocated and
the ring-buffer cpumask only has the boot cpu set (cpu_online_mask).

Later, the secondary cpu will show up and the ring-buffer will be notified
about this event: the appropriate buffer will be allocated and the cpumask
will be updated.

Unfortunately, if !CONFIG_CPU_HOTPLUG, the ring-buffer will not be
notified about the secondary cpus, meaning that the cpumask will have
only the cpu boot set, and only one cpu buffer allocated.

We fix that by using cpu_possible_mask if !CONFIG_CPU_HOTPLUG.

This patch fixes the following warning with irqsoff tracer running:

[  169.317794] WARNING: at kernel/trace/trace.c:466 update_max_tr_single+0xcc/0xf3()
[  169.318002] Hardware name: AMILO Li 2727
[  169.318002] Modules linked in:
[  169.318002] Pid: 5624, comm: bash Not tainted 2.6.29-rc8-tip-02636-g6aafa6c #11
[  169.318002] Call Trace:
[  169.318002]  [<ffffffff81036182>] warn_slowpath+0xea/0x13d
[  169.318002]  [<ffffffff8100b9d6>] ? ftrace_call+0x5/0x2b
[  169.318002]  [<ffffffff8100b9d6>] ? ftrace_call+0x5/0x2b
[  169.318002]  [<ffffffff8100b9d1>] ? ftrace_call+0x0/0x2b
[  169.318002]  [<ffffffff8101ef10>] ? ftrace_modify_code+0xa9/0x108
[  169.318002]  [<ffffffff8106e27f>] ? trace_hardirqs_off+0x25/0x27
[  169.318002]  [<ffffffff8149afe7>] ? _spin_unlock_irqrestore+0x1f/0x2d
[  169.318002]  [<ffffffff81064f52>] ? ring_buffer_reset_cpu+0xf6/0xfb
[  169.318002]  [<ffffffff8106637c>] ? ring_buffer_reset+0x36/0x48
[  169.318002]  [<ffffffff8106aeda>] update_max_tr_single+0xcc/0xf3
[  169.318002]  [<ffffffff8100bc17>] ? sysret_check+0x22/0x5d
[  169.318002]  [<ffffffff8106e3ea>] stop_critical_timing+0x142/0x204
[  169.318002]  [<ffffffff8106e4cf>] trace_hardirqs_on_caller+0x23/0x25
[  169.318002]  [<ffffffff8149ac28>] trace_hardirqs_on_thunk+0x3a/0x3c
[  169.318002]  [<ffffffff8100bc17>] ? sysret_check+0x22/0x5d
[  169.318002] ---[ end trace db76cbf775a750cf ]---

Because this tracer may try to swap two cpu ring buffers for an
unregistered cpu on the ring buffer.

This patch might also fix a fair loss of traces due to unallocated buffers
for secondary cpus.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-b: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237470453-5427-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bbf5192..384ca5d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -577,8 +577,17 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	if (buffer->pages == 1)
 		buffer->pages++;
 
+	/*
+	 * In case of non-hotplug cpu, if the ring-buffer is allocated
+	 * in early initcall, it will not be notified of secondary cpus.
+	 * In that off case, we need to allocate for all possible cpus.
+	 */
+#ifdef CONFIG_HOTPLUG_CPU
 	get_online_cpus();
 	cpumask_copy(buffer->cpumask, cpu_online_mask);
+#else
+	cpumask_copy(buffer->cpumask, cpu_possible_mask);
+#endif
 	buffer->cpus = nr_cpu_ids;
 
 	bsize = sizeof(void *) * nr_cpu_ids;
-- 
cgit v1.1


From 5ef841f6f32dce0b752a4fa0622781ee67a0e874 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 12:20:38 -0400
Subject: tracing: make print_(b)printk_msg_only global

This patch makes print_printk_msg_only and print_bprintk_msg_only
global for other functions to use. It also renames them by adding
a "trace_" to the beginning to avoid namespace collisions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c        | 36 ++----------------------------------
 kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h |  5 +++++
 3 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8d981ab..c637cb6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1694,38 +1694,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct bprint_entry *field;
-	int ret;
-
-	trace_assign_type(field, entry);
-
-	ret = trace_seq_bprintf(s, field->fmt, field->buf);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct print_entry *field;
-	int ret;
-
-	trace_assign_type(field, entry);
-
-	ret = trace_seq_printf(s, "%s", field->buf);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
 static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1787,12 +1755,12 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 	if (iter->ent->type == TRACE_BPRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
-		return print_bprintk_msg_only(iter);
+		return trace_print_bprintk_msg_only(iter);
 
 	if (iter->ent->type == TRACE_PRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
-		return print_printk_msg_only(iter);
+		return trace_print_printk_msg_only(iter);
 
 	if (trace_flags & TRACE_ITER_BIN)
 		return print_bin_fmt(iter);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6a4c9de..b451417 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -19,6 +19,38 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
+enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct bprint_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_bprintf(s, field->fmt, field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct print_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%s", field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 /**
  * trace_seq_printf - sequence printing of trace information
  * @s: trace sequence descriptor
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 3b90e6a..35c422f 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -15,6 +15,11 @@ struct trace_event {
 	trace_print_func	binary;
 };
 
+extern enum print_line_t
+trace_print_bprintk_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_printk_msg_only(struct trace_iterator *iter);
+
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
-- 
cgit v1.1


From 2fbcdb35aca614f9529a0e7d340146cf0b71684f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 13:24:42 -0400
Subject: function-graph: calculate function depth within function graph tracer

Currently, the function graph tracer depends on the trace_printk
to record the depth. All the information is already there in the trace
to calculate function depth, with the exception of having the printk
be the first item. But as soon as a entry or exit is reached, then
we know the depth.

This patch changes the iter->private data from recording a per cpu
last_pid, to a structure that holds both the last_pid and the current
depth. This data is used to determine the function depth for the
printks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 91 +++++++++++++++++++++++++++---------
 1 file changed, 69 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2d4d185..66ea23b 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,6 +14,11 @@
 #include "trace.h"
 #include "trace_output.h"
 
+struct fgraph_data {
+	pid_t		last_pid;
+	int		depth;
+};
+
 #define TRACE_GRAPH_INDENT	2
 
 /* Flag options */
@@ -231,16 +236,16 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
 
 /* If the pid changed since the last trace, output this event */
 static enum print_line_t
-verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
+verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
 {
 	pid_t prev_pid;
 	pid_t *last_pid;
 	int ret;
 
-	if (!last_pids_cpu)
+	if (!data)
 		return TRACE_TYPE_HANDLED;
 
-	last_pid = per_cpu_ptr(last_pids_cpu, cpu);
+	last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
 
 	if (*last_pid == pid)
 		return TRACE_TYPE_HANDLED;
@@ -471,6 +476,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 		struct ftrace_graph_ent_entry *entry,
 		struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
 {
+	struct fgraph_data *data = iter->private;
 	struct ftrace_graph_ret *graph_ret;
 	struct ftrace_graph_ent *call;
 	unsigned long long duration;
@@ -481,6 +487,18 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	call = &entry->graph_ent;
 	duration = graph_ret->rettime - graph_ret->calltime;
 
+	if (data) {
+		int cpu = iter->cpu;
+		int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+		/*
+		 * Comments display at + 1 to depth. Since
+		 * this is a leaf function, keep the comments
+		 * equal to this depth.
+		 */
+		*depth = call->depth - 1;
+	}
+
 	/* Overhead */
 	ret = print_graph_overhead(duration, s);
 	if (!ret)
@@ -512,12 +530,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 }
 
 static enum print_line_t
-print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
-			struct trace_seq *s, pid_t pid, int cpu)
+print_graph_entry_nested(struct trace_iterator *iter,
+			 struct ftrace_graph_ent_entry *entry,
+			 struct trace_seq *s, int cpu)
 {
-	int i;
-	int ret;
 	struct ftrace_graph_ent *call = &entry->graph_ent;
+	struct fgraph_data *data = iter->private;
+	int ret;
+	int i;
+
+	if (data) {
+		int cpu = iter->cpu;
+		int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+		*depth = call->depth;
+	}
 
 	/* No overhead */
 	ret = print_graph_overhead(-1, s);
@@ -557,13 +584,13 @@ static enum print_line_t
 print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 		     int type, unsigned long addr)
 {
+	struct fgraph_data *data = iter->private;
 	struct trace_entry *ent = iter->ent;
-	pid_t *last_pid = iter->private;
 	int cpu = iter->cpu;
 	int ret;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	if (type) {
@@ -616,7 +643,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	if (leaf_ret)
 		return print_graph_entry_leaf(iter, field, leaf_ret, s);
 	else
-		return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
+		return print_graph_entry_nested(iter, field, s, cpu);
 
 }
 
@@ -624,11 +651,24 @@ static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 		   struct trace_entry *ent, struct trace_iterator *iter)
 {
-	int i;
-	int ret;
-	int cpu = iter->cpu;
-	pid_t pid = ent->pid;
 	unsigned long long duration = trace->rettime - trace->calltime;
+	struct fgraph_data *data = iter->private;
+	pid_t pid = ent->pid;
+	int cpu = iter->cpu;
+	int ret;
+	int i;
+
+	if (data) {
+		int cpu = iter->cpu;
+		int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+		/*
+		 * Comments display at + 1 to depth. This is the
+		 * return from a function, we now want the comments
+		 * to display at the same level of the bracket.
+		 */
+		*depth = trace->depth - 1;
+	}
 
 	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -675,8 +715,13 @@ static enum print_line_t
 print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 		   struct trace_entry *ent, struct trace_iterator *iter)
 {
-	int i;
+	struct fgraph_data *data = iter->private;
+	int depth = 0;
 	int ret;
+	int i;
+
+	if (data)
+		depth = per_cpu_ptr(data, iter->cpu)->depth;
 
 	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -694,8 +739,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 	}
 
 	/* Indentation */
-	if (trace->depth > 0)
-		for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
+	if (depth > 0)
+		for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
 			ret = trace_seq_printf(s, " ");
 			if (!ret)
 				return TRACE_TYPE_PARTIAL_LINE;
@@ -780,19 +825,21 @@ static void print_graph_headers(struct seq_file *s)
 
 static void graph_trace_open(struct trace_iterator *iter)
 {
-	/* pid on the last trace processed */
-	pid_t *last_pid = alloc_percpu(pid_t);
+	/* pid and depth on the last trace processed */
+	struct fgraph_data *data = alloc_percpu(struct fgraph_data);
 	int cpu;
 
-	if (!last_pid)
+	if (!data)
 		pr_warning("function graph tracer: not enough memory\n");
 	else
 		for_each_possible_cpu(cpu) {
-			pid_t *pid = per_cpu_ptr(last_pid, cpu);
+			pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
+			int *depth = &(per_cpu_ptr(data, cpu)->depth);
 			*pid = -1;
+			*depth = 0;
 		}
 
-	iter->private = last_pid;
+	iter->private = data;
 }
 
 static void graph_trace_close(struct trace_iterator *iter)
-- 
cgit v1.1


From 40ce74f19c28077550646c76d96a075bf312e461 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 14:03:53 -0400
Subject: tracing: remove recording function depth from trace_printk

The function depth in trace_printk was to facilitate the function
graph output. Now that the function graph calculates the depth within
the trace output, we no longer need to record the depth when the
trace_printk is called.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c             | 8 +++-----
 kernel/trace/trace.h             | 6 ++----
 kernel/trace/trace_event_types.h | 2 --
 kernel/trace/trace_mmiotrace.c   | 2 +-
 kernel/trace/trace_printk.c      | 8 ++++----
 5 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c637cb6..f7f359d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1194,7 +1194,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
  * trace_vbprintk - write binary msg to tracing buffer
  *
  */
-int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 {
 	static raw_spinlock_t trace_buf_lock =
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -1236,7 +1236,6 @@ int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
 	entry->ip			= ip;
-	entry->depth			= depth;
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
@@ -1254,7 +1253,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(trace_vbprintk);
 
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 {
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
@@ -1291,7 +1290,6 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
 	entry->ip			= ip;
-	entry->depth			= depth;
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
@@ -3140,7 +3138,7 @@ static int mark_printk(const char *fmt, ...)
 	int ret;
 	va_list args;
 	va_start(args, fmt);
-	ret = trace_vprintk(0, -1, fmt, args);
+	ret = trace_vprintk(0, fmt, args);
 	va_end(args);
 	return ret;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 38276d1..7c9a0cb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -123,7 +123,6 @@ struct userstack_entry {
 struct bprint_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
-	int			depth;
 	const char		*fmt;
 	u32			buf[];
 };
@@ -131,7 +130,6 @@ struct bprint_entry {
 struct print_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
-	int			depth;
 	char			buf[];
 };
 
@@ -598,9 +596,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 extern void *head_page(struct trace_array_cpu *data);
 extern long ns2usecs(cycle_t nsec);
 extern int
-trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
 extern int
-trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+trace_vprintk(unsigned long ip, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 0199150..fd78bee 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -105,7 +105,6 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(unsigned int, depth, depth)
 		TRACE_FIELD(char *, fmt, fmt)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
@@ -115,7 +114,6 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(unsigned int, depth, depth)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
 	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f095916..8e37fcd 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -359,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 
 int mmio_trace_printk(const char *fmt, va_list args)
 {
-	return trace_vprintk(0, -1, fmt, args);
+	return trace_vprintk(0, fmt, args);
 }
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 4867852..eb81556 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -112,7 +112,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	ret = trace_vbprintk(ip, fmt, ap);
 	va_end(ap);
 	return ret;
 }
@@ -126,7 +126,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
 	if (!(trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	return trace_vbprintk(ip, fmt, ap);
 }
 EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
 
@@ -139,7 +139,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	ret = trace_vprintk(ip, fmt, ap);
 	va_end(ap);
 	return ret;
 }
@@ -150,7 +150,7 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 	if (!(trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	return trace_vprintk(ip, fmt, ap);
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
-- 
cgit v1.1


From 5087f8d2a2f2daff5a913d72d8ea3ad601948e10 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 15:14:46 -0400
Subject: function-graph: show binary events as comments

With the added TRACE_EVENT macro, the events no longer appear in
the function graph tracer. This was because the function graph
did not know how to display the entries. The graph tracer was
only aware of its own entries and the printk entries.

By using the event call back feature, the graph tracer can now display
the events.

 # echo irq > /debug/tracing/set_event

Which can show:

 0)               |          handle_IRQ_event() {
 0)               |            /* irq_handler_entry: irq=48 handler=eth0 */
 0)               |            e1000_intr() {
 0)   0.926 us    |              __napi_schedule();
 0)   3.888 us    |            }
 0)               |            /* irq_handler_exit: irq=48 return=handled */
 0)   0.655 us    |            runqueue_is_locked();
 0)               |            __wake_up() {
 0)   0.831 us    |              _spin_lock_irqsave();

The irq entry and exit events show up as comments.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 40 +++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66ea23b..e876816 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -712,10 +712,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 }
 
 static enum print_line_t
-print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
-		   struct trace_entry *ent, struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
+		    struct trace_iterator *iter)
 {
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct fgraph_data *data = iter->private;
+	struct trace_event *event;
 	int depth = 0;
 	int ret;
 	int i;
@@ -751,9 +753,26 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_bprintf(s, trace->fmt, trace->buf);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	switch (iter->ent->type) {
+	case TRACE_BPRINT:
+		ret = trace_print_bprintk_msg_only(iter);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+		break;
+	case TRACE_PRINT:
+		ret = trace_print_printk_msg_only(iter);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+		break;
+	default:
+		event = ftrace_find_event(ent->type);
+		if (!event)
+			return TRACE_TYPE_UNHANDLED;
+
+		ret = event->trace(iter, sym_flags);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+	}
 
 	/* Strip ending newline */
 	if (s->buffer[s->len - 1] == '\n') {
@@ -772,8 +791,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
-	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
 
 	switch (entry->type) {
 	case TRACE_GRAPH_ENT: {
@@ -786,14 +805,11 @@ print_graph_function(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 		return print_graph_return(&field->ret, s, entry, iter);
 	}
-	case TRACE_BPRINT: {
-		struct bprint_entry *field;
-		trace_assign_type(field, entry);
-		return print_graph_comment(field, s, entry, iter);
-	}
 	default:
-		return TRACE_TYPE_UNHANDLED;
+		return print_graph_comment(s, entry, iter);
 	}
+
+	return TRACE_TYPE_HANDLED;
 }
 
 static void print_graph_headers(struct seq_file *s)
-- 
cgit v1.1


From 23725aeeab10ba02bcf10ec49ad73146b54cb52f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:13 +0100
Subject: ftrace: provide an id file for each event

Since not every event has a format file to read the id from,
expose it explicitly in a separate file.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090319194233.372534033@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c88227b..7763db8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -412,6 +412,29 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return r;
 }
 
+static ssize_t
+event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+	trace_seq_printf(s, "%d\n", call->id);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, s->len);
+	kfree(s);
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -452,6 +475,11 @@ static const struct file_operations ftrace_event_format_fops = {
 	.read = event_format_read,
 };
 
+static const struct file_operations ftrace_event_id_fops = {
+	.open = tracing_open_generic,
+	.read = event_id_read,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -550,6 +578,14 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   "'%s/enable' entry\n", call->name);
 	}
 
+	if (call->id) {
+		entry = debugfs_create_file("id", 0444, call->dir, call,
+				&ftrace_event_id_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs '%s/id' entry\n",
+					call->name);
+	}
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
-- 
cgit v1.1


From 28bea271e58e429eccfad3d7ee2ad12d6ee015bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:14 +0100
Subject: ftrace: ensure every event gets an id

Impact: widen user-space visibe event IDs to all events

Previously only TRACE_EVENT events got ids, because only they
generated raw output which needs to be demuxed from the trace.

In order to provide a unique ID for each event, register everybody,
regardless.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090319194233.464914218@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_stage_3.h | 15 ++++++++++++++-
 kernel/trace/trace_output.c         |  5 +++++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index ae2e323..4c26d97 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -130,7 +130,19 @@ static void ftrace_unreg_event_##call(void)				\
 {									\
 	unregister_trace_##call(ftrace_event_##call);			\
 }									\
-
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static int ftrace_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(NULL);				\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	return 0;							\
+}
 
 #undef TRACE_FORMAT
 #define TRACE_FORMAT(call, proto, args, fmt)				\
@@ -140,6 +152,7 @@ __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_init_event_##call,		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b451417..19261fd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -481,6 +481,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	mutex_lock(&trace_event_mutex);
 
+	if (!event) {
+		ret = next_event_type++;
+		goto out;
+	}
+
 	if (!event->type)
 		event->type = next_event_type++;
 	else if (event->type > __TRACE_LAST_TYPE) {
-- 
cgit v1.1


From ac199db0189c091f2863312061c0575937f68810 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:15 +0100
Subject: ftrace: event profile hooks

Impact: new tracing infrastructure feature

Provide infrastructure to generate software perf counter events
from tracepoints.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090319194233.557364871@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile               |  1 +
 kernel/trace/events.c               |  1 -
 kernel/trace/trace.h                | 11 ++++++++++
 kernel/trace/trace_event_profile.c  | 31 ++++++++++++++++++++++++++
 kernel/trace/trace_events.c         |  9 ++------
 kernel/trace/trace_events_stage_3.h | 44 +++++++++++++++++++++++++++++++++++++
 6 files changed, 89 insertions(+), 8 deletions(-)
 create mode 100644 kernel/trace/trace_event_profile.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c3feea0..0e45c20 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 9fc918d..246f2aa 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -12,4 +12,3 @@
 #include "trace_events_stage_2.h"
 #include "trace_events_stage_3.h"
 
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7c9a0cb..7cfb741 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -785,12 +785,23 @@ struct ftrace_event_call {
 	int		id;
 	int		(*raw_init)(void);
 	int		(*show_format)(struct trace_seq *s);
+
+#ifdef CONFIG_EVENT_PROFILE
+	atomic_t	profile_count;
+	int		(*profile_enable)(struct ftrace_event_call *);
+	void		(*profile_disable)(struct ftrace_event_call *);
+#endif
 };
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
+#define for_each_event(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
new file mode 100644
index 0000000..22cba99
--- /dev/null
+++ b/kernel/trace/trace_event_profile.c
@@ -0,0 +1,31 @@
+/*
+ * trace event based perf counter profiling
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ */
+
+#include "trace.h"
+
+int ftrace_profile_enable(int event_id)
+{
+	struct ftrace_event_call *event;
+
+	for_each_event(event) {
+		if (event->id == event_id)
+			return event->profile_enable(event);
+	}
+
+	return -EINVAL;
+}
+
+void ftrace_profile_disable(int event_id)
+{
+	struct ftrace_event_call *event;
+
+	for_each_event(event) {
+		if (event->id == event_id)
+			return event->profile_disable(event);
+	}
+}
+
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7763db8..3047b56 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,11 +19,6 @@
 
 static DEFINE_MUTEX(event_mutex);
 
-#define events_for_each(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;
@@ -90,7 +85,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	}
 
 	mutex_lock(&event_mutex);
-	events_for_each(call) {
+	for_each_event(call) {
 
 		if (!call->name || !call->regfunc)
 			continue;
@@ -628,7 +623,7 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
-	events_for_each(call) {
+	for_each_event(call) {
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 4c26d97..6b3261c 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -109,6 +109,40 @@
 #undef TP_FMT
 #define TP_FMT(fmt, args...)	fmt "\n", ##args
 
+#ifdef CONFIG_EVENT_PROFILE
+#define _TRACE_PROFILE(call, proto, args)				\
+static void ftrace_profile_##call(proto)				\
+{									\
+	extern void perf_tpcounter_event(int);				\
+	perf_tpcounter_event(event_##call.id);				\
+}									\
+									\
+static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
+{									\
+	int ret = 0;							\
+									\
+	if (!atomic_inc_return(&call->profile_count))			\
+		ret = register_trace_##call(ftrace_profile_##call);	\
+									\
+	return ret;							\
+}									\
+									\
+static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
+{									\
+	if (atomic_add_negative(-1, &call->profile_count))		\
+		unregister_trace_##call(ftrace_profile_##call);		\
+}
+
+#define _TRACE_PROFILE_INIT(call)					\
+	.profile_count = ATOMIC_INIT(-1),				\
+	.profile_enable = ftrace_profile_enable_##call,			\
+	.profile_disable = ftrace_profile_disable_##call,
+
+#else
+#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE_INIT(call)
+#endif
+
 #define _TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
 {									\
@@ -147,6 +181,7 @@ static int ftrace_init_event_##call(void)				\
 #undef TRACE_FORMAT
 #define TRACE_FORMAT(call, proto, args, fmt)				\
 _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
@@ -155,6 +190,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.raw_init		= ftrace_init_event_##call,		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
+	_TRACE_PROFILE_INIT(call)					\
 }
 
 #undef __entry
@@ -162,6 +198,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
@@ -227,4 +264,11 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
+	_TRACE_PROFILE_INIT(call)					\
 }
+
+#include <trace/trace_event_types.h>
+
+#undef _TRACE_PROFILE
+#undef _TRACE_PROFILE_INIT
+
-- 
cgit v1.1


From 505f2b970b2269ce4cb669b3ff4f6479d379cec2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 20 Mar 2009 11:05:04 +0100
Subject: tracing, Text Edit Lock - kprobes architecture independent support,
 nommu fix

Impact: build fix on SH !CONFIG_MMU

Stephen Rothwell reported this linux-next build failure on the SH
architecture:

  kernel/built-in.o: In function `disable_all_kprobes':
  kernel/kprobes.c:1382: undefined reference to `text_mutex'
  [...]

And observed:

| Introduced by commit 4460fdad85becd569f11501ad5b91814814335ff ("tracing,
| Text Edit Lock - kprobes architecture independent support") from the
| tracing tree.  text_mutex is defined in mm/memory.c which is only built
| if CONFIG_MMU is defined, which is not true for sh allmodconfig.

Move this lock to kernel/extable.c (which is already home to various
kernel text related routines), which file is always built-in.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
LKML-Reference: <20090320110602.86351a91.sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 0df6253..25d39b0 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -15,11 +15,21 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+#include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/init.h>
-#include <linux/ftrace.h>
-#include <asm/uaccess.h>
+
 #include <asm/sections.h>
+#include <asm/uaccess.h>
+
+/*
+ * mutex protecting text section modification (dynamic code patching).
+ * some users need to sleep (allocating memory...) while they hold this lock.
+ *
+ * NOT exported to modules - patching kernel text is a really delicate matter.
+ */
+DEFINE_MUTEX(text_mutex);
 
 extern struct exception_table_entry __start___ex_table[];
 extern struct exception_table_entry __stop___ex_table[];
-- 
cgit v1.1


From 09c9e84d474d917d9de5b9011ed2064b03a19677 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 21 Mar 2009 04:33:36 +0100
Subject: tracing/ring-buffer: don't annotate rb_cpu_notify with __cpuinit

Impact: remove a section warning

CONFIG_DEBUG_SECTION_MISMATCH raises the following warning on -tip:

  WARNING: kernel/trace/built-in.o(.text+0x5bc5): Section mismatch in
  reference from the function ring_buffer_alloc() to the function
  .cpuinit.text:rb_cpu_notify()
  The function ring_buffer_alloc() references
  the function __cpuinit rb_cpu_notify().

This is actually harmless. The code in the ring buffer don't build
rb_cpu_notify and other cpu hotplug stuffs when !CONFIG_HOTPLUG_CPU
so we have no risk to reference freed memory here (it would even
be harmless if we unconditionally build it because register_cpu_notifier
would do nothing when !CONFIG_HOTPLUG_CPU.

But since ring_buffer_alloc() can be called everytime, we don't want it
to be annotated with __cpuinit so we drop the __cpuinit from
rb_cpu_notify.

This is not a waste of memory because it is only defined and used on
CONFIG_HOTPLUG_CPU.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237606416-22268-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 384ca5d..808b14b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -535,8 +535,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 extern int ring_buffer_page_too_big(void);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int __cpuinit rb_cpu_notify(struct notifier_block *self,
-				   unsigned long action, void *hcpu);
+static int rb_cpu_notify(struct notifier_block *self,
+			 unsigned long action, void *hcpu);
 #endif
 
 /**
@@ -2784,8 +2784,8 @@ static __init int rb_init_debugfs(void)
 fs_initcall(rb_init_debugfs);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int __cpuinit rb_cpu_notify(struct notifier_block *self,
-				   unsigned long action, void *hcpu)
+static int rb_cpu_notify(struct notifier_block *self,
+			 unsigned long action, void *hcpu)
 {
 	struct ring_buffer *buffer =
 		container_of(self, struct ring_buffer, cpu_notify);
-- 
cgit v1.1


From 1a17662ea033674a58bad3603531b0b5d42572f6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:47:30 +0800
Subject: blktrace: fix possible memory leak

When we failed to create "block" debugfs dir, we should do some
cleanups.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F5B2.8000800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b171778..fb3bc53 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -432,7 +432,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!blk_tree_root) {
 		blk_tree_root = debugfs_create_dir("block", NULL);
 		if (!blk_tree_root)
-			return -ENOMEM;
+			goto err;
 	}
 
 	dir = debugfs_create_dir(buts->name, blk_tree_root);
-- 
cgit v1.1


From 5006ea73f38caef6065d1136808413813271633f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:48:03 +0800
Subject: blktrace: make blk_tracer_enabled a bool flag

It doesn't have to be a counter, and it can be a bool flag instead.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C2F5D3.8090104@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb3bc53..73845b7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -30,7 +30,7 @@
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
-static int __read_mostly  blk_tracer_enabled;
+static bool blk_tracer_enabled __read_mostly;
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_BLK_OPT_CLASSIC	0x1
@@ -1111,9 +1111,7 @@ static int blk_tracer_init(struct trace_array *tr)
 {
 	blk_tr = tr;
 	blk_tracer_start(tr);
-	mutex_lock(&blk_probe_mutex);
-	blk_tracer_enabled++;
-	mutex_unlock(&blk_probe_mutex);
+	blk_tracer_enabled = true;
 	return 0;
 }
 
@@ -1131,11 +1129,7 @@ static void blk_tracer_reset(struct trace_array *tr)
 	if (!atomic_read(&blk_probes_ref))
 		return;
 
-	mutex_lock(&blk_probe_mutex);
-	blk_tracer_enabled--;
-	WARN_ON(blk_tracer_enabled < 0);
-	mutex_unlock(&blk_probe_mutex);
-
+	blk_tracer_enabled = false;
 	blk_tracer_stop(tr);
 }
 
-- 
cgit v1.1


From 3c289ba7c320560ee74979a8895141c829046a2d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:48:26 +0800
Subject: blktrace: remove blk_probe_mutex

blk_register_tracepoints() always returns 0, so make it return void,
thus we don't need to use blk_probe_mutex to protect blk_probes_ref.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F5EA.8060606@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 73845b7..223b92e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = {
 };
 
 /* Global reference count of probes */
-static DEFINE_MUTEX(blk_probe_mutex);
 static atomic_t blk_probes_ref = ATOMIC_INIT(0);
 
-static int blk_register_tracepoints(void);
+static void blk_register_tracepoints(void);
 static void blk_unregister_tracepoints(void);
 
 /*
@@ -256,10 +255,8 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
-	mutex_lock(&blk_probe_mutex);
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
-	mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -471,13 +468,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
-	mutex_lock(&blk_probe_mutex);
-	if (atomic_add_return(1, &blk_probes_ref) == 1) {
-		ret = blk_register_tracepoints();
-		if (ret)
-			goto probe_err;
-	}
-	mutex_unlock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		blk_register_tracepoints();
 
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
@@ -487,9 +479,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 
 	return 0;
-probe_err:
-	atomic_dec(&blk_probes_ref);
-	mutex_unlock(&blk_probe_mutex);
 err:
 	if (bt) {
 		if (bt->msg_file)
@@ -863,7 +852,7 @@ void blk_add_driver_data(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
-static int blk_register_tracepoints(void)
+static void blk_register_tracepoints(void)
 {
 	int ret;
 
@@ -901,7 +890,6 @@ static int blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_remap(blk_add_trace_remap);
 	WARN_ON(ret);
-	return 0;
 }
 
 static void blk_unregister_tracepoints(void)
@@ -1099,11 +1087,8 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
-	mutex_lock(&blk_probe_mutex);
 	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		if (blk_register_tracepoints())
-			atomic_dec(&blk_probes_ref);
-	mutex_unlock(&blk_probe_mutex);
+		blk_register_tracepoints();
 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
@@ -1118,10 +1103,8 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
-	mutex_lock(&blk_probe_mutex);
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
-	mutex_unlock(&blk_probe_mutex);
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
-- 
cgit v1.1


From cbe28296eb1ac441b35cf45804d0ae808add7dd1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:48:47 +0800
Subject: blktrace: don't increase blk_probes_ref if failed to setup blk trace

do_blk_trace_setup() may return EBUSY, but the current code
doesn't decrease blk_probes_ref in this case.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F5FF.80002@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 223b92e..11e7c8d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -468,9 +468,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		blk_register_tracepoints();
-
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt) {
@@ -478,6 +475,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		goto err;
 	}
 
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		blk_register_tracepoints();
+
 	return 0;
 err:
 	if (bt) {
-- 
cgit v1.1


From 15152e448b693fa41de40f1e40ffbe717a3aab88 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:49:08 +0800
Subject: blktrace: report EBUSY correctly

blk_trace_remove_queue() returns EINVAL if q->blk_trace == NULL,
but blk_trace_setup_queue() doesn't return EBUSY if
q->blk_trace != NULL.

 # echo 0 > sdaX/trace/enable
 # echo 0 > sdaX/trace/enable
 bash: echo: write error: Invalid argument
 # echo 1 > sdaX/trace/enable
 # echo 1 > sdaX/trace/enable
 (should return EBUSY)

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F614.2010101@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 11e7c8d..14986af 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1260,12 +1260,10 @@ static int blk_trace_remove_queue(struct request_queue *q)
 static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
-	int ret;
 
-	ret = -ENOMEM;
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
-		goto err;
+		return -ENOMEM;
 
 	bt->dev = dev;
 	bt->act_mask = (u16)-1;
@@ -1276,11 +1274,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	if (old_bt != NULL) {
 		(void)xchg(&q->blk_trace, old_bt);
 		kfree(bt);
-		ret = -EBUSY;
+		return -EBUSY;
 	}
+
 	return 0;
-err:
-	return ret;
 }
 
 /*
-- 
cgit v1.1


From cd649b8bb830d65c57c3c8b98d57b5402256d8bd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 11:33:55 +0800
Subject: blktrace: remove sysfs_blk_trace_enable_show/store()

sysfs_blk_trace_enable_show()/store() share most of code with
sysfs_blk_trace_attr_show()/store().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C30EA3.1060004@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 92 ++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 14986af..dfee6f9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1284,72 +1284,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
  * sysfs interface to enable and configure tracing
  */
 
-static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	struct hd_struct *p = dev_to_part(dev);
-	struct block_device *bdev;
-	ssize_t ret = -ENXIO;
-
-	lock_kernel();
-	bdev = bdget(part_devt(p));
-	if (bdev != NULL) {
-		struct request_queue *q = bdev_get_queue(bdev);
-
-		if (q != NULL) {
-			mutex_lock(&bdev->bd_mutex);
-			ret = sprintf(buf, "%u\n", !!q->blk_trace);
-			mutex_unlock(&bdev->bd_mutex);
-		}
-
-		bdput(bdev);
-	}
-
-	unlock_kernel();
-	return ret;
-}
-
-static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	struct block_device *bdev;
-	struct request_queue *q;
-	struct hd_struct *p;
-	int value;
-	ssize_t ret = -ENXIO;
-
-	if (count == 0 || sscanf(buf, "%d", &value) != 1)
-		goto out;
-
-	lock_kernel();
-	p = dev_to_part(dev);
-	bdev = bdget(part_devt(p));
-	if (bdev == NULL)
-		goto out_unlock_kernel;
-
-	q = bdev_get_queue(bdev);
-	if (q == NULL)
-		goto out_bdput;
-
-	mutex_lock(&bdev->bd_mutex);
-	if (value)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
-	else
-		ret = blk_trace_remove_queue(q);
-	mutex_unlock(&bdev->bd_mutex);
-
-	if (ret == 0)
-		ret = count;
-out_bdput:
-	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
-out:
-	return ret;
-}
-
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf);
@@ -1361,8 +1295,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 		    sysfs_blk_trace_attr_show, \
 		    sysfs_blk_trace_attr_store)
 
-static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
-		   sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
+static BLK_TRACE_DEVICE_ATTR(enable);
 static BLK_TRACE_DEVICE_ATTR(act_mask);
 static BLK_TRACE_DEVICE_ATTR(pid);
 static BLK_TRACE_DEVICE_ATTR(start_lba);
@@ -1447,6 +1380,12 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 	mutex_lock(&bdev->bd_mutex);
+
+	if (attr == &dev_attr_enable) {
+		ret = sprintf(buf, "%u\n", !!q->blk_trace);
+		goto out_unlock_bdev;
+	}
+
 	if (q->blk_trace == NULL)
 		ret = sprintf(buf, "disabled\n");
 	else if (attr == &dev_attr_act_mask)
@@ -1457,6 +1396,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
 	else if (attr == &dev_attr_end_lba)
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+
+out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 out_bdput:
 	bdput(bdev);
@@ -1499,6 +1440,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 		goto out_bdput;
 
 	mutex_lock(&bdev->bd_mutex);
+
+	if (attr == &dev_attr_enable) {
+		if (value)
+			ret = blk_trace_setup_queue(q, bdev->bd_dev);
+		else
+			ret = blk_trace_remove_queue(q);
+		goto out_unlock_bdev;
+	}
+
 	ret = 0;
 	if (q->blk_trace == NULL)
 		ret = blk_trace_setup_queue(q, bdev->bd_dev);
@@ -1512,13 +1462,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 			q->blk_trace->start_lba = value;
 		else if (attr == &dev_attr_end_lba)
 			q->blk_trace->end_lba = value;
-		ret = count;
 	}
+
+out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 out_bdput:
 	bdput(bdev);
 out_unlock_kernel:
 	unlock_kernel();
 out:
-	return ret;
+	return ret ? ret : count;
 }
+
-- 
cgit v1.1


From b125130b22d67f249beba10b71a254558b5279d0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 10:34:00 +0800
Subject: blktrace: avoid accessing NULL bdev->bd_disk

bdev->bd_disk can be NULL, if the block device is not opened.

Try this against an unmounted partition, and you'll see NULL dereference:

  # echo 1 > /sys/block/sda/sda5/enable

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C30098.6080107@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index dfee6f9..108f4f7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1362,6 +1362,14 @@ static int blk_str2act_mask(const char *str)
 	return mask;
 }
 
+static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
+{
+	if (bdev->bd_disk == NULL)
+		return NULL;
+
+	return bdev_get_queue(bdev);
+}
+
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -1376,9 +1384,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (bdev == NULL)
 		goto out_unlock_kernel;
 
-	q = bdev_get_queue(bdev);
+	q = blk_trace_get_queue(bdev);
 	if (q == NULL)
 		goto out_bdput;
+
 	mutex_lock(&bdev->bd_mutex);
 
 	if (attr == &dev_attr_enable) {
@@ -1435,7 +1444,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (bdev == NULL)
 		goto out_unlock_kernel;
 
-	q = bdev_get_queue(bdev);
+	q = blk_trace_get_queue(bdev);
 	if (q == NULL)
 		goto out_bdput;
 
-- 
cgit v1.1


From cf586b61f80229491127d3c57c06ed93c9f530d3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 05:04:35 +0100
Subject: tracing/function-graph-tracer: prevent hangs during self-tests

Impact: detect tracing related hangs

Sometimes, with some configs, the function graph tracer can make
the timer interrupt too much slow, hanging the kernel in an endless
loop of timer interrupts servicing.

As suggested by Ingo, this patch brings a watchdog which stops the
selftest after a defined number of functions traced, definitely
disabling this tracer.

For those who want to debug the cause of the function graph trace
hang, you can pass the ftrace_dump_on_oops kernel parameter to dump
the traces after this hang detection.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c          | 26 +++++++++++++++++++++++---
 kernel/trace/trace_selftest.c | 38 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e3dfefe..e6fac0f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4018,11 +4018,12 @@ trace_printk_seq(struct trace_seq *s)
 	trace_seq_init(s);
 }
 
-void ftrace_dump(void)
+static void __ftrace_dump(bool disable_tracing)
 {
 	static DEFINE_SPINLOCK(ftrace_dump_lock);
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
+	unsigned int old_userobj;
 	static int dump_ran;
 	unsigned long flags;
 	int cnt = 0, cpu;
@@ -4034,14 +4035,17 @@ void ftrace_dump(void)
 
 	dump_ran = 1;
 
-	/* No turning back! */
 	tracing_off();
-	ftrace_kill();
+
+	if (disable_tracing)
+		ftrace_kill();
 
 	for_each_tracing_cpu(cpu) {
 		atomic_inc(&global_trace.data[cpu]->disabled);
 	}
 
+	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
+
 	/* don't look at user memory in panic mode */
 	trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
 
@@ -4086,10 +4090,26 @@ void ftrace_dump(void)
 	else
 		printk(KERN_TRACE "---------------------------------\n");
 
+	/* Re-enable tracing if requested */
+	if (!disable_tracing) {
+		trace_flags |= old_userobj;
+
+		for_each_tracing_cpu(cpu) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+		tracing_on();
+	}
+
  out:
 	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
 }
 
+/* By default: disable tracing after the dump */
+void ftrace_dump(void)
+{
+	__ftrace_dump(true);
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 38856ba..b56dcf7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -248,6 +248,28 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Maximum number of functions to trace before diagnosing a hang */
+#define GRAPH_MAX_FUNC_TEST	100000000
+
+static void __ftrace_dump(bool disable_tracing);
+static unsigned int graph_hang_thresh;
+
+/* Wrap the real function entry probe to avoid possible hanging */
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+{
+	/* This is harmlessly racy, we want to approximately detect a hang */
+	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
+		ftrace_graph_stop();
+		printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
+		if (ftrace_dump_on_oops)
+			__ftrace_dump(false);
+		return 0;
+	}
+
+	return trace_graph_entry(trace);
+}
+
 /*
  * Pretty much the same than for the function tracer from which the selftest
  * has been borrowed.
@@ -259,15 +281,29 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	int ret;
 	unsigned long count;
 
-	ret = tracer_init(trace, tr);
+	/*
+	 * Simulate the init() callback but we attach a watchdog callback
+	 * to detect and recover from possible hangs
+	 */
+	tracing_reset_online_cpus(tr);
+	ret = register_ftrace_graph(&trace_graph_return,
+				    &trace_graph_entry_watchdog);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
 	}
+	tracing_start_cmdline_record();
 
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 
+	/* Have we just recovered from a hang? */
+	if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
+		trace->reset(tr);
+		ret = -1;
+		goto out;
+	}
+
 	tracing_stop();
 
 	/* check the trace buffer */
-- 
cgit v1.1


From 0cf53ff62b3e9e491ff5e5f05b193fb6ce643047 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 15:13:07 +0100
Subject: tracing: keep the tracing buffer after self-test failure

Instead of using ftrace_dump_on_oops, it's far more convenient
to have the trace leading up to a self-test failure available
in /debug/tracing/trace.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b56dcf7..08f4eb2 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -299,7 +299,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 
 	/* Have we just recovered from a hang? */
 	if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
-		trace->reset(tr);
+		tracing_selftest_disabled = true;
 		ret = -1;
 		goto out;
 	}
-- 
cgit v1.1


From cf027f645e6aee4f0ca6197a6b6a57f327fdb13f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:30:39 -0500
Subject: tracing: add run-time field descriptions for event filtering

This patch makes the field descriptions defined for event tracing
available at run-time, for the event-filtering mechanism introduced
in a subsequent patch.

The common event fields are prepended with 'common_' in the format
display, allowing them to be distinguished from the other fields
that might internally have same name and can therefore be
unambiguously used in filters.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237710639.7703.46.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h                | 30 +++++++++++++++++--------
 kernel/trace/trace_events.c         | 40 ++++++++++++++++++++++++++++++++-
 kernel/trace/trace_events_stage_2.h | 45 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_stage_3.h |  2 ++
 4 files changed, 107 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7cfb741..9288dc7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -775,16 +775,26 @@ enum {
 	TRACE_EVENT_TYPE_RAW		= 2,
 };
 
+struct ftrace_event_field {
+	struct list_head	link;
+	char			*name;
+	char			*type;
+	int			offset;
+	int			size;
+};
+
 struct ftrace_event_call {
-	char		*name;
-	char		*system;
-	struct dentry	*dir;
-	int		enabled;
-	int		(*regfunc)(void);
-	void		(*unregfunc)(void);
-	int		id;
-	int		(*raw_init)(void);
-	int		(*show_format)(struct trace_seq *s);
+	char			*name;
+	char			*system;
+	struct dentry		*dir;
+	int			enabled;
+	int			(*regfunc)(void);
+	void			(*unregfunc)(void);
+	int			id;
+	int			(*raw_init)(void);
+	int			(*show_format)(struct trace_seq *s);
+	int			(*define_fields)(void);
+	struct list_head	fields;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
@@ -793,6 +803,8 @@ struct ftrace_event_call {
 #endif
 };
 
+int trace_define_field(struct ftrace_event_call *call, char *type,
+		       char *name, int offset, int size);
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3047b56..961b057 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,6 +19,34 @@
 
 static DEFINE_MUTEX(event_mutex);
 
+int trace_define_field(struct ftrace_event_call *call, char *type,
+		       char *name, int offset, int size)
+{
+	struct ftrace_event_field *field;
+
+	field = kmalloc(sizeof(*field), GFP_KERNEL);
+	if (!field)
+		goto err;
+	field->name = kstrdup(name, GFP_KERNEL);
+	if (!field->name)
+		goto err;
+	field->type = kstrdup(type, GFP_KERNEL);
+	if (!field->type)
+		goto err;
+	field->offset = offset;
+	field->size = size;
+	list_add(&field->link, &call->fields);
+
+	return 0;
+err:
+	if (field) {
+		kfree(field->name);
+		kfree(field->type);
+	}
+	kfree(field);
+	return -ENOMEM;
+}
+
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;
@@ -343,7 +371,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 #undef FIELD
 #define FIELD(type, name)						\
-	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
+	#type, "common_" #name, offsetof(typeof(field), name),		\
+		sizeof(field.name)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -581,6 +610,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 					call->name);
 	}
 
+	if (call->define_fields) {
+		ret = call->define_fields();
+		if (ret < 0) {
+			pr_warning("Could not initialize trace point"
+				   " events/%s\n", call->name);
+			return ret;
+		}
+	}
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 5117c43..30743f7 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s)				\
 }
 
 #include <trace/trace_event_types.h>
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_raw_##call field;					\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 6b3261c..468938f 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -252,6 +252,7 @@ static int ftrace_raw_init_event_##call(void)				\
 	if (!id)							\
 		return -ENODEV;						\
 	event_##call.id = id;						\
+	INIT_LIST_HEAD(&event_##call.fields);				\
 	return 0;							\
 }									\
 									\
@@ -264,6 +265,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
-- 
cgit v1.1


From f80d2d7725b04f8225b11b55e43bb2c77c819926 Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Sun, 22 Mar 2009 19:11:10 +0200
Subject: tracing, Text Edit Lock: Fix one sparse warning in kernel/extable.c

Impact: cleanup.

The global mutex text_mutex if declared in linux/memory.h, so
this file needs to be included into kernel/extable.c, where the
same mutex is defined. This fixes the following sparse warning:

 kernel/extable.c:32:1: warning: symbol 'text_mutex' was not declared.
 Should it be static?

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
LKML-Reference: <1237741871-5827-3-git-send-email-dmitri.vorobiev@movial.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 25d39b0..b54a601 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -16,6 +16,7 @@
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 #include <linux/ftrace.h>
+#include <linux/memory.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
-- 
cgit v1.1


From b8b94265337f83b7db9c5f429b1769d463d7da8c Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Sun, 22 Mar 2009 19:11:11 +0200
Subject: tracing: fix four sparse warnings

Impact: cleanup.

This patch fixes the following sparse warnings:

 kernel/trace/trace.c:385:9: warning: symbol 'trace_seq_to_buffer' was
 not declared. Should it be static?

 kernel/trace/trace_clock.c:29:13: warning: symbol 'trace_clock_local'
 was not declared. Should it be static?

 kernel/trace/trace_clock.c:54:13: warning: symbol 'trace_clock' was not
 declared. Should it be static?

 kernel/trace/trace_clock.c:74:13: warning: symbol 'trace_clock_global'
 was not declared. Should it be static?

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
LKML-Reference: <1237741871-5827-4-git-send-email-dmitri.vorobiev@movial.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c       | 2 +-
 kernel/trace/trace_clock.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e6fac0f..ace685c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -382,7 +382,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	return cnt;
 }
 
-ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
+static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 {
 	int len;
 	void *ret;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 05b176a..b588fd8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -18,6 +18,7 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/ktime.h>
+#include <linux/trace_clock.h>
 
 /*
  * trace_clock_local(): the simplest and least coherent tracing clock.
-- 
cgit v1.1


From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:30:49 -0500
Subject: tracing: add ring_buffer_event_discard() to ring buffer

This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard
events from the ring buffer, for the event-filtering mechanism
introduced in a subsequent patch.

I did the initial version but thanks to Steven Rostedt for adding
the parts that actually made it work. ;-)

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 117 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 97 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 384ca5d..a09027e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -189,16 +189,65 @@ enum {
 	RB_LEN_TIME_STAMP = 16,
 };
 
-/* inline for ring buffer fast paths */
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+}
+
+static inline int rb_discarded_event(struct ring_buffer_event *event)
+{
+	return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+}
+
+static void rb_event_set_padding(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	event->time_delta = 0;
+}
+
+/**
+ * ring_buffer_event_discard - discard an event in the ring buffer
+ * @buffer: the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
 static unsigned
-rb_event_length(struct ring_buffer_event *event)
+rb_event_data_length(struct ring_buffer_event *event)
 {
 	unsigned length;
 
+	if (event->len)
+		length = event->len * RB_ALIGNMENT;
+	else
+		length = event->array[0];
+	return length + RB_EVNT_HDR_SIZE;
+}
+
+/* inline for ring buffer fast paths */
+static unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		/* undefined */
-		return -1;
+		if (rb_null_event(event))
+			/* undefined */
+			return -1;
+		return rb_event_data_length(event);
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		return RB_LEN_TIME_EXTEND;
@@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event)
 		return RB_LEN_TIME_STAMP;
 
 	case RINGBUF_TYPE_DATA:
-		if (event->len)
-			length = event->len * RB_ALIGNMENT;
-		else
-			length = event->array[0];
-		return length + RB_EVNT_HDR_SIZE;
+		return rb_event_data_length(event);
 	default:
 		BUG();
 	}
@@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_resize);
 
-static inline int rb_null_event(struct ring_buffer_event *event)
-{
-	return event->type == RINGBUF_TYPE_PADDING;
-}
-
 static inline void *
 __rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
 {
@@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		if (tail < BUF_PAGE_SIZE) {
 			/* Mark the rest of the page with padding */
 			event = __rb_page_index(tail_page, tail);
-			event->type = RINGBUF_TYPE_PADDING;
+			rb_event_set_padding(event);
 		}
 
 		if (tail <= BUF_PAGE_SIZE)
@@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	event = rb_reader_event(cpu_buffer);
 
-	if (event->type == RINGBUF_TYPE_DATA)
+	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
 		cpu_buffer->entries--;
 
 	rb_update_read_stamp(cpu_buffer, event);
@@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		RB_WARN_ON(cpu_buffer, 1);
+		if (rb_null_event(event))
+			RB_WARN_ON(cpu_buffer, 1);
+		/*
+		 * Because the writer could be discarding every
+		 * event it creates (which would probably be bad)
+		 * if we were to go back to "again" then we may never
+		 * catch up, and will trigger the warn on, or lock
+		 * the box. Return the padding, and we will release
+		 * the current locks, and try again.
+		 */
 		rb_advance_reader(cpu_buffer);
-		return NULL;
+		return event;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		/* Internal data, OK to advance */
@@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		rb_inc_iter(iter);
-		goto again;
+		if (rb_null_event(event)) {
+			rb_inc_iter(iter);
+			goto again;
+		}
+		rb_advance_iter(iter);
+		return event;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		/* Internal data, OK to advance */
@@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
+ again:
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 
@@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_event *event;
 	unsigned long flags;
 
+ again:
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_iter_peek(iter, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 
@@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
 
+ again:
 	/* might be called in atomic */
 	preempt_disable();
 
@@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
 	preempt_enable();
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 	unsigned long flags;
 
+ again:
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_iter_peek(iter, ts);
 	if (!event)
@@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read);
-- 
cgit v1.1


From 7ce7e4249921d5073e764f7ff7ad83cfa9894bd7 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:31:04 -0500
Subject: tracing: add per-event filtering

This patch adds per-event filtering to the event tracing subsystem.

It adds a 'filter' debugfs file to each event directory.  This file can
be written to to set filters; reading from it will display the current
set of filters set for that event.

Basically, any field listed in the 'format' file for an event can be
filtered on (including strings, but not yet other array types) using
either matching ('==') or non-matching ('!=') 'predicates'.  A
'predicate' can be either a single expression:

 # echo pid != 0 > filter

 # cat filter
 pid != 0

or a compound expression of up to 8 sub-expressions combined using '&&'
or '||':

 # echo comm == Xorg > filter
 # echo "&& sig != 29" > filter

 # cat filter
 comm == Xorg
 && sig != 29

Only events having field values matching an expression will be available
in the trace output; non-matching events are discarded.

Note that a compound expression is built up by echoing each
sub-expression separately - it's not the most efficient way to do
things, but it keeps the parser simple and assumes that compound
expressions will be relatively uncommon.  In any case, a subsequent
patch introducing a way to set filters for entire subsystems should
mitigate any need to do this for lots of events.

Setting a filter without an '&&' or '||' clears the previous filter
completely and sets the filter to the new expression:

 # cat filter
 comm == Xorg
 && sig != 29

 # echo comm != Xorg

 # cat filter
 comm != Xorg

To clear a filter, echo 0 to the filter file:

 # echo 0 > filter
 # cat filter
 none

The limit of 8 predicates for a compound expression is arbitrary - for
efficiency, it's implemented as an array of pointers to predicates, and
8 seemed more than enough for any filter...

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237710665.7703.48.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile               |   1 +
 kernel/trace/trace.h                |  28 ++++
 kernel/trace/trace_events.c         |  77 +++++++++
 kernel/trace/trace_events_filter.c  | 326 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_stage_3.h |   4 +
 5 files changed, 436 insertions(+)
 create mode 100644 kernel/trace/trace_events_filter.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0e45c20..2630f51 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,5 +45,6 @@ obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9288dc7..d9eb39e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -795,6 +795,7 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
+	struct filter_pred	**preds;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
@@ -803,8 +804,35 @@ struct ftrace_event_call {
 #endif
 };
 
+#define MAX_FILTER_PRED 8
+
+struct filter_pred;
+
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+
+struct filter_pred {
+	filter_pred_fn_t fn;
+	u64 val;
+	char *str_val;
+	int str_len;
+	char *field_name;
+	int offset;
+	int not;
+	int or;
+	int compound;
+	int clear;
+};
+
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
+extern void filter_free_pred(struct filter_pred *pred);
+extern int filter_print_preds(struct filter_pred **preds, char *buf);
+extern int filter_parse(char **pbuf, struct filter_pred *pred);
+extern int filter_add_pred(struct ftrace_event_call *call,
+			   struct filter_pred *pred);
+extern void filter_free_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 961b057..97470c4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -459,6 +459,71 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 	return r;
 }
 
+static ssize_t
+event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	r = filter_print_preds(call->preds, s->buffer);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[64], *pbuf = buf;
+	struct filter_pred *pred;
+	int err;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return -ENOMEM;
+
+	err = filter_parse(&pbuf, pred);
+	if (err < 0) {
+		filter_free_pred(pred);
+		return err;
+	}
+
+	if (pred->clear) {
+		filter_free_preds(call);
+		return cnt;
+	}
+
+	if (filter_add_pred(call, pred)) {
+		filter_free_pred(pred);
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -504,6 +569,12 @@ static const struct file_operations ftrace_event_id_fops = {
 	.read = event_id_read,
 };
 
+static const struct file_operations ftrace_event_filter_fops = {
+	.open = tracing_open_generic,
+	.read = event_filter_read,
+	.write = event_filter_write,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -619,6 +690,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		}
 	}
 
+	entry = debugfs_create_file("filter", 0444, call->dir, call,
+				    &ftrace_event_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", call->name);
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 0000000..8e8c5fa
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,326 @@
+/*
+ * trace_events_filter - generic event filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+
+#include "trace.h"
+
+static int filter_pred_64(struct filter_pred *pred, void *event)
+{
+	u64 *addr = (u64 *)(event + pred->offset);
+	u64 val = (u64)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_32(struct filter_pred *pred, void *event)
+{
+	u32 *addr = (u32 *)(event + pred->offset);
+	u32 val = (u32)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_16(struct filter_pred *pred, void *event)
+{
+	u16 *addr = (u16 *)(event + pred->offset);
+	u16 val = (u16)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_8(struct filter_pred *pred, void *event)
+{
+	u8 *addr = (u8 *)(event + pred->offset);
+	u8 val = (u8)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_string(struct filter_pred *pred, void *event)
+{
+	char *addr = (char *)(event + pred->offset);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+	match = (!cmp) ^ pred->not;
+
+	return match;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
+{
+	int i, matched, and_failed = 0;
+	struct filter_pred *pred;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (call->preds[i]) {
+			pred = call->preds[i];
+			if (and_failed && !pred->or)
+				continue;
+			matched = pred->fn(pred, rec);
+			if (!matched && !pred->or) {
+				and_failed = 1;
+				continue;
+			} else if (matched && pred->or)
+				return 1;
+		} else
+			break;
+	}
+
+	if (and_failed)
+		return 0;
+
+	return 1;
+}
+
+int filter_print_preds(struct filter_pred **preds, char *buf)
+{
+	ssize_t this_len = 0;
+	char *field_name;
+	struct filter_pred *pred;
+	int i;
+
+	if (!preds) {
+		this_len += sprintf(buf + this_len, "none\n");
+		return this_len;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (preds[i]) {
+			pred = preds[i];
+			field_name = pred->field_name;
+			if (i)
+				this_len += sprintf(buf + this_len,
+					    pred->or ? "|| " : "&& ");
+			this_len += sprintf(buf + this_len,
+					    "%s ", field_name);
+			this_len += sprintf(buf + this_len,
+					    pred->not ? "!= " : "== ");
+			if (pred->str_val)
+				this_len += sprintf(buf + this_len,
+						    "%s\n", pred->str_val);
+			else
+				this_len += sprintf(buf + this_len,
+						    "%llu\n", pred->val);
+		} else
+			break;
+	}
+
+	return this_len;
+}
+
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+	struct ftrace_event_field *field;
+	struct list_head *entry, *tmp;
+
+	list_for_each_safe(entry, tmp, &call->fields) {
+		field = list_entry(entry, struct ftrace_event_field, link);
+		if (!strcmp(field->name, name))
+			return field;
+	}
+
+	return NULL;
+}
+
+void filter_free_pred(struct filter_pred *pred)
+{
+	if (!pred)
+		return;
+
+	kfree(pred->field_name);
+	kfree(pred->str_val);
+	kfree(pred);
+}
+
+void filter_free_preds(struct ftrace_event_call *call)
+{
+	int i;
+
+	if (call->preds) {
+		for (i = 0; i < MAX_FILTER_PRED; i++)
+			filter_free_pred(call->preds[i]);
+		kfree(call->preds);
+		call->preds = NULL;
+	}
+}
+
+static int __filter_add_pred(struct ftrace_event_call *call,
+			     struct filter_pred *pred)
+{
+	int i;
+
+	if (call->preds && !pred->compound)
+		filter_free_preds(call);
+
+	if (!call->preds) {
+		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+				      GFP_KERNEL);
+		if (!call->preds)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (!call->preds[i]) {
+			call->preds[i] = pred;
+			return 0;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int is_string_field(const char *type)
+{
+	if (strchr(type, '[') && strstr(type, "char"))
+		return 1;
+
+	return 0;
+}
+
+int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+{
+	struct ftrace_event_field *field;
+
+	field = find_event_field(call, pred->field_name);
+	if (!field)
+		return -EINVAL;
+
+	pred->offset = field->offset;
+
+	if (is_string_field(field->type)) {
+		pred->fn = filter_pred_string;
+		pred->str_len = field->size;
+		return __filter_add_pred(call, pred);
+	}
+
+	switch (field->size) {
+	case 8:
+		pred->fn = filter_pred_64;
+		break;
+	case 4:
+		pred->fn = filter_pred_32;
+		break;
+	case 2:
+		pred->fn = filter_pred_16;
+		break;
+	case 1:
+		pred->fn = filter_pred_8;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return __filter_add_pred(call, pred);
+}
+
+int filter_parse(char **pbuf, struct filter_pred *pred)
+{
+	char *tmp, *tok, *val_str = NULL;
+	int tok_n = 0;
+
+	/* field ==/!= number, or/and field ==/!= number, number */
+	while ((tok = strsep(pbuf, " \n"))) {
+		if (tok_n == 0) {
+			if (!strcmp(tok, "0")) {
+				pred->clear = 1;
+				return 0;
+			} else if (!strcmp(tok, "&&")) {
+				pred->or = 0;
+				pred->compound = 1;
+			} else if (!strcmp(tok, "||")) {
+				pred->or = 1;
+				pred->compound = 1;
+			} else
+				pred->field_name = tok;
+			tok_n = 1;
+			continue;
+		}
+		if (tok_n == 1) {
+			if (!pred->field_name)
+				pred->field_name = tok;
+			else if (!strcmp(tok, "!="))
+				pred->not = 1;
+			else if (!strcmp(tok, "=="))
+				pred->not = 0;
+			else {
+				pred->field_name = NULL;
+				return -EINVAL;
+			}
+			tok_n = 2;
+			continue;
+		}
+		if (tok_n == 2) {
+			if (pred->compound) {
+				if (!strcmp(tok, "!="))
+					pred->not = 1;
+				else if (!strcmp(tok, "=="))
+					pred->not = 0;
+				else {
+					pred->field_name = NULL;
+					return -EINVAL;
+				}
+			} else {
+				val_str = tok;
+				break; /* done */
+			}
+			tok_n = 3;
+			continue;
+		}
+		if (tok_n == 3) {
+			val_str = tok;
+			break; /* done */
+		}
+	}
+
+	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+	if (!pred->field_name)
+		return -ENOMEM;
+
+	pred->val = simple_strtoull(val_str, &tmp, 10);
+	if (tmp == val_str) {
+		pred->str_val = kstrdup(val_str, GFP_KERNEL);
+		if (!pred->str_val)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 468938f..ebf215e 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -204,6 +204,7 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
+	struct ftrace_event_call *call = &event_##call;			\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
@@ -222,6 +223,9 @@ static void ftrace_raw_event_##call(proto)				\
 	assign;								\
 									\
 	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+									\
+	if (call->preds && !filter_match_preds(call, entry))		\
+		ring_buffer_event_discard(event);			\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
-- 
cgit v1.1


From cfb180f3e71b2a280a254c8646a9ab1beab63f84 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:31:17 -0500
Subject: tracing: add per-subsystem filtering

This patch adds per-subsystem filtering to the event tracing subsystem.

It adds a 'filter' debugfs file to each subsystem directory.  This file
can be written to to set filters; reading from it will display the
current set of filters set for that subsystem.

Basically what it does is propagate the filter down to each event
contained in the subsystem.  If a particular event doesn't have a field
with the name specified in the filter, it simply doesn't get set for
that event.  You can verify whether or not the filter was set for a
particular event by looking at the filter file for that event.

As with per-event filters, compound expressions are supported, echoing
'0' to the subsystem's filter file clears all filters in the subsystem,
etc.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237710677.7703.49.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               | 15 +++++++
 kernel/trace/trace_events.c        | 86 +++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++++++++++++++
 3 files changed, 175 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d9eb39e..f267723 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -804,6 +804,18 @@ struct ftrace_event_call {
 #endif
 };
 
+struct event_subsystem {
+	struct list_head	list;
+	const char		*name;
+	struct dentry		*entry;
+	struct filter_pred	**preds;
+};
+
+#define events_for_each(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 #define MAX_FILTER_PRED 8
 
 struct filter_pred;
@@ -832,6 +844,9 @@ extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_free_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern void filter_free_subsystem_preds(struct event_subsystem *system);
+extern int filter_add_subsystem_pred(struct event_subsystem *system,
+				     struct filter_pred *pred);
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 97470c4..97d4daa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -524,6 +524,71 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+		      loff_t *ppos)
+{
+	struct event_subsystem *system = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	r = filter_print_preds(system->preds, s->buffer);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		       loff_t *ppos)
+{
+	struct event_subsystem *system = filp->private_data;
+	char buf[64], *pbuf = buf;
+	struct filter_pred *pred;
+	int err;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return -ENOMEM;
+
+	err = filter_parse(&pbuf, pred);
+	if (err < 0) {
+		filter_free_pred(pred);
+		return err;
+	}
+
+	if (pred->clear) {
+		filter_free_subsystem_preds(system);
+		return cnt;
+	}
+
+	if (filter_add_subsystem_pred(system, pred)) {
+		filter_free_pred(pred);
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -575,6 +640,12 @@ static const struct file_operations ftrace_event_filter_fops = {
 	.write = event_filter_write,
 };
 
+static const struct file_operations ftrace_subsystem_filter_fops = {
+	.open = tracing_open_generic,
+	.read = subsystem_filter_read,
+	.write = subsystem_filter_write,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -595,18 +666,13 @@ static struct dentry *event_trace_events_dir(void)
 	return d_events;
 }
 
-struct event_subsystem {
-	struct list_head	list;
-	const char		*name;
-	struct dentry		*entry;
-};
-
 static LIST_HEAD(event_subsystems);
 
 static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
+	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -633,6 +699,14 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 	system->name = name;
 	list_add(&system->list, &event_subsystems);
 
+	system->preds = NULL;
+
+	entry = debugfs_create_file("filter", 0444, system->entry, system,
+				    &ftrace_subsystem_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", name);
+
 	return system->entry;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8e8c5fa..1ab20ce 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -181,6 +181,27 @@ void filter_free_preds(struct ftrace_event_call *call)
 	}
 }
 
+void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	struct ftrace_event_call *call = __start_ftrace_events;
+	int i;
+
+	if (system->preds) {
+		for (i = 0; i < MAX_FILTER_PRED; i++)
+			filter_free_pred(system->preds[i]);
+		kfree(system->preds);
+		system->preds = NULL;
+	}
+
+	events_for_each(call) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!strcmp(call->system, system->name))
+			filter_free_preds(call);
+	}
+}
+
 static int __filter_add_pred(struct ftrace_event_call *call,
 			     struct filter_pred *pred)
 {
@@ -250,6 +271,65 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 	return __filter_add_pred(call, pred);
 }
 
+static struct filter_pred *copy_pred(struct filter_pred *pred)
+{
+	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
+	if (!new_pred)
+		return NULL;
+
+	memcpy(new_pred, pred, sizeof(*pred));
+	if (pred->str_val) {
+		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
+		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+		if (!new_pred->str_val) {
+			kfree(new_pred);
+			return NULL;
+		}
+	}
+
+	return new_pred;
+}
+
+int filter_add_subsystem_pred(struct event_subsystem *system,
+			      struct filter_pred *pred)
+{
+	struct ftrace_event_call *call = __start_ftrace_events;
+	struct filter_pred *event_pred;
+	int i;
+
+	if (system->preds && !pred->compound)
+		filter_free_subsystem_preds(system);
+
+	if (!system->preds) {
+		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+					GFP_KERNEL);
+		if (!system->preds)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (!system->preds[i]) {
+			system->preds[i] = pred;
+			break;
+		}
+		if (i == MAX_FILTER_PRED - 1)
+			return -EINVAL;
+	}
+
+	events_for_each(call) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!strcmp(call->system, system->name)) {
+			event_pred = copy_pred(pred);
+			if (event_pred)
+				filter_add_pred(call, event_pred);
+		}
+	}
+
+	return 0;
+}
+
 int filter_parse(char **pbuf, struct filter_pred *pred)
 {
 	char *tmp, *tok, *val_str = NULL;
-- 
cgit v1.1


From fe9f57f250ab4d781b99504caeb218ca2db14c1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Mar 2009 18:41:59 +0100
Subject: tracing: add run-time field descriptions for event filtering, kfree
 fix

Impact: fix potential kfree of random data in (rare) failure path

Zero-initialize the field structure.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1237710639.7703.46.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 97d4daa..594d78a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,26 +24,31 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 {
 	struct ftrace_event_field *field;
 
-	field = kmalloc(sizeof(*field), GFP_KERNEL);
+	field = kzalloc(sizeof(*field), GFP_KERNEL);
 	if (!field)
 		goto err;
+
 	field->name = kstrdup(name, GFP_KERNEL);
 	if (!field->name)
 		goto err;
+
 	field->type = kstrdup(type, GFP_KERNEL);
 	if (!field->type)
 		goto err;
+
 	field->offset = offset;
 	field->size = size;
 	list_add(&field->link, &call->fields);
 
 	return 0;
+
 err:
 	if (field) {
 		kfree(field->name);
 		kfree(field->type);
 	}
 	kfree(field);
+
 	return -ENOMEM;
 }
 
-- 
cgit v1.1


From 9bd7d099ab3f10dd666da399c064999bae427cd9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:43 +0100
Subject: tracing/events: make the filter files writable

We need the filter files to be writable, the current
filter file permissions are only set readable.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1237759847-21025-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 594d78a..19f61dd 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -706,7 +706,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
-	entry = debugfs_create_file("filter", 0444, system->entry, system,
+	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
@@ -769,7 +769,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		}
 	}
 
-	entry = debugfs_create_file("filter", 0444, call->dir, call,
+	entry = debugfs_create_file("filter", 0644, call->dir, call,
 				    &ftrace_event_filter_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
-- 
cgit v1.1


From 07edf7121374609709ef1b0889f6e7b8d6a62ec1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:46 +0100
Subject: tracing/events: don't use wake up for events

Impact: fix hard-lockup with sched switch events

Some ftrace events, such as sched wakeup, can be traced
while the runqueue lock is hold. Since they are using
trace_current_buffer_unlock_commit(), they call wake_up()
which can try to grab the runqueue lock too, resulting in
a deadlock.

Now for all event, we call a new helper:
trace_nowake_buffer_unlock_commit() which do pretty the same than
trace_current_buffer_unlock_commit() except than it doesn't call
trace_wake_up().

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237759847-21025-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                | 26 +++++++++++++++++++++-----
 kernel/trace/trace.h                |  2 ++
 kernel/trace/trace_events_stage_3.h |  2 +-
 3 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e6fac0f..6bad128 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -860,15 +860,25 @@ static void ftrace_trace_stack(struct trace_array *tr,
 static void ftrace_trace_userstack(struct trace_array *tr,
 				   unsigned long flags, int pc);
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc)
+static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
+					struct ring_buffer_event *event,
+					unsigned long flags, int pc,
+					int wake)
 {
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
-	trace_wake_up();
+
+	if (wake)
+		trace_wake_up();
+}
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+					struct ring_buffer_event *event,
+					unsigned long flags, int pc)
+{
+	__trace_buffer_unlock_commit(tr, event, flags, pc, 1);
 }
 
 struct ring_buffer_event *
@@ -882,7 +892,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return trace_buffer_unlock_commit(&global_trace, event, flags, pc);
+	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+}
+
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc)
+{
+	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
 
 void
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f267723..54fd9bc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -483,6 +483,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 				  unsigned long flags, int pc);
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index ebf215e..9a3bd49 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,7 +222,7 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
 									\
 	if (call->preds && !filter_match_preds(call, entry))		\
 		ring_buffer_event_discard(event);			\
-- 
cgit v1.1


From 7e6ea92df3fd7cbe74e7985c6f3e40255c44b201 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:47 +0100
Subject: tracing/ftrace: make nop-tracer use polling wait for events on pipe

Impact: display events when they arrive

Now that the events don't use wake_up() anymore, we need the nop
tracer to poll waiting for events on the pipe. Especially because
nop is useful to look at orphan traces types (traces types that
don't rely on specific tracers) because it doesn't produce traces
itself.

And unlike other tracers that trigger specific traces periodically,
nop triggers no traces by itself that can wake him.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237759847-21025-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_nop.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 9aa84bd..394f944 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly =
 	.name		= "nop",
 	.init		= nop_trace_init,
 	.reset		= nop_trace_reset,
+	.wait_pipe	= poll_wait_pipe,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_nop,
 #endif
-- 
cgit v1.1


From b118415bfad6d75792a85ac999e25149db8e6919 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Mar 2009 00:18:39 +0100
Subject: tracing/events: don't discard an event after commit

When we want to filter an event, the filter test is done after
the event is commited to the ring-buffer to be discarded later if
needed.

But a reader could be reading this event while we are trying to discard
it. Other kind of racy events can even happen because the event is
commited and can be read and/or consumed.

What we want is to discard the event before committing it.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1237763919-21505-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_stage_3.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 9a3bd49..9d2fa78 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,10 +222,11 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
-									\
 	if (call->preds && !filter_match_preds(call, entry))		\
 		ring_buffer_event_discard(event);			\
+									\
+	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
+									\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
-- 
cgit v1.1


From 75c8b417526529d0a7072e4d93ec99dbd483a6f4 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 23 Mar 2009 03:26:28 -0500
Subject: tracing/filters: use list_for_each_entry_safe

Impact: cleanup

Use list_for_each_entry_safe instead of list_for_each_entry in
find_event_field().

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237796788.7527.35.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1ab20ce..c4a413b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -147,11 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf)
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
-	struct ftrace_event_field *field;
-	struct list_head *entry, *tmp;
+	struct ftrace_event_field *field, *next;
 
-	list_for_each_safe(entry, tmp, &call->fields) {
-		field = list_entry(entry, struct ftrace_event_field, link);
+	list_for_each_entry_safe(field, next, &call->fields, link) {
 		if (!strcmp(field->name, name))
 			return field;
 	}
-- 
cgit v1.1


From ee6cdabc820a29bd607f38d9cb335c3ceddc673b Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 23 Mar 2009 03:26:42 -0500
Subject: tracing/filters: fix bug in copy_pred()

Impact: fix potential crash on subsystem filter expression freeing

When making a copy of the predicate, pred->field_name needs to be
duplicated in the copy as well, otherwise bad things can happen due to
later multiple frees of the same string.

This affects only per-subsystem event filtering.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237796802.7527.39.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c4a413b..fd01d80 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -276,11 +276,19 @@ static struct filter_pred *copy_pred(struct filter_pred *pred)
 		return NULL;
 
 	memcpy(new_pred, pred, sizeof(*pred));
+
+	if (pred->field_name) {
+		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+		if (!new_pred->field_name) {
+			kfree(new_pred);
+			return NULL;
+		}
+	}
+
 	if (pred->str_val) {
 		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 		if (!new_pred->str_val) {
-			kfree(new_pred);
+			filter_free_pred(new_pred);
 			return NULL;
 		}
 	}
-- 
cgit v1.1


From c4cff064be678f1e8344d907499f2a81282edc19 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 23 Mar 2009 03:26:48 -0500
Subject: tracing/filters: clean up filter_add_subsystem_pred()

Impact: cleanup, memory leak fix

This patch cleans up filter_add_subsystem_pred():

- searches for the field before creating a copy of the pred

- fixes memory leak in the case a predicate isn't applied

- if -ENOMEM, makes sure there's no longer a reference to the
  pred so the caller can free the half-finished filter

- changes the confusing i == MAX_FILTER_PRED - 1 comparison
  previously remarked upon

This affects only per-subsystem event filtering.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237796808.7527.40.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c        |  1 +
 kernel/trace/trace_events_filter.c | 31 ++++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 19f61dd..fdab30d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -585,6 +585,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	}
 
 	if (filter_add_subsystem_pred(system, pred)) {
+		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
 		return -EINVAL;
 	}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index fd01d80..4117c2e 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -318,22 +318,39 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 			system->preds[i] = pred;
 			break;
 		}
-		if (i == MAX_FILTER_PRED - 1)
-			return -EINVAL;
 	}
 
+	if (i == MAX_FILTER_PRED)
+		return -EINVAL;
+
 	events_for_each(call) {
+		int err;
+
 		if (!call->name || !call->regfunc)
 			continue;
 
-		if (!strcmp(call->system, system->name)) {
-			event_pred = copy_pred(pred);
-			if (event_pred)
-				filter_add_pred(call, event_pred);
-		}
+		if (strcmp(call->system, system->name))
+			continue;
+
+		if (!find_event_field(call, pred->field_name))
+			continue;
+
+		event_pred = copy_pred(pred);
+		if (!event_pred)
+			goto oom;
+
+		err = filter_add_pred(call, event_pred);
+		if (err)
+			filter_free_pred(event_pred);
+		if (err == -ENOMEM)
+			goto oom;
 	}
 
 	return 0;
+
+oom:
+	system->preds[i] = NULL;
+	return -ENOMEM;
 }
 
 int filter_parse(char **pbuf, struct filter_pred *pred)
-- 
cgit v1.1


From 3e1f60b80cafcb5d7e8d3665b35962fbb8fb9efa Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:45 +0100
Subject: tracing/ftrace: check if debugfs is registered before creating files

Impact: fix a crash with ftrace={nop,boot} parameter

If the nop or initcall tracers are launched as boot tracers,
they will attempt to create their option directory and files.
But these tracers are registered very early and then assigned
as "boot tracers" very early if asked to.

Since they do this before debugfs has been registered (core initcall),
a crash is triggered.

Another early tracers could also come later. So we fix it by
checking if debugfs is initialized before creating the root
tracing directory.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237759847-21025-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ace685c..f0e1337 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3513,6 +3513,9 @@ struct dentry *tracing_init_dentry(void)
 	if (d_tracer)
 		return d_tracer;
 
+	if (!debugfs_initialized())
+		return NULL;
+
 	d_tracer = debugfs_create_dir("tracing", NULL);
 
 	if (!d_tracer && !once) {
-- 
cgit v1.1


From 45b9560895b07a4a09d55d49235c984db512c5aa Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 24 Mar 2009 01:07:24 +0300
Subject: tracing: Fix TRACING_SUPPORT dependency for PPC32

commit 40ada30f9621fbd831ac2437b9a2a399aa ("tracing: clean up menu"),
despite the "clean up" in its purpose, introduced a behavioural
change for Kconfig symbols: we no longer able to select tracing
support on PPC32 (because IRQFLAGS_SUPPORT isn't yet implemented).

The IRQFLAGS_SUPPORT is not mandatory for most tracers, tracing core
has a special case for platforms w/o irqflags (which, by the way, has
become useless as of the commit above).

Though according to Ingo Molnar, there was periodic build failures on
weird, unmaintained architectures that had no irqflags-tracing support
and hence didn't know the raw_irqs_save/restore primitives. Thus we'd
better not enable irqflags-less tracing for all architectures.

This patch restores the old behaviour for PPC32, and thus brings the
tracing back. Other architectures can either add themselves to the
exception list or (better) implement TRACE_IRQFLAGS_SUPPORT.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Acked-b: Steven Rostedt <rostedt@goodmis.org>
Cc: linuxppc-dev@ozlabs.org
LKML-Reference: <20090323220724.GA9851@oksana.dev.rtsoft.ru>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b0a46f8..8a4d729 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -63,7 +63,11 @@ config TRACING
 #
 config TRACING_SUPPORT
 	bool
-	depends on TRACE_IRQFLAGS_SUPPORT
+	# PPC32 has no irqflags tracing support, but it can use most of the
+	# tracers anyway, they were tested to build and work. Note that new
+	# exceptions to this list aren't welcomed, better implement the
+	# irqflags tracing for your architecture.
+	depends on TRACE_IRQFLAGS_SUPPORT || PPC32
 	depends on STACKTRACE_SUPPORT
 	default y
 
-- 
cgit v1.1


From 1618536961d31f9b3f55767b22d4a897f4204c26 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Mar 2009 22:17:01 +0100
Subject: tracing/function-graph-tracer: fix functions call traces imbalance

Impact: fix traces output

Sometimes one can observe an imbalance in the traces between function
calls and function return traces:

func1() {
    }
}

The curly brace inside func1() is the return of another function nested
inside func1. The return trace have been inserted in the buffer but not
the entry.
We are storing a return address on the function traces stack while we
haven't inserted its entry on the buffer, hence the imbalance on the
traces.

This is because the tracers doesn't check all failures that can happen
on buffer insertion.

This patch reports the tracing recursion failures and the ring buffer
failures. In such cases, we now restore the original return address for
the function, giving up its return trace.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237843021-11695-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6bad128..89f0c25 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -924,7 +924,7 @@ trace_function(struct trace_array *tr,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void __trace_graph_entry(struct trace_array *tr,
+static int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned long flags,
 				int pc)
@@ -933,15 +933,17 @@ static void __trace_graph_entry(struct trace_array *tr,
 	struct ftrace_graph_ent_entry *entry;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
-		return;
+		return 0;
 
 	event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-		return;
+		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event);
+
+	return 1;
 }
 
 static void __trace_graph_return(struct trace_array *tr,
@@ -1162,6 +1164,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
+	int ret;
 	int cpu;
 	int pc;
 
@@ -1177,15 +1180,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_graph_entry(tr, trace, flags, pc);
+		ret = __trace_graph_entry(tr, trace, flags, pc);
+	} else {
+		ret = 0;
 	}
 	/* Only do the atomic if it is not already set */
 	if (!test_tsk_trace_graph(current))
 		set_tsk_trace_graph(current);
+
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 
-	return 1;
+	return ret;
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
-- 
cgit v1.1


From 1fc2d5c11918082536acf261ce6abb1f5511053f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:01 -0500
Subject: tracing/filters: use list_for_each_entry

Impact: cleanup

No need to use the safe version here, so use list_for_each_entry instead
of list_for_each_entry_safe in find_event_field().

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878841.8339.57.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4117c2e..3f0b79f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -147,9 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf)
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
-	struct ftrace_event_field *field, *next;
+	struct ftrace_event_field *field;
 
-	list_for_each_entry_safe(field, next, &call->fields, link) {
+	list_for_each_entry(field, &call->fields, link) {
 		if (!strcmp(field->name, name))
 			return field;
 	}
-- 
cgit v1.1


From 09f1f245c79585383de63e3ca54d0f91824bff3a Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:11 -0500
Subject: tracing/filters: free pred when clearing filters

Impact: fix (small) per trace filter modification memory leak

Free the current pred when clearing the filters via the filter files.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878851.8339.58.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fdab30d..a938138 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -516,6 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (pred->clear) {
 		filter_free_preds(call);
+		filter_free_pred(pred);
 		return cnt;
 	}
 
@@ -581,6 +582,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (pred->clear) {
 		filter_free_subsystem_preds(system);
+		filter_free_pred(pred);
 		return cnt;
 	}
 
-- 
cgit v1.1


From 4bda2d517bfa3ce3d7044e06988cdddae7adffe2 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:31 -0500
Subject: tracing/filters: use trace_seq_printf() to print filters

Impact: cleanup

Instead of just using the trace_seq buffer to print the filters, use
trace_seq_printf() as it was intended to be used.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878871.8339.59.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  3 ++-
 kernel/trace/trace_events.c        |  8 ++++----
 kernel/trace/trace_events_filter.c | 25 +++++++++----------------
 3 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 54fd9bc..90a848d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -840,7 +840,8 @@ struct filter_pred {
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
 extern void filter_free_pred(struct filter_pred *pred);
-extern int filter_print_preds(struct filter_pred **preds, char *buf);
+extern void filter_print_preds(struct filter_pred **preds,
+			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a938138..d132997 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -481,8 +481,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	r = filter_print_preds(call->preds, s->buffer);
-	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+	filter_print_preds(call->preds, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
 
@@ -547,8 +547,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	r = filter_print_preds(system->preds, s->buffer);
-	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+	filter_print_preds(system->preds, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 3f0b79f..9fca8bb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -24,6 +24,7 @@
 #include <linux/ctype.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 static int filter_pred_64(struct filter_pred *pred, void *event)
 {
@@ -108,16 +109,15 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 	return 1;
 }
 
-int filter_print_preds(struct filter_pred **preds, char *buf)
+void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
 {
-	ssize_t this_len = 0;
 	char *field_name;
 	struct filter_pred *pred;
 	int i;
 
 	if (!preds) {
-		this_len += sprintf(buf + this_len, "none\n");
-		return this_len;
+		trace_seq_printf(s, "none\n");
+		return;
 	}
 
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
@@ -125,23 +125,16 @@ int filter_print_preds(struct filter_pred **preds, char *buf)
 			pred = preds[i];
 			field_name = pred->field_name;
 			if (i)
-				this_len += sprintf(buf + this_len,
-					    pred->or ? "|| " : "&& ");
-			this_len += sprintf(buf + this_len,
-					    "%s ", field_name);
-			this_len += sprintf(buf + this_len,
-					    pred->not ? "!= " : "== ");
+				trace_seq_printf(s, pred->or ? "|| " : "&& ");
+			trace_seq_printf(s, "%s ", field_name);
+			trace_seq_printf(s, pred->not ? "!= " : "== ");
 			if (pred->str_val)
-				this_len += sprintf(buf + this_len,
-						    "%s\n", pred->str_val);
+				trace_seq_printf(s, "%s\n", pred->str_val);
 			else
-				this_len += sprintf(buf + this_len,
-						    "%llu\n", pred->val);
+				trace_seq_printf(s, "%llu\n", pred->val);
 		} else
 			break;
 	}
-
-	return this_len;
 }
 
 static struct ftrace_event_field *
-- 
cgit v1.1


From 9f58a159d022c8f2533a27708aa267adf4f0e3ce Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:42 -0500
Subject: tracing/filters: disallow integer values for string filters and vice
 versa

Impact: fix filter use boundary condition / crash

Make sure filters for string fields don't use integer values and vice
versa.  Getting it wrong can crash the system or produce bogus
results.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878882.8339.61.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9fca8bb..026be41 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -237,9 +237,14 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
+		if (!pred->str_val)
+			return -EINVAL;
 		pred->fn = filter_pred_string;
 		pred->str_len = field->size;
 		return __filter_add_pred(call, pred);
+	} else {
+		if (pred->str_val)
+			return -EINVAL;
 	}
 
 	switch (field->size) {
-- 
cgit v1.1


From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 23 Mar 2009 18:28:15 +0100
Subject: genirq: add threaded interrupt handler support

Add support for threaded interrupt handlers:

A device driver can request that its main interrupt handler runs in a
thread. To achive this the device driver requests the interrupt with
request_threaded_irq() and provides additionally to the handler a
thread function. The handler function is called in hard interrupt
context and needs to check whether the interrupt originated from the
device. If the interrupt originated from the device then the handler
can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is
returned when no further action is required. IRQ_WAKE_THREAD causes
the genirq code to invoke the threaded (main) handler. When
IRQ_WAKE_THREAD is returned handler must have disabled the interrupt
on the device level. This is mandatory for shared interrupt handlers,
but we need to do it as well for obscure x86 hardware where disabling
an interrupt on the IO_APIC level redirects the interrupt to the
legacy PIC interrupt lines.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c       |   2 +
 kernel/irq/handle.c |  31 ++++++++-
 kernel/irq/manage.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 211 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3..ca0b348 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1037,6 +1037,8 @@ NORET_TYPE void do_exit(long code)
 		schedule();
 	}
 
+	exit_irq_thread();
+
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9ebf779..fe8f453 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -357,8 +357,37 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 	do {
 		ret = action->handler(irq, action->dev_id);
-		if (ret == IRQ_HANDLED)
+
+		switch (ret) {
+		case IRQ_WAKE_THREAD:
+			/*
+			 * Wake up the handler thread for this
+			 * action. In case the thread crashed and was
+			 * killed we just pretend that we handled the
+			 * interrupt. The hardirq handler above has
+			 * disabled the device interrupt, so no irq
+			 * storm is lurking.
+			 */
+			if (likely(!test_bit(IRQTF_DIED,
+					     &action->thread_flags))) {
+				set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+				wake_up_process(action->thread);
+			}
+
+			/*
+			 * Set it to handled so the spurious check
+			 * does not trigger.
+			 */
+			ret = IRQ_HANDLED;
+			/* Fall through to add to randomness */
+		case IRQ_HANDLED:
 			status |= action->flags;
+			break;
+
+		default:
+			break;
+		}
+
 		retval |= ret;
 		action = action->next;
 	} while (action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6458e99..a4c1ab8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,16 +8,15 @@
  */
 
 #include <linux/irq.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include "internals.h"
 
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-cpumask_var_t irq_default_affinity;
-
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
 
 		/* Oops, that failed? */
 	} while (status & IRQ_INPROGRESS);
+
+	/*
+	 * We made sure that no hardirq handler is running. Now verify
+	 * that no threaded handlers are active.
+	 */
+	wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
 }
 EXPORT_SYMBOL(synchronize_irq);
 
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
 /**
  *	irq_can_set_affinity - Check if the affinity of a given irq can be set
  *	@irq:		Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
 	return 1;
 }
 
+static void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+{
+	struct irqaction *action = desc->action;
+
+	while (action) {
+		if (action->thread)
+			set_cpus_allowed_ptr(action->thread, cpumask);
+		action = action->next;
+	}
+}
+
 /**
  *	irq_set_affinity - Set the irq affinity of a given irq
  *	@irq:		Interrupt to set affinity
@@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	cpumask_copy(desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
+	irq_set_thread_affinity(desc, cpumask);
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
@@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq)
 
 	spin_lock_irqsave(&desc->lock, flags);
 	ret = setup_affinity(irq, desc);
+	if (!ret)
+		irq_set_thread_affinity(desc, desc->affinity);
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return ret;
@@ -384,6 +407,93 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	return ret;
 }
 
+static inline int irq_thread_should_run(struct irqaction *action)
+{
+	return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+}
+
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (irq_thread_should_run(action)) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		} else
+			schedule();
+	}
+	return -1;
+}
+
+/*
+ * Interrupt handler thread
+ */
+static int irq_thread(void *data)
+{
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+	struct irqaction *action = data;
+	struct irq_desc *desc = irq_to_desc(action->irq);
+	int wake;
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->irqaction = action;
+
+	while (!irq_wait_for_interrupt(action)) {
+
+		atomic_inc(&desc->threads_active);
+
+		spin_lock_irq(&desc->lock);
+		if (unlikely(desc->status & IRQ_DISABLED)) {
+			/*
+			 * CHECKME: We might need a dedicated
+			 * IRQ_THREAD_PENDING flag here, which
+			 * retriggers the thread in check_irq_resend()
+			 * but AFAICT IRQ_PENDING should be fine as it
+			 * retriggers the interrupt itself --- tglx
+			 */
+			desc->status |= IRQ_PENDING;
+			spin_unlock_irq(&desc->lock);
+		} else {
+			spin_unlock_irq(&desc->lock);
+
+			action->thread_fn(action->irq, action->dev_id);
+		}
+
+		wake = atomic_dec_and_test(&desc->threads_active);
+
+		if (wake && waitqueue_active(&desc->wait_for_threads))
+			wake_up(&desc->wait_for_threads);
+	}
+
+	/*
+	 * Clear irqaction. Otherwise exit_irq_thread() would make
+	 * fuzz about an active irq thread going into nirvana.
+	 */
+	current->irqaction = NULL;
+	return 0;
+}
+
+/*
+ * Called from do_exit()
+ */
+void exit_irq_thread(void)
+{
+	struct task_struct *tsk = current;
+
+	if (!tsk->irqaction)
+		return;
+
+	printk(KERN_ERR
+	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+	       tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+
+	/*
+	 * Set the THREAD DIED flag to prevent further wakeups of the
+	 * soon to be gone threaded handler.
+	 */
+	set_bit(IRQTF_DIED, &tsk->irqaction->flags);
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -420,6 +530,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	}
 
 	/*
+	 * Threaded handler ?
+	 */
+	if (new->thread_fn) {
+		struct task_struct *t;
+
+		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+				   new->name);
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		/*
+		 * We keep the reference to the task struct even if
+		 * the thread dies to avoid that the interrupt code
+		 * references an already freed task_struct.
+		 */
+		get_task_struct(t);
+		new->thread = t;
+		wake_up_process(t);
+	}
+
+	/*
 	 * The following block of code has to be executed atomically
 	 */
 	spin_lock_irqsave(&desc->lock, flags);
@@ -456,15 +586,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (!shared) {
 		irq_chip_set_defaults(desc->chip);
 
+		init_waitqueue_head(&desc->wait_for_threads);
+
 		/* Setup the type (level, edge polarity) if configured: */
 		if (new->flags & IRQF_TRIGGER_MASK) {
 			ret = __irq_set_trigger(desc, irq,
 					new->flags & IRQF_TRIGGER_MASK);
 
-			if (ret) {
-				spin_unlock_irqrestore(&desc->lock, flags);
-				return ret;
-			}
+			if (ret)
+				goto out_thread;
 		} else
 			compat_irq_chip_set_default_handler(desc);
 #if defined(CONFIG_IRQ_PER_CPU)
@@ -532,8 +662,19 @@ mismatch:
 		dump_stack();
 	}
 #endif
+	ret = -EBUSY;
+
+out_thread:
 	spin_unlock_irqrestore(&desc->lock, flags);
-	return -EBUSY;
+	if (new->thread) {
+		struct task_struct *t = new->thread;
+
+		new->thread = NULL;
+		if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
+			kthread_stop(t);
+		put_task_struct(t);
+	}
+	return ret;
 }
 
 /**
@@ -559,6 +700,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
+	struct task_struct *irqthread;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -605,6 +747,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		else
 			desc->chip->disable(irq);
 	}
+
+	irqthread = action->thread;
+	action->thread = NULL;
+
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	unregister_handler_proc(irq, action);
@@ -612,6 +758,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	/* Make sure it's not being used on another CPU: */
 	synchronize_irq(irq);
 
+	if (irqthread) {
+		if (!test_bit(IRQTF_DIED, &action->thread_flags))
+			kthread_stop(irqthread);
+		put_task_struct(irqthread);
+	}
+
 #ifdef CONFIG_DEBUG_SHIRQ
 	/*
 	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -664,9 +816,12 @@ void free_irq(unsigned int irq, void *dev_id)
 EXPORT_SYMBOL(free_irq);
 
 /**
- *	request_irq - allocate an interrupt line
+ *	request_threaded_irq - allocate an interrupt line
  *	@irq: Interrupt line to allocate
- *	@handler: Function to be called when the IRQ occurs
+ *	@handler: Function to be called when the IRQ occurs.
+ *		  Primary handler for threaded interrupts
+ *      @thread_fn: Function called from the irq handler thread
+ *                  If NULL, no irq thread is created
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
@@ -678,6 +833,15 @@ EXPORT_SYMBOL(free_irq);
  *	raises, you must take care both to initialise your hardware
  *	and to set up the interrupt handler in the right order.
  *
+ *	If you want to set up a threaded irq handler for your device
+ *	then you need to supply @handler and @thread_fn. @handler ist
+ *	still called in hard interrupt context and has to check
+ *	whether the interrupt originates from the device. If yes it
+ *	needs to disable the interrupt on the device and return
+ *	IRQ_THREAD_WAKE which will wake up the handler thread and run
+ *	@thread_fn. This split handler design is necessary to support
+ *	shared interrupts.
+ *
  *	Dev_id must be globally unique. Normally the address of the
  *	device data structure is used as the cookie. Since the handler
  *	receives this value it makes sense to use it.
@@ -693,8 +857,9 @@ EXPORT_SYMBOL(free_irq);
  *	IRQF_TRIGGER_*		Specify active edge(s) or level
  *
  */
-int request_irq(unsigned int irq, irq_handler_t handler,
-		unsigned long irqflags, const char *devname, void *dev_id)
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+			 irq_handler_t thread_fn, unsigned long irqflags,
+			 const char *devname, void *dev_id)
 {
 	struct irqaction *action;
 	struct irq_desc *desc;
@@ -742,6 +907,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		return -ENOMEM;
 
 	action->handler = handler;
+	action->thread_fn = thread_fn;
 	action->flags = irqflags;
 	action->name = devname;
 	action->dev_id = dev_id;
@@ -771,4 +937,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 #endif
 	return retval;
 }
-EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(request_threaded_irq);
-- 
cgit v1.1


From 935bd5b971f0df7c06d214d022cf8392e2f37952 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 23 Mar 2009 18:28:16 +0100
Subject: genirq: add support for threaded interrupts to devres

Some devices use devres_request_irq() for to install their interrupt
handler. Add support for threaded interrupts to devres as well.

[tglx - simplified and adapted to latest threadirq version]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/devres.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8..d06df9c 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 }
 
 /**
- *	devm_request_irq - allocate an interrupt line for a managed device
+ *	devm_request_threaded_irq - allocate an interrupt line for a managed device
  *	@dev: device to request interrupt for
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs
+ *	@thread_fn: function to be called in a threaded interrupt context. NULL
+ *		    for devices which handle everything in @handler
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
  *	If an IRQ allocated with this function needs to be freed
  *	separately, dev_free_irq() must be used.
  */
-int devm_request_irq(struct device *dev, unsigned int irq,
-		     irq_handler_t handler, unsigned long irqflags,
-		     const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+			      irq_handler_t handler, irq_handler_t thread_fn,
+			      unsigned long irqflags, const char *devname,
+			      void *dev_id)
 {
 	struct irq_devres *dr;
 	int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 	if (!dr)
 		return -ENOMEM;
 
-	rc = request_irq(irq, handler, irqflags, devname, dev_id);
+	rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+				  dev_id);
 	if (rc) {
 		devres_free(dr);
 		return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 
 	return 0;
 }
-EXPORT_SYMBOL(devm_request_irq);
+EXPORT_SYMBOL(devm_request_threaded_irq);
 
 /**
  *	devm_free_irq - free an interrupt
-- 
cgit v1.1


From f48fe81e5b032914183e9a17052313720c2cac56 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 24 Mar 2009 11:46:22 +0100
Subject: genirq: threaded irq handlers review fixups

Delta patch to address the review comments.

      - Implement warning when IRQ_WAKE_THREAD is requested and no
        thread handler installed
      - coding style fixes

Pointed-out-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/handle.c | 29 ++++++++++++++++++++++++-----
 kernel/irq/manage.c | 17 +++++++----------
 2 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index fe8f453..38b49a9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -338,6 +338,15 @@ irqreturn_t no_action(int cpl, void *dev_id)
 	return IRQ_NONE;
 }
 
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+	if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+		return;
+
+	printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
+	       "but no thread function available.", irq, action->name);
+}
+
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
@@ -361,6 +370,21 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 		switch (ret) {
 		case IRQ_WAKE_THREAD:
 			/*
+			 * Set result to handled so the spurious check
+			 * does not trigger.
+			 */
+			ret = IRQ_HANDLED;
+
+			/*
+			 * Catch drivers which return WAKE_THREAD but
+			 * did not set up a thread function
+			 */
+			if (unlikely(!action->thread_fn)) {
+				warn_no_thread(irq, action);
+				break;
+			}
+
+			/*
 			 * Wake up the handler thread for this
 			 * action. In case the thread crashed and was
 			 * killed we just pretend that we handled the
@@ -374,11 +398,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 				wake_up_process(action->thread);
 			}
 
-			/*
-			 * Set it to handled so the spurious check
-			 * does not trigger.
-			 */
-			ret = IRQ_HANDLED;
 			/* Fall through to add to randomness */
 		case IRQ_HANDLED:
 			status |= action->flags;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4c1ab8..a3eb7ba 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -407,20 +407,17 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	return ret;
 }
 
-static inline int irq_thread_should_run(struct irqaction *action)
-{
-	return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags);
-}
-
 static int irq_wait_for_interrupt(struct irqaction *action)
 {
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (irq_thread_should_run(action)) {
+
+		if (test_and_clear_bit(IRQTF_RUNTHREAD,
+				       &action->thread_flags)) {
 			__set_current_state(TASK_RUNNING);
 			return 0;
-		} else
-			schedule();
+		}
+		schedule();
 	}
 	return -1;
 }
@@ -820,8 +817,8 @@ EXPORT_SYMBOL(free_irq);
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs.
  *		  Primary handler for threaded interrupts
- *      @thread_fn: Function called from the irq handler thread
- *                  If NULL, no irq thread is created
+ *	@thread_fn: Function called from the irq handler thread
+ *		    If NULL, no irq thread is created
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
-- 
cgit v1.1


From e4955c9986a27bb47ddeb6cd55803053f547e2e9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:04:37 +0800
Subject: blktrace: mark ddir_act[] const

Impact: cleanup

ddir_act and what2act always stay immutable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C89415.5080503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 108f4f7..1ffcbd4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,8 +147,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 /*
  * Data direction bit lookup
  */
-static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ),
-					 BLK_TC_ACT(BLK_TC_WRITE) };
+static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
+				 BLK_TC_ACT(BLK_TC_WRITE) };
 
 /* The ilog2() calls fall out because they're constant */
 #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
@@ -1116,10 +1116,10 @@ static void blk_tracer_reset(struct trace_array *tr)
 	blk_tracer_stop(tr);
 }
 
-static struct {
+static const struct {
 	const char *act[2];
 	int	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
-} what2act[] __read_mostly = {
+} what2act[] = {
 	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
 	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
-- 
cgit v1.1


From 65796348e09880e12b97267d39b8857c758440a6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:05:06 +0800
Subject: blktrace: fix wrong calculation of RWBS

Impact: fix the output of IO type category characters

Trace categories are the upper 16 bits, not the lower 16 bits.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C89432.8010805@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1ffcbd4..9af4143 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -922,23 +922,24 @@ static void blk_unregister_tracepoints(void)
 static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 {
 	int i = 0;
+	int tc = t->action >> BLK_TC_SHIFT;
 
-	if (t->action & BLK_TC_DISCARD)
+	if (tc & BLK_TC_DISCARD)
 		rwbs[i++] = 'D';
-	else if (t->action & BLK_TC_WRITE)
+	else if (tc & BLK_TC_WRITE)
 		rwbs[i++] = 'W';
 	else if (t->bytes)
 		rwbs[i++] = 'R';
 	else
 		rwbs[i++] = 'N';
 
-	if (t->action & BLK_TC_AHEAD)
+	if (tc & BLK_TC_AHEAD)
 		rwbs[i++] = 'A';
-	if (t->action & BLK_TC_BARRIER)
+	if (tc & BLK_TC_BARRIER)
 		rwbs[i++] = 'B';
-	if (t->action & BLK_TC_SYNC)
+	if (tc & BLK_TC_SYNC)
 		rwbs[i++] = 'S';
-	if (t->action & BLK_TC_META)
+	if (tc & BLK_TC_META)
 		rwbs[i++] = 'M';
 
 	rwbs[i] = '\0';
-- 
cgit v1.1


From e0dc81bec0927fa0c8aabc521793161909eef7a5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:05:51 +0800
Subject: blktrace: fix t_error()

Impact: fix error flag output

t_error() should return t->error but not t->sector.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C8945F.5020802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9af4143..f69f8bd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -968,7 +968,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent)
 
 static inline __u16 t_error(const struct trace_entry *ent)
 {
-	return te_blk_io_trace(ent)->sector;
+	return te_blk_io_trace(ent)->error;
 }
 
 static __u64 get_pdu_int(const struct trace_entry *ent)
-- 
cgit v1.1


From 093419971e03362a00f499960557c119982ea09f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 17:43:30 +0800
Subject: blktrace: print human-readable act_mask

Impact: new feature, allow symbolic values in /debug/tracing/act_mask

Print stringified act_mask instead of hex value:

 # cat act_mask
 read,write,barrier,sync,queue,requeue,issue,complete,fs,pc,ahead,meta,
 discard,drv_data
 # echo "meta,write" > act_mask
 # cat act_mask
 write,meta

Also:
 - make act_mask accept "ahead", "meta", "discard" and "drv_data"
 - use strsep() instead of strchr() to parse user input
 - return -EINVAL if a token is not found in the mask map
 - fix a bug that 'value' is unsigned, so it can < 0
 - propagate error value of blk_trace_mask2str() to userspace, but not
   always return -ENXIO.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C8AB42.1000802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 103 ++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f69f8bd..6fb274f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1316,53 +1316,77 @@ struct attribute_group blk_trace_attr_group = {
 	.attrs = blk_trace_attrs,
 };
 
-static int blk_str2act_mask(const char *str)
+static const struct {
+	int mask;
+	const char *str;
+} mask_maps[] = {
+	{ BLK_TC_READ,		"read"		},
+	{ BLK_TC_WRITE,		"write"		},
+	{ BLK_TC_BARRIER,	"barrier"	},
+	{ BLK_TC_SYNC,		"sync"		},
+	{ BLK_TC_QUEUE,		"queue"		},
+	{ BLK_TC_REQUEUE,	"requeue"	},
+	{ BLK_TC_ISSUE,		"issue"		},
+	{ BLK_TC_COMPLETE,	"complete"	},
+	{ BLK_TC_FS,		"fs"		},
+	{ BLK_TC_PC,		"pc"		},
+	{ BLK_TC_AHEAD,		"ahead"		},
+	{ BLK_TC_META,		"meta"		},
+	{ BLK_TC_DISCARD,	"discard"	},
+	{ BLK_TC_DRV_DATA,	"drv_data"	},
+};
+
+static int blk_trace_str2mask(const char *str)
 {
+	int i;
 	int mask = 0;
-	char *copy = kstrdup(str, GFP_KERNEL), *s;
+	char *s, *token;
 
-	if (copy == NULL)
+	s = kstrdup(str, GFP_KERNEL);
+	if (s == NULL)
 		return -ENOMEM;
-
-	s = strstrip(copy);
+	s = strstrip(s);
 
 	while (1) {
-		char *sep = strchr(s, ',');
-
-		if (sep != NULL)
-			*sep = '\0';
-
-		if (strcasecmp(s, "barrier") == 0)
-			mask |= BLK_TC_BARRIER;
-		else if (strcasecmp(s, "complete") == 0)
-			mask |= BLK_TC_COMPLETE;
-		else if (strcasecmp(s, "fs") == 0)
-			mask |= BLK_TC_FS;
-		else if (strcasecmp(s, "issue") == 0)
-			mask |= BLK_TC_ISSUE;
-		else if (strcasecmp(s, "pc") == 0)
-			mask |= BLK_TC_PC;
-		else if (strcasecmp(s, "queue") == 0)
-			mask |= BLK_TC_QUEUE;
-		else if (strcasecmp(s, "read") == 0)
-			mask |= BLK_TC_READ;
-		else if (strcasecmp(s, "requeue") == 0)
-			mask |= BLK_TC_REQUEUE;
-		else if (strcasecmp(s, "sync") == 0)
-			mask |= BLK_TC_SYNC;
-		else if (strcasecmp(s, "write") == 0)
-			mask |= BLK_TC_WRITE;
-
-		if (sep == NULL)
+		token = strsep(&s, ",");
+		if (token == NULL)
 			break;
 
-		s = sep + 1;
+		if (*token == '\0')
+			continue;
+
+		for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+			if (strcasecmp(token, mask_maps[i].str) == 0) {
+				mask |= mask_maps[i].mask;
+				break;
+			}
+		}
+		if (i == ARRAY_SIZE(mask_maps)) {
+			mask = -EINVAL;
+			break;
+		}
 	}
-	kfree(copy);
+	kfree(s);
 
 	return mask;
 }
 
+static ssize_t blk_trace_mask2str(char *buf, int mask)
+{
+	int i;
+	char *p = buf;
+
+	for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+		if (mask & mask_maps[i].mask) {
+			p += sprintf(p, "%s%s",
+				    (p == buf) ? "" : ",", mask_maps[i].str);
+		}
+	}
+	*p++ = '\n';
+
+	return p - buf;
+}
+
 static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
 {
 	if (bdev->bd_disk == NULL)
@@ -1399,7 +1423,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q->blk_trace == NULL)
 		ret = sprintf(buf, "disabled\n");
 	else if (attr == &dev_attr_act_mask)
-		ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
+		ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
 	else if (attr == &dev_attr_pid)
 		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
 	else if (attr == &dev_attr_start_lba)
@@ -1424,7 +1448,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	struct request_queue *q;
 	struct hd_struct *p;
 	u64 value;
-	ssize_t ret = -ENXIO;
+	ssize_t ret = -EINVAL;
 
 	if (count == 0)
 		goto out;
@@ -1432,13 +1456,16 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (attr == &dev_attr_act_mask) {
 		if (sscanf(buf, "%llx", &value) != 1) {
 			/* Assume it is a list of trace category names */
-			value = blk_str2act_mask(buf);
-			if (value < 0)
+			ret = blk_trace_str2mask(buf);
+			if (ret < 0)
 				goto out;
+			value = ret;
 		}
 	} else if (sscanf(buf, "%llu", &value) != 1)
 		goto out;
 
+	ret = -ENXIO;
+
 	lock_kernel();
 	p = dev_to_part(dev);
 	bdev = bdget(part_devt(p));
-- 
cgit v1.1


From 098335215a4921a8a54193829eaed602dca24df5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 21 Mar 2009 02:44:50 -0400
Subject: tracing: fix memory leak in trace_stat

If the function profiler does not have any items recorded and one were
to cat the function stat file, the kernel would take a BUG with a NULL
pointer dereference.

Looking further into this, I found that returning NULL from stat_start
did not stop the stat logic, and would later call stat_next. This breaks
from the way seq_file works, so I looked into fixing the stat code.

This is where I noticed that the last next_entry is never freed.
It is allocated, and if the stat_next returns NULL, the code breaks out
of the loop, unlocks the mutex and exits. We never link the next_entry
nor do we free it. Thus it is a real memory leak.

This patch rearranges the code a bit to not only fix the memory leak,
but also to act more like seq_file where nothing is printed if there
is nothing to print. That is, stat_start returns NULL.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stat.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 39310e3..f71b85b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -75,7 +75,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
-	void *prev_stat;
+	void *stat;
 	int ret = 0;
 	int i;
 
@@ -85,6 +85,10 @@ static int stat_seq_init(struct tracer_stat_session *session)
 	if (!ts->stat_cmp)
 		ts->stat_cmp = dummy_cmp;
 
+	stat = ts->stat_start();
+	if (!stat)
+		goto exit;
+
 	/*
 	 * The first entry. Actually this is the second, but the first
 	 * one (the stat_list head) is pointless.
@@ -99,14 +103,19 @@ static int stat_seq_init(struct tracer_stat_session *session)
 
 	list_add(&new_entry->list, &session->stat_list);
 
-	new_entry->stat = ts->stat_start();
-	prev_stat = new_entry->stat;
+	new_entry->stat = stat;
 
 	/*
 	 * Iterate over the tracer stat entries and store them in a sorted
 	 * list.
 	 */
 	for (i = 1; ; i++) {
+		stat = ts->stat_next(stat, i);
+
+		/* End of insertion */
+		if (!stat)
+			break;
+
 		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
 		if (!new_entry) {
 			ret = -ENOMEM;
@@ -114,11 +123,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 		}
 
 		INIT_LIST_HEAD(&new_entry->list);
-		new_entry->stat = ts->stat_next(prev_stat, i);
-
-		/* End of insertion */
-		if (!new_entry->stat)
-			break;
+		new_entry->stat = stat;
 
 		list_for_each_entry(iter_entry, &session->stat_list, list) {
 
@@ -137,8 +142,6 @@ static int stat_seq_init(struct tracer_stat_session *session)
 				break;
 			}
 		}
-
-		prev_stat = new_entry->stat;
 	}
 exit:
 	mutex_unlock(&session->stat_mutex);
-- 
cgit v1.1


From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 23:38:49 -0400
Subject: function-graph: moved the timestamp from arch to generic code

This patch move the timestamp from happening in the arch specific
code into the general code. This allows for better control by the tracer
to time manipulation.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e876816..d28687e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = {
 
 /* Add a function return address to the trace stack on thread info.*/
 int
-ftrace_push_return_trace(unsigned long ret, unsigned long long time,
-			 unsigned long func, int *depth)
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 {
+	unsigned long long calltime;
 	int index;
 
 	if (!current->ret_stack)
@@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time,
 		return -EBUSY;
 	}
 
+	calltime = trace_clock_local();
+
 	index = ++current->curr_ret_stack;
 	barrier();
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
-	current->ret_stack[index].calltime = time;
+	current->ret_stack[index].calltime = calltime;
 	*depth = index;
 
 	return 0;
-- 
cgit v1.1


From 05ce5818adee8f8efd0a5ca0d900a6789012516b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 00:18:31 -0400
Subject: function-graph: prevent more than one tracer registering

Impact: prevent crash due to multiple function graph tracers

The function graph tracer can currently only handle a single tracer
being registered. If another tracer registers with the function
graph tracer it can crash the system.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7847806..c81a759 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2643,6 +2643,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 
 	mutex_lock(&ftrace_lock);
 
+	/* we currently allow only one tracer registered at a time */
+	if (atomic_read(&ftrace_graph_active)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
 
-- 
cgit v1.1


From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 01:10:15 -0400
Subject: function-graph: ignore times across schedule

Impact: more accurate timings

The current method of function graph tracing does not take into
account the time spent when a task is not running. This shows functions
that call schedule have increased costs:

 3) + 18.664 us   |      }
 ------------------------------------------
 3)    <idle>-0    =>  kblockd-123
 ------------------------------------------

 3)               |      finish_task_switch() {
 3)   1.441 us    |        _spin_unlock_irq();
 3)   3.966 us    |      }
 3) ! 2959.433 us |    }
 3) ! 2961.465 us |  }

This patch uses the tracepoint in the scheduling context switch to
account for time that has elapsed while a task is scheduled out.
Now we see:

 ------------------------------------------
 3)    <idle>-0    =>  edac-po-1067
 ------------------------------------------

 3)               |      finish_task_switch() {
 3)   0.685 us    |        _spin_unlock_irq();
 3)   2.331 us    |      }
 3) + 41.439 us   |    }
 3) + 42.663 us   |  }

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c81a759..0b90364 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,6 +29,8 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
+#include <trace/sched.h>
+
 #include <asm/ftrace.h>
 
 #include "trace.h"
@@ -2590,6 +2592,31 @@ free:
 	return ret;
 }
 
+static void
+ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
+				struct task_struct *next)
+{
+	unsigned long long timestamp;
+	int index;
+
+	timestamp = trace_clock_local();
+
+	prev->ftrace_timestamp = timestamp;
+
+	/* only process tasks that we timestamped */
+	if (!next->ftrace_timestamp)
+		return;
+
+	/*
+	 * Update all the counters in next to make up for the
+	 * time next was sleeping.
+	 */
+	timestamp -= next->ftrace_timestamp;
+
+	for (index = next->curr_ret_stack; index >= 0; index--)
+		next->ret_stack[index].calltime += timestamp;
+}
+
 /* Allocate a return stack for each task */
 static int start_graph_tracing(void)
 {
@@ -2611,6 +2638,13 @@ static int start_graph_tracing(void)
 		ret = alloc_retstack_tasklist(ret_stack_list);
 	} while (ret == -EAGAIN);
 
+	if (!ret) {
+		ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+		if (ret)
+			pr_info("ftrace_graph: Couldn't activate tracepoint"
+				" probe to kernel_sched_switch\n");
+	}
+
 	kfree(ret_stack_list);
 	return ret;
 }
@@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void)
 	mutex_lock(&ftrace_lock);
 
 	atomic_dec(&ftrace_graph_active);
+	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
@@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		t->curr_ret_stack = -1;
 		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
+		t->ftrace_timestamp = 0;
 	} else
 		t->ret_stack = NULL;
 }
-- 
cgit v1.1


From be6f164a02f394675e2ac2077dd354cebef5b4c0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 11:06:24 -0400
Subject: function-graph: add option for include sleep times

Impact: give user a choice to show times spent while sleeping

The user may want to see the time a function spent sleeping.
This patch adds the trace option "sleep-time" to allow that.
The "sleep-time" option is default on.

 echo sleep-time > /debug/tracing/trace_options

produces:

 ------------------------------------------
 2)  avahi-d-3428  =>    <idle>-0
 ------------------------------------------

 2)               |      finish_task_switch() {
 2)   0.621 us    |        _spin_unlock_irq();
 2)   2.202 us    |      }
 2) ! 1002.197 us |    }
 2) ! 1003.521 us |  }

where as,

 echo nosleep-time > /debug/tracing/trace_options

produces:

 0)    <idle>-0    =>  yum-upd-3416
 ------------------------------------------

 0)               |              finish_task_switch() {
 0)   0.643 us    |                _spin_unlock_irq();
 0)   2.342 us    |              }
 0) + 41.302 us   |            }
 0) + 42.453 us   |          }

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 7 +++++++
 kernel/trace/trace.c  | 3 ++-
 kernel/trace/trace.h  | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0b90364..02d2de9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2599,6 +2599,13 @@ ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	unsigned long long timestamp;
 	int index;
 
+	/*
+	 * Does the user want to count the time a function was asleep.
+	 * If so, do not update the time stamps.
+	 */
+	if (trace_flags & TRACE_ITER_SLEEP_TIME)
+		return;
+
 	timestamp = trace_clock_local();
 
 	prev->ftrace_timestamp = timestamp;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f0e1337..67c6a21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -255,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -316,6 +316,7 @@ static const char *trace_options[] = {
 	"context-info",
 	"latency-format",
 	"global-clock",
+	"sleep-time",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7cfb741..d7410bb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -683,6 +683,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_CONTEXT_INFO		= 0x20000, /* Print pid/cpu/time */
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
+	TRACE_ITER_SLEEP_TIME		= 0x100000,
 };
 
 /*
-- 
cgit v1.1


From cc59c9e8d0165c632fd056c4a23e36f917507fb4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 11:03:01 +0800
Subject: ftrace: show virtual PID

Impact: fix PID output under namespaces

When current namespace is not the global namespace,
pid read from set_ftrace_pid is no correct.

 # ~/newpid_namespace_run bash
 # echo $$
 1
 # echo 1 > set_ftrace_pid
 # cat set_ftrace_pid
 3756

Since we write virtual PID to set_ftrace_pid, we need get
virtual PID when we read it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C84D65.9050606@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02d2de9..bb37711 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2264,7 +2264,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
 	if (ftrace_pid_trace == ftrace_swapper_pid)
 		r = sprintf(buf, "swapper tasks\n");
 	else if (ftrace_pid_trace)
-		r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
+		r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
 	else
 		r = sprintf(buf, "no pid\n");
 
-- 
cgit v1.1


From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 13:38:06 +0800
Subject: tracing: use union for multi-usages field

Impact: cleanup

struct dyn_ftrace::ip has different usages in his lifecycle,
we use union for it. And also for struct dyn_ftrace::flags.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C871BE.3080405@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb37711..7b8722b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec)
 
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
-	rec->ip = (unsigned long)ftrace_free_records;
+	rec->freelist = ftrace_free_records;
 	ftrace_free_records = rec;
 	rec->flags |= FTRACE_FL_FREE;
 }
@@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 			return NULL;
 		}
 
-		ftrace_free_records = (void *)rec->ip;
+		ftrace_free_records = rec->freelist;
 		memset(rec, 0, sizeof(*rec));
 		return rec;
 	}
@@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip)
 		return NULL;
 
 	rec->ip = ip;
-	rec->flags = (unsigned long)ftrace_new_addrs;
+	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
 	return rec;
@@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod)
 			return -1;
 
 		p = ftrace_new_addrs;
-		ftrace_new_addrs = (struct dyn_ftrace *)p->flags;
+		ftrace_new_addrs = p->newlist;
 		p->flags = 0L;
 
 		/* convert record (i.e, patch mcount-call with NOP) */
-- 
cgit v1.1


From e6f489013b985b58d096a3091ece0ed579367232 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 16:27:17 +0800
Subject: trace_stat: don't call seq_printf() in seq_operation->start()

Impact: Fix incorrect way using seq_file's API

Use SEQ_START_TOKEN instead of calling ->stat_headers()
int seq_operation->start().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
LKML-Reference: <49C9EAE5.5070202@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f71b85b..8c129dd 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -163,7 +163,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 
 	/* If we are in the beginning of the file, print the headers */
 	if (!*pos && session->ts->stat_headers)
-		session->ts->stat_headers(s);
+		return SEQ_START_TOKEN;
 
 	return seq_list_start(&session->stat_list, *pos);
 }
@@ -172,6 +172,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
 	struct tracer_stat_session *session = s->private;
 
+	if (p == SEQ_START_TOKEN)
+		return seq_list_start(&session->stat_list, *pos);
+
 	return seq_list_next(p, &session->stat_list, pos);
 }
 
@@ -186,6 +189,9 @@ static int stat_seq_show(struct seq_file *s, void *v)
 	struct tracer_stat_session *session = s->private;
 	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
+	if (v == SEQ_START_TOKEN)
+		return session->ts->stat_headers(s);
+
 	return session->ts->stat_show(s, l->stat);
 }
 
-- 
cgit v1.1


From 220ba351dfa57eca4bec5ce0098a276446a47958 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 16:58:39 +0800
Subject: trace_stat: keep original order

Impact: make trace_stat files show items with the original order

trace_stat tracer reverse the items, it makes the output
looks a little ugly.

Example, when we read trace_stat/workqueues, we get cpu#7's stat.
at first, and then cpu#6... cpu#0.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C9F23F.5040307@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 8c129dd..acdebd7 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -125,23 +125,21 @@ static int stat_seq_init(struct tracer_stat_session *session)
 		INIT_LIST_HEAD(&new_entry->list);
 		new_entry->stat = stat;
 
-		list_for_each_entry(iter_entry, &session->stat_list, list) {
+		list_for_each_entry_reverse(iter_entry, &session->stat_list,
+				list) {
 
 			/* Insertion with a descendent sorting */
-			if (ts->stat_cmp(new_entry->stat,
-						iter_entry->stat) > 0) {
+			if (ts->stat_cmp(iter_entry->stat,
+					new_entry->stat) >= 0) {
 
-				list_add_tail(&new_entry->list,
-						&iter_entry->list);
-				break;
-
-			/* The current smaller value */
-			} else if (list_is_last(&iter_entry->list,
-						&session->stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
 		}
+
+		/* The current larger value */
+		if (list_empty(&new_entry->list))
+			list_add(&new_entry->list, &session->stat_list);
 	}
 exit:
 	mutex_unlock(&session->stat_mutex);
-- 
cgit v1.1


From 2f63b840bc8a816ac879ee773b035cf3e433fae4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 16:59:18 +0800
Subject: trace_workqueues: fix empty line's output

Empty lines separate cpus stat. After previous
fix(trace_stat: keep original order) applied, the empty lines
are displayed at incorrect position.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C9F266.2060706@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 9ab035b..797201e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct pid *pid;
 	struct task_struct *tsk;
 
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
+		seq_printf(s, "\n");
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
 	pid = find_get_pid(cws->pid);
 	if (pid) {
 		tsk = get_pid_task(pid, PIDTYPE_PID);
@@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 		put_pid(pid);
 	}
 
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
 	return 0;
 }
 
 static int workqueue_stat_headers(struct seq_file *s)
 {
 	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
-	seq_printf(s, "# |      |         |          |\n\n");
+	seq_printf(s, "# |      |         |          |\n");
 	return 0;
 }
 
-- 
cgit v1.1


From 2a4efa42450762cbfa5c5712aa4cc9f06924c9fd Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 12:06:05 +0800
Subject: ftrace: Using FTRACE_WARN_ON() to check "freed record" in
 ftrace_release()

 "Because when we call ftrace_free_rec we change the rec->ip to point to the
  next record in the chain. Something is very wrong if rec->ip >= s &&
  rec->ip < e and the record is already free."

 "Note, use FTRACE_WARN_ON() macro. This way it shuts down ftrace if it is
  hit and helps to avoid further damage later."
                   -- Steven Rostedt <rostedt@goodmis.org>

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b8722b..1752a63 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -358,9 +358,14 @@ void ftrace_release(void *start, unsigned long size)
 
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE))
+		if ((rec->ip >= s) && (rec->ip < e)) {
+			/*
+			 * rec->ip is changed in ftrace_free_rec()
+			 * It should not between s and e if record was freed.
+			 */
+			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
 			ftrace_free_rec(rec);
+		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
-- 
cgit v1.1


From 9a8118baaeb0eaa148913bed77bf9c6335f6ca63 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 26 Mar 2009 01:24:34 -0500
Subject: tracing: filter fix for TRACE_EVENT_FORMAT events

Impact: fix crash (hang) when using TRACE_EVENT_FORMAT filter files

filters are only hooked up to the tracepoint events defined using
TRACE_EVENT but not the tracers that use TRACE_EVENT_FORMAT, such
as ftrace.

Do not display the filter files at all for TRACE_EVENT_FORMAT events
for the time being.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878882.8339.61.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d132997..64ec4d2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -680,7 +680,6 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
-	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -709,12 +708,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
-	entry = debugfs_create_file("filter", 0644, system->entry, system,
-				    &ftrace_subsystem_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/filter' entry\n", name);
-
 	return system->entry;
 }
 
@@ -770,14 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   " events/%s\n", call->name);
 			return ret;
 		}
+		entry = debugfs_create_file("filter", 0644, call->dir, call,
+					    &ftrace_event_filter_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs "
+				   "'%s/filter' entry\n", call->name);
 	}
 
-	entry = debugfs_create_file("filter", 0644, call->dir, call,
-				    &ftrace_event_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/filter' entry\n", call->name);
-
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
-- 
cgit v1.1


From d5ac537e5fb6fc12384c9f3ed6a15e912dfbbc2a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 28 Mar 2009 21:52:47 -0700
Subject: sched: fix errors in struct & function comments

Fix kernel-doc errors in sched.c:  the structs don't have
kernel-doc notation and the short function description needs to
be one line only.

  Error(kernel/sched.c:3197): cannot understand prototype: 'struct sd_lb_stats '
  Error(kernel/sched.c:3228): cannot understand prototype: 'struct sg_lb_stats '
  Error(kernel/sched.c:3375): duplicate section name 'Description'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f4c413b..5757e03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3190,7 +3190,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 /********** Helpers for find_busiest_group ************************/
-/**
+/*
  * sd_lb_stats - Structure to store the statistics of a sched_domain
  * 		during load balancing.
  */
@@ -3222,7 +3222,7 @@ struct sd_lb_stats {
 #endif
 };
 
-/**
+/*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
 struct sg_lb_stats {
@@ -3360,16 +3360,17 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
 }
 
 /**
- * check_power_save_busiest_group - Check if we have potential to perform
- *	some power-savings balance. If yes, set the busiest group to be
- *	the least loaded group in the sched_domain, so that it's CPUs can
- *	be put to idle.
- *
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
  * @sds: Variable containing the statistics of the sched_domain
  *	under consideration.
  * @this_cpu: Cpu at which we're currently performing load-balancing.
  * @imbalance: Variable to store the imbalance.
  *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that it's CPUs can be put to idle.
+ *
  * Returns 1 if there is potential to perform power-savings balance.
  * Else returns 0.
  */
-- 
cgit v1.1


From 1a2142afa5646ad5af44bbe1febaa5e0b7e71156 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:10 -0600
Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL

Impact: cleanup

(Thanks to Al Viro for reminding me of this, via Ingo)

CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so:

	#define CPU_MASK_ALL (cpumask_t) { { ... } }

Taking the address of such a temporary is questionable at best,
unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added
CPU_MASK_ALL_PTR:

	#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL)

Which formalizes this practice.  One day gcc could bite us over this
usage (though we seem to have gotten away with it so far).

So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR
with the modern "cpu_all_mask" (a real const struct cpumask *).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
---
 kernel/kmod.c    | 2 +-
 kernel/kthread.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f6..f0c8f54 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data)
 	}
 
 	/* We can run anywhere, unlike our parent keventd(). */
-	set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(current, cpu_all_mask);
 
 	/*
 	 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456..84bbadd 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
 		 */
 		sched_setscheduler(create->result, SCHED_NORMAL, &param);
 		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
+		set_cpus_allowed_ptr(create->result, cpu_all_mask);
 	}
 	complete(&create->done);
 }
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
-	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
-- 
cgit v1.1


From 2b17fa506c418e9fb02bbbc7f426d2bbb5b247a6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:12 -0600
Subject: cpumask: use set_cpu_active in init/main.c

cpu_active_map is deprecated in favor of cpu_active_mask, which is
const for safety: we use accessors now (set_cpu_active) is we really
want to make a change.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/cpu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f0..395b697 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	cpu_clear(cpu, cpu_active_map);
+	set_cpu_active(cpu, false);
 
 	/*
 	 * Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
 	err = _cpu_down(cpu, 0);
 
 	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
+		set_cpu_active(cpu, true);
 
 out:
 	cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	cpu_set(cpu, cpu_active_map);
+	set_cpu_active(cpu, true);
 
 	/* Now call notifier in preparation. */
 	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
-- 
cgit v1.1


From 9489424454c93f4d225d7af47978f8c7e84bf4d4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:12 -0600
Subject: cpumask: use mm_cpumask() wrapper: kernel/fork.c

Impact: futureproof

Makes code futureproof against the impending change to mm->cpu_vm_mask.

It's also a chance to use the new cpumask_ ops which take a pointer.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 6715ebc..47c1584 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -284,7 +284,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
-	cpus_clear(mm->cpu_vm_mask);
+	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
-- 
cgit v1.1


From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:15 -0600
Subject: cpumask: use new cpumask_ functions in core code.

Impact: cleanup

Time to clean up remaining laggards using the old cpu_ functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Trond.Myklebust@netapp.com
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1f0c509..9aedd9f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -416,7 +416,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_map_acquire(&wq->lockdep_map);
 	lock_map_release(&wq->lockdep_map);
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,7 +547,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -911,7 +911,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
  	cpu_maps_update_done();
 
-- 
cgit v1.1


From 73d0a4b107d58908305f272bfae9bd17f74a2c81 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:16 -0600
Subject: cpumask: convert rcutorture.c

We're getting rid of cpumasks on the stack.

Simply change tmp_mask to a global, and allocate it in
rcu_torture_init().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
---
 kernel/rcutorture.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a..9b4a975 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
 static long n_rcu_torture_timers = 0;
 static struct list_head rcu_torture_removed;
+static cpumask_var_t shuffle_tmp_mask;
 
 static int stutter_pause_test = 0;
 
@@ -889,10 +890,9 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
  */
 static void rcu_torture_shuffle_tasks(void)
 {
-	cpumask_t tmp_mask;
 	int i;
 
-	cpus_setall(tmp_mask);
+	cpumask_setall(shuffle_tmp_mask);
 	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
 	}
 
 	if (rcu_idle_cpu != -1)
-		cpu_clear(rcu_idle_cpu, tmp_mask);
+		cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
 
-	set_cpus_allowed_ptr(current, &tmp_mask);
+	set_cpus_allowed_ptr(current, shuffle_tmp_mask);
 
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			if (reader_tasks[i])
 				set_cpus_allowed_ptr(reader_tasks[i],
-						     &tmp_mask);
+						     shuffle_tmp_mask);
 	}
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++)
 			if (fakewriter_tasks[i])
 				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     &tmp_mask);
+						     shuffle_tmp_mask);
 	}
 
 	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, &tmp_mask);
+		set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
 
 	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, &tmp_mask);
+		set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
 
 	if (rcu_idle_cpu == -1)
 		rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
 	if (shuffler_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
 		kthread_stop(shuffler_task);
+		free_cpumask_var(shuffle_tmp_mask);
 	}
 	shuffler_task = NULL;
 
@@ -1190,10 +1191,18 @@ rcu_torture_init(void)
 	}
 	if (test_no_idle_hz) {
 		rcu_idle_cpu = num_online_cpus() - 1;
+
+		if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+			firsterr = -ENOMEM;
+			VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
+			goto unwind;
+		}
+
 		/* Create the shuffler thread */
 		shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
 					  "rcu_torture_shuffle");
 		if (IS_ERR(shuffler_task)) {
+			free_cpumask_var(shuffle_tmp_mask);
 			firsterr = PTR_ERR(shuffler_task);
 			VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
 			shuffler_task = NULL;
-- 
cgit v1.1


From 612a726faf8486fa48b34fa37115ce1e7421d383 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:16 -0600
Subject: cpumask: remove cpumask_t from core

Impact: cleanup

struct cpumask is nicer, and we use it to make where we've made code
safe for CONFIG_CPUMASK_OFFSTACK=y.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/sched_cpupri.h | 2 +-
 kernel/stop_machine.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94e..9a7e859 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
 
 #ifdef CONFIG_SMP
 int  cpupri_find(struct cpupri *cp,
-		 struct task_struct *p, cpumask_t *lowest_mask);
+		 struct task_struct *p, struct cpumask *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
 int cpupri_init(struct cpupri *cp, bool bootmem);
 void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 74541ca..912823e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
 static int refcount;
 static struct workqueue_struct *stop_machine_wq;
 static struct stop_machine_data active, idle;
-static const cpumask_t *active_cpus;
+static const struct cpumask *active_cpus;
 static void *stop_machine_work;
 
 static void set_state(enum stopmachine_state newstate)
-- 
cgit v1.1


From 2a93a1f21480f3aa134fe0c48954b64b8a97ef25 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-Koenig <ukleinek@strlen.de>
Date: Mon, 12 Jan 2009 23:35:50 +0100
Subject: trivial: fix typo "resgister" -> "register"

Signed-off-by: Uwe Kleine-Koenig <ukleinek@strlen.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913d..53e8c8b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 }
 
 /**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
  * @ops - ops structure that holds the function to unregister
  *
  * Unregister a function that was added to be called by ftrace profiling.
-- 
cgit v1.1


From 877d03105d04b2c13e241130277fa69c8d2564f0 Mon Sep 17 00:00:00 2001
From: Nick Andrew <nick@nick-andrew.net>
Date: Mon, 26 Jan 2009 11:06:57 +0100
Subject: trivial: Fix misspelling of firmware

Fix misspelling of firmware.

Signed-off-by: Nick Andrew <nick@nick-andrew.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/power/disk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4a4a206..9d1c1a0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -265,7 +265,7 @@ static int create_image(int platform_mode)
  *	hibernation_snapshot - quiesce devices and create the hibernation
  *	snapshot image.
  *	@platform_mode - if set, use the platform driver, if available, to
- *			 prepare the platform frimware for the power transition.
+ *			 prepare the platform firmware for the power transition.
  *
  *	Must be called with pm_mutex held
  */
@@ -378,7 +378,7 @@ static int resume_target_kernel(void)
  *	hibernation_restore - quiesce devices and restore the hibernation
  *	snapshot image.  If successful, control returns in hibernation_snaphot()
  *	@platform_mode - if set, use the platform driver, if available, to
- *			 prepare the platform frimware for the transition.
+ *			 prepare the platform firmware for the transition.
  *
  *	Must be called with pm_mutex held
  */
-- 
cgit v1.1


From 692105b8ac5bcd75dc65f6a8f10bdbd0f0f34dcf Mon Sep 17 00:00:00 2001
From: Matt LaPlante <kernel1@cyberdogtech.com>
Date: Mon, 26 Jan 2009 11:12:25 +0100
Subject: trivial: fix typos/grammar errors in Kconfig texts

Signed-off-by: Matt LaPlante <kernel1@cyberdogtech.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/trace/Kconfig | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 34e707e..504086a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER
 	help
 	  Enable the kernel to trace a function at both its return
 	  and its entry.
-	  It's first purpose is to trace the duration of functions and
-	  draw a call graph for each thread with some informations like
-	  the return value.
-	  This is done by setting the current return address on the current
-	  task structure into a stack of calls.
+	  Its first purpose is to trace the duration of functions and
+	  draw a call graph for each thread with some information like
+	  the return value. This is done by setting the current return 
+	  address on the current task structure into a stack of calls.
 
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
-- 
cgit v1.1


From 0a0c5168df270a50e3518e4f12bddb31f8f5f38f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:33:49 +0100
Subject: PM: Introduce functions for suspending and resuming device interrupts

Introduce helper functions allowing us to prevent device drivers from
getting any interrupts (without disabling interrupts on the CPU)
during suspend (or hibernation) and to make them start to receive
interrupts again during the subsequent resume.  These functions make it
possible to keep timer interrupts enabled while the "late" suspend and
"early" resume callbacks provided by device drivers are being
executed.  In turn, this allows device drivers' "late" suspend and
"early" resume callbacks to sleep, execute ACPI callbacks etc.

The functions introduced here will be used to rework the handling of
interrupts during suspend (hibernation) and resume.  Namely,
interrupts will only be disabled on the CPU right before suspending
sysdevs, while device drivers will be prevented from receiving
interrupts, with the help of the new helper function, before their
"late" suspend callbacks run (and analogously during resume).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/Makefile    |  1 +
 kernel/irq/internals.h |  2 ++
 kernel/irq/manage.c    | 31 +++++++++++++++-----
 kernel/irq/pm.c        | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 7 deletions(-)
 create mode 100644 kernel/irq/pm.c

(limited to 'kernel')

diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 4dd5b1e..3394f8f 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ee1aa9f..01ce20e 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,6 +12,8 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
 
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6458e99..1516ab7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -162,6 +162,20 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
 }
 #endif
 
+void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+{
+	if (suspend) {
+		if (!desc->action || (desc->action->flags & IRQF_TIMER))
+			return;
+		desc->status |= IRQ_SUSPENDED;
+	}
+
+	if (!desc->depth++) {
+		desc->status |= IRQ_DISABLED;
+		desc->chip->disable(irq);
+	}
+}
+
 /**
  *	disable_irq_nosync - disable an irq without waiting
  *	@irq: Interrupt to disable
@@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	if (!desc->depth++) {
-		desc->status |= IRQ_DISABLED;
-		desc->chip->disable(irq);
-	}
+	__disable_irq(desc, irq, false);
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 EXPORT_SYMBOL(disable_irq_nosync);
@@ -215,15 +226,21 @@ void disable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(disable_irq);
 
-static void __enable_irq(struct irq_desc *desc, unsigned int irq)
+void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 {
+	if (resume)
+		desc->status &= ~IRQ_SUSPENDED;
+
 	switch (desc->depth) {
 	case 0:
+ err_out:
 		WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
 		break;
 	case 1: {
 		unsigned int status = desc->status & ~IRQ_DISABLED;
 
+		if (desc->status & IRQ_SUSPENDED)
+			goto err_out;
 		/* Prevent probing on this irq: */
 		desc->status = status | IRQ_NOPROBE;
 		check_irq_resend(desc, irq);
@@ -253,7 +270,7 @@ void enable_irq(unsigned int irq)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	__enable_irq(desc, irq);
+	__enable_irq(desc, irq, false);
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 EXPORT_SYMBOL(enable_irq);
@@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	 */
 	if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
 		desc->status &= ~IRQ_SPURIOUS_DISABLED;
-		__enable_irq(desc, irq);
+		__enable_irq(desc, irq, false);
 	}
 
 	spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 0000000..638d8be
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,79 @@
+/*
+ * linux/kernel/irq/pm.c
+ *
+ * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file contains power management functions related to interrupts.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+/**
+ * suspend_device_irqs - disable all currently enabled interrupt lines
+ *
+ * During system-wide suspend or hibernation device interrupts need to be
+ * disabled at the chip level and this function is provided for this purpose.
+ * It disables all interrupt lines that are enabled at the moment and sets the
+ * IRQ_SUSPENDED flag for them.
+ */
+void suspend_device_irqs(void)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	for_each_irq_desc(irq, desc) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		__disable_irq(desc, irq, true);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+
+	for_each_irq_desc(irq, desc)
+		if (desc->status & IRQ_SUSPENDED)
+			synchronize_irq(irq);
+}
+EXPORT_SYMBOL_GPL(suspend_device_irqs);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all interrupt lines previously disabled by suspend_device_irqs() that
+ * have the IRQ_SUSPENDED flag set.
+ */
+void resume_device_irqs(void)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	for_each_irq_desc(irq, desc) {
+		unsigned long flags;
+
+		if (!(desc->status & IRQ_SUSPENDED))
+			continue;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		__enable_irq(desc, irq, true);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
+EXPORT_SYMBOL_GPL(resume_device_irqs);
+
+/**
+ * check_wakeup_irqs - check if any wake-up interrupts are pending
+ */
+int check_wakeup_irqs(void)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	for_each_irq_desc(irq, desc)
+		if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
+			return -EBUSY;
+
+	return 0;
+}
-- 
cgit v1.1


From 2ed8d2b3a81bdbb0418301628ccdb008ac9f40b7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:06 +0100
Subject: PM: Rework handling of interrupts during suspend-resume

Use the functions introduced in by the previous patch,
suspend_device_irqs(), resume_device_irqs() and check_wakeup_irqs(),
to rework the handling of interrupts during suspend (hibernation) and
resume.  Namely, interrupts will only be disabled on the CPU right
before suspending sysdevs, while device drivers will be prevented
from receiving interrupts, with the help of the new helper function,
before their "late" suspend callbacks run (and analogously during
resume).

In addition, since the device interrups are now disabled before the
CPU has turned all interrupts off and the CPU will ACK the interrupts
setting the IRQ_PENDING bit for them, check in sysdev_suspend() if
any wake-up interrupts are pending and abort suspend if that's the
case.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kexec.c      |  8 ++++----
 kernel/power/disk.c | 39 +++++++++++++++++++++++++++++----------
 kernel/power/main.c | 17 +++++++++++------
 3 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index c7fd669..dade9af 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1454,7 +1454,6 @@ int kernel_kexec(void)
 		if (error)
 			goto Resume_devices;
 		device_pm_lock();
-		local_irq_disable();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
 		 * device_power_down() now.  Otherwise, drivers for
@@ -1464,8 +1463,9 @@ int kernel_kexec(void)
 		 */
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
-			goto Enable_irqs;
+			goto Unlock_pm;
 
+		local_irq_disable();
 		/* Suspend system devices */
 		error = sysdev_suspend(PMSG_FREEZE);
 		if (error)
@@ -1484,9 +1484,9 @@ int kernel_kexec(void)
 	if (kexec_image->preserve_context) {
 		sysdev_resume();
  Power_up_devices:
-		device_power_up(PMSG_RESTORE);
- Enable_irqs:
 		local_irq_enable();
+		device_power_up(PMSG_RESTORE);
+ Unlock_pm:
 		device_pm_unlock();
 		enable_nonboot_cpus();
  Resume_devices:
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4a4a206..320bb09 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -214,7 +214,7 @@ static int create_image(int platform_mode)
 		return error;
 
 	device_pm_lock();
-	local_irq_disable();
+
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* call device_power_down() now.
 	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
@@ -225,8 +225,11 @@ static int create_image(int platform_mode)
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
-		goto Enable_irqs;
+		goto Unlock;
 	}
+
+	local_irq_disable();
+
 	sysdev_suspend(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -252,12 +255,16 @@ static int create_image(int platform_mode)
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
+
  Power_up_devices:
+	local_irq_enable();
+
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
- Enable_irqs:
-	local_irq_enable();
+
+ Unlock:
 	device_pm_unlock();
+
 	return error;
 }
 
@@ -336,13 +343,16 @@ static int resume_target_kernel(void)
 	int error;
 
 	device_pm_lock();
-	local_irq_disable();
+
 	error = device_power_down(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
-		goto Enable_irqs;
+		goto Unlock;
 	}
+
+	local_irq_disable();
+
 	sysdev_suspend(PMSG_QUIESCE);
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
@@ -366,11 +376,16 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
+
 	sysdev_resume();
-	device_power_up(PMSG_RECOVER);
- Enable_irqs:
+
 	local_irq_enable();
+
+	device_power_up(PMSG_RECOVER);
+
+ Unlock:
 	device_pm_unlock();
+
 	return error;
 }
 
@@ -447,15 +462,16 @@ int hibernation_platform_enter(void)
 		goto Finish;
 
 	device_pm_lock();
-	local_irq_disable();
+
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
+		local_irq_disable();
 		sysdev_suspend(PMSG_HIBERNATE);
 		hibernation_ops->enter();
 		/* We should never get here */
 		while (1);
 	}
-	local_irq_enable();
+
 	device_pm_unlock();
 
 	/*
@@ -464,12 +480,15 @@ int hibernation_platform_enter(void)
 	 */
  Finish:
 	hibernation_ops->finish();
+
  Resume_devices:
 	entering_platform_hibernation = false;
 	device_resume(PMSG_RESTORE);
 	resume_console();
+
  Close:
 	hibernation_ops->end();
+
 	return error;
 }
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index c9632f8..f0a4667 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -287,17 +287,19 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
  */
 static int suspend_enter(suspend_state_t state)
 {
-	int error = 0;
+	int error;
 
 	device_pm_lock();
-	arch_suspend_disable_irqs();
-	BUG_ON(!irqs_disabled());
 
-	if ((error = device_power_down(PMSG_SUSPEND))) {
+	error = device_power_down(PMSG_SUSPEND);
+	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
 		goto Done;
 	}
 
+	arch_suspend_disable_irqs();
+	BUG_ON(!irqs_disabled());
+
 	error = sysdev_suspend(PMSG_SUSPEND);
 	if (!error) {
 		if (!suspend_test(TEST_CORE))
@@ -305,11 +307,14 @@ static int suspend_enter(suspend_state_t state)
 		sysdev_resume();
 	}
 
-	device_power_up(PMSG_RESUME);
- Done:
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
+
+	device_power_up(PMSG_RESUME);
+
+ Done:
 	device_pm_unlock();
+
 	return error;
 }
 
-- 
cgit v1.1


From 900af0d973856d6feb6fc088c2d0d3fde57707d3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:15 +0100
Subject: PM: Change suspend code ordering

Change the ordering of the suspend core code so that the platform
"prepare" callback is executed and the nonboot CPUs are disabled
after calling device drivers' "late suspend" methods.

This change will allow us to rework the PCI PM core so that the power
state of devices is changed in the "late" phase of suspend (and
analogously in the "early" phase of resume), which in turn will allow
us to avoid the race condition where a device using shared interrupts
is put into a low power state with interrupts enabled and then an
interrupt (for another device) comes in and confuses its driver.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/main.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index f0a4667..f172f41 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -297,6 +297,19 @@ static int suspend_enter(suspend_state_t state)
 		goto Done;
 	}
 
+	if (suspend_ops->prepare) {
+		error = suspend_ops->prepare();
+		if (error)
+			goto Power_up_devices;
+	}
+
+	if (suspend_test(TEST_PLATFORM))
+		goto Platfrom_finish;
+
+	error = disable_nonboot_cpus();
+	if (error || suspend_test(TEST_CPUS))
+		goto Enable_cpus;
+
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
 
@@ -310,6 +323,14 @@ static int suspend_enter(suspend_state_t state)
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
 
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Platfrom_finish:
+	if (suspend_ops->finish)
+		suspend_ops->finish();
+
+ Power_up_devices:
 	device_power_up(PMSG_RESUME);
 
  Done:
@@ -346,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_test(TEST_DEVICES))
 		goto Recover_platform;
 
-	if (suspend_ops->prepare) {
-		error = suspend_ops->prepare();
-		if (error)
-			goto Resume_devices;
-	}
-
-	if (suspend_test(TEST_PLATFORM))
-		goto Finish;
+	suspend_enter(state);
 
-	error = disable_nonboot_cpus();
-	if (!error && !suspend_test(TEST_CPUS))
-		suspend_enter(state);
-
-	enable_nonboot_cpus();
- Finish:
-	if (suspend_ops->finish)
-		suspend_ops->finish();
  Resume_devices:
 	suspend_test_start();
 	device_resume(PMSG_RESUME);
-- 
cgit v1.1


From 4aecd6718939eb3c4145b248369b65f7483a8a02 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:26 +0100
Subject: PM: Change hibernation code ordering

Change the ordering of the hibernation core code so that the platform
"prepare" callbacks are executed and the nonboot CPUs are disabled
after calling device drivers' "late suspend" methods.

This change (along with the previous analogous change of the suspend
core code) will allow us to rework the PCI PM core so that the power
state of devices is changed in the "late" phase of suspend (and
analogously in the "early" phase of resume), which in turn will allow
us to avoid the race condition where a device using shared interrupts
is put into a low power state with interrupts enabled and then an
interrupt (for another device) comes in and confuses its driver.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/disk.c | 109 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 320bb09..e886d13 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -228,13 +228,22 @@ static int create_image(int platform_mode)
 		goto Unlock;
 	}
 
+	error = platform_pre_snapshot(platform_mode);
+	if (error || hibernation_test(TEST_PLATFORM))
+		goto Platform_finish;
+
+	error = disable_nonboot_cpus();
+	if (error || hibernation_test(TEST_CPUS)
+	    || hibernation_testmode(HIBERNATION_TEST))
+		goto Enable_cpus;
+
 	local_irq_disable();
 
 	sysdev_suspend(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
-		goto Power_up_devices;
+		goto Enable_irqs;
 	}
 
 	if (hibernation_test(TEST_CORE))
@@ -250,15 +259,22 @@ static int create_image(int platform_mode)
 	restore_processor_state();
 	if (!in_suspend)
 		platform_leave(platform_mode);
+
  Power_up:
 	sysdev_resume();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
 
- Power_up_devices:
+ Enable_irqs:
 	local_irq_enable();
 
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Platform_finish:
+	platform_finish(platform_mode);
+
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 
@@ -298,25 +314,9 @@ int hibernation_snapshot(int platform_mode)
 	if (hibernation_test(TEST_DEVICES))
 		goto Recover_platform;
 
-	error = platform_pre_snapshot(platform_mode);
-	if (error || hibernation_test(TEST_PLATFORM))
-		goto Finish;
-
-	error = disable_nonboot_cpus();
-	if (!error) {
-		if (hibernation_test(TEST_CPUS))
-			goto Enable_cpus;
-
-		if (hibernation_testmode(HIBERNATION_TEST))
-			goto Enable_cpus;
+	error = create_image(platform_mode);
+	/* Control returns here after successful restore */
 
-		error = create_image(platform_mode);
-		/* Control returns here after successful restore */
-	}
- Enable_cpus:
-	enable_nonboot_cpus();
- Finish:
-	platform_finish(platform_mode);
  Resume_devices:
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
@@ -338,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
  *	kernel.
  */
 
-static int resume_target_kernel(void)
+static int resume_target_kernel(bool platform_mode)
 {
 	int error;
 
@@ -351,9 +351,20 @@ static int resume_target_kernel(void)
 		goto Unlock;
 	}
 
+	error = platform_pre_restore(platform_mode);
+	if (error)
+		goto Cleanup;
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto Enable_cpus;
+
 	local_irq_disable();
 
-	sysdev_suspend(PMSG_QUIESCE);
+	error = sysdev_suspend(PMSG_QUIESCE);
+	if (error)
+		goto Enable_irqs;
+
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
 	error = restore_highmem();
@@ -379,8 +390,15 @@ static int resume_target_kernel(void)
 
 	sysdev_resume();
 
+ Enable_irqs:
 	local_irq_enable();
 
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Cleanup:
+	platform_restore_cleanup(platform_mode);
+
 	device_power_up(PMSG_RECOVER);
 
  Unlock:
@@ -405,19 +423,10 @@ int hibernation_restore(int platform_mode)
 	pm_prepare_console();
 	suspend_console();
 	error = device_suspend(PMSG_QUIESCE);
-	if (error)
-		goto Finish;
-
-	error = platform_pre_restore(platform_mode);
 	if (!error) {
-		error = disable_nonboot_cpus();
-		if (!error)
-			error = resume_target_kernel();
-		enable_nonboot_cpus();
+		error = resume_target_kernel(platform_mode);
+		device_resume(PMSG_RECOVER);
 	}
-	platform_restore_cleanup(platform_mode);
-	device_resume(PMSG_RECOVER);
- Finish:
 	resume_console();
 	pm_restore_console();
 	return error;
@@ -453,34 +462,38 @@ int hibernation_platform_enter(void)
 		goto Resume_devices;
 	}
 
+	device_pm_lock();
+
+	error = device_power_down(PMSG_HIBERNATE);
+	if (error)
+		goto Unlock;
+
 	error = hibernation_ops->prepare();
 	if (error)
-		goto Resume_devices;
+		goto Platofrm_finish;
 
 	error = disable_nonboot_cpus();
 	if (error)
-		goto Finish;
-
-	device_pm_lock();
-
-	error = device_power_down(PMSG_HIBERNATE);
-	if (!error) {
-		local_irq_disable();
-		sysdev_suspend(PMSG_HIBERNATE);
-		hibernation_ops->enter();
-		/* We should never get here */
-		while (1);
-	}
+		goto Platofrm_finish;
 
-	device_pm_unlock();
+	local_irq_disable();
+	sysdev_suspend(PMSG_HIBERNATE);
+	hibernation_ops->enter();
+	/* We should never get here */
+	while (1);
 
 	/*
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
 	 * the system is going to be halted anyway.
 	 */
- Finish:
+ Platofrm_finish:
 	hibernation_ops->finish();
 
+	device_power_up(PMSG_RESTORE);
+
+ Unlock:
+	device_pm_unlock();
+
  Resume_devices:
 	entering_platform_hibernation = false;
 	device_resume(PMSG_RESTORE);
-- 
cgit v1.1


From 749b0afc3a9d90dda3319fd1464a3b32fc225361 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:35 +0100
Subject: kexec: Change kexec jump code ordering

Change the ordering of the kexec jump code so that the nonboot CPUs
are disabled after calling device drivers' "late suspend" methods.

This change reflects the recent modifications of the power management
code that is also used by kexec jump.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kexec.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index dade9af..93eed85 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1450,9 +1450,6 @@ int kernel_kexec(void)
 		error = device_suspend(PMSG_FREEZE);
 		if (error)
 			goto Resume_console;
-		error = disable_nonboot_cpus();
-		if (error)
-			goto Resume_devices;
 		device_pm_lock();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
@@ -1463,13 +1460,15 @@ int kernel_kexec(void)
 		 */
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
-			goto Unlock_pm;
-
+			goto Resume_devices;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Enable_cpus;
 		local_irq_disable();
 		/* Suspend system devices */
 		error = sysdev_suspend(PMSG_FREEZE);
 		if (error)
-			goto Power_up_devices;
+			goto Enable_irqs;
 	} else
 #endif
 	{
@@ -1483,13 +1482,13 @@ int kernel_kexec(void)
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
 		sysdev_resume();
- Power_up_devices:
+ Enable_irqs:
 		local_irq_enable();
-		device_power_up(PMSG_RESTORE);
- Unlock_pm:
-		device_pm_unlock();
+ Enable_cpus:
 		enable_nonboot_cpus();
+		device_power_up(PMSG_RESTORE);
  Resume_devices:
+		device_pm_unlock();
 		device_resume(PMSG_RESTORE);
  Resume_console:
 		resume_console();
-- 
cgit v1.1


From 2f8501815256af8498904e68bd0984b1afffd6f8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Mar 2009 11:13:20 +0100
Subject: lockdep: fix deadlock in lockdep_trace_alloc

Heiko reported that we grab the graph lock with irqs enabled.

Fix this by providng the same wrapper as all other lockdep entry
functions have.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <npiggin@suse.de>
LKML-Reference: <1237544000.24626.52.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 022d2ed..3673a3f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2260,7 +2260,7 @@ void trace_softirqs_off(unsigned long ip)
 		debug_atomic_inc(&redundant_softirqs_off);
 }
 
-void lockdep_trace_alloc(gfp_t gfp_mask)
+static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 {
 	struct task_struct *curr = current;
 
@@ -2279,12 +2279,29 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
 	if (!(gfp_mask & __GFP_FS))
 		return;
 
-	if (DEBUG_LOCKS_WARN_ON(irqs_disabled()))
+	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
 		return;
 
 	mark_held_locks(curr, RECLAIM_FS);
 }
 
+static void check_flags(unsigned long flags);
+
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	__lockdep_trace_alloc(gfp_mask, flags);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+
 static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 {
 	/*
-- 
cgit v1.1


From f69b17d7e745d8edd7c0d90390cbaa77e63c5ea3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 17:40:06 +0800
Subject: rcu: rcu_barrier VS cpu_hotplug: Ensure callbacks in dead cpu are
 migrated to online cpu

cpu hotplug may happen asynchronously, some rcu callbacks are maybe
still on dead cpu, rcu_barrier() also needs to wait for these rcu
callbacks to complete, so we must ensure callbacks in dead cpu are
migrated to online cpu.

Paul E. McKenney's review:

  Good stuff, Lai!!!  Simpler than any of the approaches that I was
  considering, and, better yet, independent of the underlying RCU
  implementation!!!

  I was initially worried that wake_up() might wake only one of two
  possible wait_event()s, namely rcu_barrier() and the CPU_POST_DEAD code,
  but the fact that wait_event() clears WQ_FLAG_EXCLUSIVE avoids that issue.
  I was also worried about the fact that different RCU implementations have
  different mappings of call_rcu(), call_rcu_bh(), and call_rcu_sched(), but
  this is OK as well because we just get an extra (harmless) callback in the
  case that they map together (for example, Classic RCU has call_rcu_sched()
  mapping to call_rcu()).

  Overlap of CPU-hotplug operations is prevented by cpu_add_remove_lock,
  and any stray callbacks that arrive (for example, from irq handlers
  running on the dying CPU) either are ahead of the CPU_DYING callbacks on
  the one hand (and thus accounted for), or happened after the rcu_barrier()
  started on the other (and thus don't need to be accounted for).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49C36476.1010400@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cae8a05..2c7b845 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
 	}
 }
 
+static inline void wait_migrated_callbacks(void);
+
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
  * RCU callbacks of the specified type to complete.
@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
 		complete(&rcu_barrier_completion);
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
+	wait_migrated_callbacks();
 }
 
 /**
@@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
+
+static void rcu_migrate_callback(struct rcu_head *notused)
+{
+	if (atomic_dec_and_test(&rcu_migrate_type_count))
+		wake_up(&rcu_migrate_wq);
+}
+
+static inline void wait_migrated_callbacks(void)
+{
+	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+}
+
+static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
+		unsigned long action, void *hcpu)
+{
+	if (action == CPU_DYING) {
+		/*
+		 * preempt_disable() in on_each_cpu() prevents stop_machine(),
+		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
+		 * returns, all online cpus have queued rcu_barrier_func(),
+		 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
+		 *
+		 * These callbacks ensure _rcu_barrier() waits for all
+		 * RCU callbacks of the specified type to complete.
+		 */
+		atomic_set(&rcu_migrate_type_count, 3);
+		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
+		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
+		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
+	} else if (action == CPU_POST_DEAD) {
+		/* rcu_migrate_head is protected by cpu_add_remove_lock */
+		wait_migrated_callbacks();
+	}
+
+	return NOTIFY_OK;
+}
+
 void __init rcu_init(void)
 {
 	__rcu_init();
+	hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
 }
 
 void rcu_scheduler_starting(void)
-- 
cgit v1.1


From e180a6b7759a99a28cbcce3547c4c80822cb6c2a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:29 -0600
Subject: param: fix charp parameters set via sysfs

Impact: fix crash on reading from /sys/module/.../ieee80211_default_rc_algo

The module_param type "charp" simply sets a char * pointer in the
module to the parameter in the commandline string: this is why we keep
the (mangled) module command line around.  But when set via sysfs (as
about 11 charp parameters can be) this memory is freed on the way
out of the write().  Future reads hit random mem.

So we kstrdup instead: we have to check we're not in early commandline
parsing, and we have to note when we've used it so we can reliably
kfree the parameter when it's next overwritten, and also on module
unload.

(Thanks to Randy Dunlap for CONFIG_SYSFS=n fixes)

Reported-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Diagnosed-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Christof Schmitt <christof.schmitt@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 14 ++++++++------
 kernel/params.c | 26 +++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index f77ac32..b862fdb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1491,6 +1491,9 @@ static void free_module(struct module *mod)
 	/* Module unload stuff */
 	module_unload_free(mod);
 
+	/* Free any allocated parameters. */
+	destroy_params(mod->kp, mod->num_kp);
+
 	/* release any pointers to mcount in this module */
 	ftrace_release(mod->module_core, mod->core_size);
 
@@ -1898,8 +1901,7 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int num_kp, num_mcount;
-	struct kernel_param *kp;
+	unsigned int num_mcount;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -2144,8 +2146,8 @@ static noinline struct module *load_module(void __user *umod,
 
 	/* Now we've got everything in the final locations, we can
 	 * find optional sections. */
-	kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
-			  &num_kp);
+	mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
+			       sizeof(*mod->kp), &mod->num_kp);
 	mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
 				 sizeof(*mod->syms), &mod->num_syms);
 	mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2291,11 +2293,11 @@ static noinline struct module *load_module(void __user *umod,
 	 */
 	list_add_rcu(&mod->list, &modules);
 
-	err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
+	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
 	if (err < 0)
 		goto unlink;
 
-	err = mod_sysfs_setup(mod, kp, num_kp);
+	err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
 	if (err < 0)
 		goto unlink;
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025..de273ec 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,9 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 
+/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
+#define KPARAM_KMALLOCED	0x80000000
+
 #if 0
 #define DEBUGP printk
 #else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 		return -ENOSPC;
 	}
 
-	*(char **)kp->arg = (char *)val;
+	if (kp->perm & KPARAM_KMALLOCED)
+		kfree(*(char **)kp->arg);
+
+	/* This is a hack.  We can't need to strdup in early boot, and we
+	 * don't need to; this mangled commandline is preserved. */
+	if (slab_is_available()) {
+		kp->perm |= KPARAM_KMALLOCED;
+		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
+		if (!kp->arg)
+			return -ENOMEM;
+	} else
+		*(const char **)kp->arg = val;
+
 	return 0;
 }
 
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
 }
 #endif
 
+void destroy_params(const struct kernel_param *params, unsigned num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++)
+		if (params[i].perm & KPARAM_KMALLOCED)
+			kfree(*(char **)params[i].arg);
+}
+
 static void __init kernel_add_sysfs_param(const char *name,
 					  struct kernel_param *kparam,
 					  unsigned int name_skip)
-- 
cgit v1.1


From b10153fe31dde3805f8320b61ef147cebe379aee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Am=C3=A9rico=20Wang?= <xiyou.wangcong@gmail.com>
Date: Wed, 25 Mar 2009 00:07:19 +0800
Subject: kernel/module.c: fix an unused goto label

Impact: cleanup

Label 'free_init' is only used when defined(CONFIG_MODULE_UNLOAD) &&
defined(CONFIG_SMP), so move it inside to shut up gcc.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index b862fdb..7af72bb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2319,8 +2319,8 @@ static noinline struct module *load_module(void __user *umod,
 	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
- free_init:
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ free_init:
 	percpu_modfree(mod->refptr);
 #endif
 	module_free(mod, mod->module_init);
-- 
cgit v1.1


From 414fd31b2553aaf160ca9b9afe45aa0372b01b92 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@mit.edu>
Date: Fri, 5 Dec 2008 19:03:56 -0500
Subject: module: Make find_symbol return a struct kernel_symbol

Impact: Cleanup, internal API change

Ksplice needs access to the kernel_symbol structure in order to support
modifications to the exported symbol table.

Cc: Anders Kaseorg <andersk@mit.edu>
Cc: Jeff Arnold <jbarnold@mit.edu>
Signed-off-by: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (bugfix and style)
---
 kernel/module.c | 75 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 37 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 7af72bb..2f0fddf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -283,7 +283,7 @@ struct find_symbol_arg {
 	/* Output */
 	struct module *owner;
 	const unsigned long *crc;
-	unsigned long value;
+	const struct kernel_symbol *sym;
 };
 
 static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +324,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
-	fsa->value = syms->start[symnum].value;
+	fsa->sym = &syms->start[symnum];
 	return true;
 }
 
-/* Find a symbol, return value, (optional) crc and (optional) module
- * which owns it */
-static unsigned long find_symbol(const char *name,
-				 struct module **owner,
-				 const unsigned long **crc,
-				 bool gplok,
-				 bool warn)
+/* Find a symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it */
+static const struct kernel_symbol *find_symbol(const char *name,
+					       struct module **owner,
+					       const unsigned long **crc,
+					       bool gplok,
+					       bool warn)
 {
 	struct find_symbol_arg fsa;
 
@@ -347,11 +347,11 @@ static unsigned long find_symbol(const char *name,
 			*owner = fsa.owner;
 		if (crc)
 			*crc = fsa.crc;
-		return fsa.value;
+		return fsa.sym;
 	}
 
 	DEBUGP("Failed to find symbol %s\n", name);
-	return -ENOENT;
+	return NULL;
 }
 
 /* Search for module by name: must hold module_mutex. */
@@ -894,7 +894,7 @@ void __symbol_put(const char *symbol)
 	struct module *owner;
 
 	preempt_disable();
-	if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
+	if (!find_symbol(symbol, &owner, NULL, true, false))
 		BUG();
 	module_put(owner);
 	preempt_enable();
@@ -1057,7 +1057,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
-	if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
+	if (!find_symbol("struct_module", NULL, &crc, true, false))
 		BUG();
 	return check_version(sechdrs, versindex, "struct_module", mod, crc);
 }
@@ -1098,25 +1098,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
 
 /* Resolve a symbol for this module.  I.e. if we find one, record usage.
    Must be holding module_mutex. */
-static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
-				    unsigned int versindex,
-				    const char *name,
-				    struct module *mod)
+static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
+						  unsigned int versindex,
+						  const char *name,
+						  struct module *mod)
 {
 	struct module *owner;
-	unsigned long ret;
+	const struct kernel_symbol *sym;
 	const unsigned long *crc;
 
-	ret = find_symbol(name, &owner, &crc,
+	sym = find_symbol(name, &owner, &crc,
 			  !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
-	if (!IS_ERR_VALUE(ret)) {
-		/* use_module can fail due to OOM,
-		   or module initialization or unloading */
+	/* use_module can fail due to OOM,
+	   or module initialization or unloading */
+	if (sym) {
 		if (!check_version(sechdrs, versindex, name, mod, crc) ||
 		    !use_module(mod, owner))
-			ret = -EINVAL;
+			sym = NULL;
 	}
-	return ret;
+	return sym;
 }
 
 /*
@@ -1516,17 +1516,15 @@ static void free_module(struct module *mod)
 void *__symbol_get(const char *symbol)
 {
 	struct module *owner;
-	unsigned long value;
+	const struct kernel_symbol *sym;
 
 	preempt_disable();
-	value = find_symbol(symbol, &owner, NULL, true, true);
-	if (IS_ERR_VALUE(value))
-		value = 0;
-	else if (strong_try_module_get(owner))
-		value = 0;
+	sym = find_symbol(symbol, &owner, NULL, true, true);
+	if (sym && strong_try_module_get(owner))
+		sym = NULL;
 	preempt_enable();
 
-	return (void *)value;
+	return sym ? (void *)sym->value : NULL;
 }
 EXPORT_SYMBOL_GPL(__symbol_get);
 
@@ -1554,8 +1552,7 @@ static int verify_export_symbols(struct module *mod)
 
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
 		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
-			if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
-						      NULL, true, false))) {
+			if (find_symbol(s->name, &owner, NULL, true, false)) {
 				printk(KERN_ERR
 				       "%s: exports duplicate symbol %s"
 				       " (owned by %s)\n",
@@ -1579,6 +1576,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 	unsigned long secbase;
 	unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
 	int ret = 0;
+	const struct kernel_symbol *ksym;
 
 	for (i = 1; i < n; i++) {
 		switch (sym[i].st_shndx) {
@@ -1598,13 +1596,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 			break;
 
 		case SHN_UNDEF:
-			sym[i].st_value
-			  = resolve_symbol(sechdrs, versindex,
-					   strtab + sym[i].st_name, mod);
-
+			ksym = resolve_symbol(sechdrs, versindex,
+					      strtab + sym[i].st_name, mod);
 			/* Ok if resolved.  */
-			if (!IS_ERR_VALUE(sym[i].st_value))
+			if (ksym) {
+				sym[i].st_value = ksym->value;
 				break;
+			}
+
 			/* Ok if weak.  */
 			if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
 				break;
-- 
cgit v1.1


From e610499e2656e61975affd0af56b26eb73964c84 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:31 -0600
Subject: module: __module_address

Impact: New API, cleanup

ksplice wants to know the bounds of a module, not just the module text.

It makes sense to have __module_address.  We then implement
is_module_address and __module_text_address in terms of this (and
change is_module_text_address() to bool while we're at it).

Also, add proper kerneldoc for them all.

Cc: Anders Kaseorg <andersk@mit.edu>
Cc: Jeff Arnold <jbarnold@mit.edu>
Cc: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 76 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2f0fddf..bd15a94 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -76,7 +76,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
-/* Bounds of module allocation, for speeding __module_text_address */
+/* Bounds of module allocation, for speeding __module_address */
 static unsigned long module_addr_min = -1UL, module_addr_max = 0;
 
 int register_module_notifier(struct notifier_block * nb)
@@ -2745,29 +2745,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
 }
 
 /*
- * Is this a valid module address?
+ * is_module_address - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
  */
-int is_module_address(unsigned long addr)
+bool is_module_address(unsigned long addr)
 {
-	struct module *mod;
+	bool ret;
 
 	preempt_disable();
-
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (within_module_core(addr, mod)) {
-			preempt_enable();
-			return 1;
-		}
-	}
-
+	ret = __module_address(addr) != NULL;
 	preempt_enable();
 
-	return 0;
+	return ret;
 }
 
-
-/* Is this a valid kernel address? */
-__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
+/*
+ * __module_address - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+__notrace_funcgraph struct module *__module_address(unsigned long addr)
 {
 	struct module *mod;
 
@@ -2775,12 +2777,50 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
 		return NULL;
 
 	list_for_each_entry_rcu(mod, &modules, list)
-		if (within(addr, mod->module_init, mod->init_text_size)
-		    || within(addr, mod->module_core, mod->core_text_size))
+		if (within_module_core(addr, mod)
+		    || within_module_init(addr, mod))
 			return mod;
 	return NULL;
 }
 
+/*
+ * is_module_text_address - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module.  See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+	bool ret;
+
+	preempt_disable();
+	ret = __module_text_address(addr) != NULL;
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * __module_text_address - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+	struct module *mod = __module_address(addr);
+	if (mod) {
+		/* Make sure it's within the text section. */
+		if (!within(addr, mod->module_init, mod->init_text_size)
+		    && !within(addr, mod->module_core, mod->core_text_size))
+			mod = NULL;
+	}
+	return mod;
+}
+
 struct module *module_text_address(unsigned long addr)
 {
 	struct module *mod;
-- 
cgit v1.1


From a6e6abd575fcbe6572ebc7a70ad616406d206fa8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:31 -0600
Subject: module: remove module_text_address()

Impact: Replace and remove risky (non-EXPORTed) API

module_text_address() returns a pointer to the module, which given locking
improvements in module.c, is useless except to test for NULL:

1) If the module can't go away, use __module_text_address.
2) Otherwise, just use is_module_text_address().

Cc: linux-mtd@lists.infradead.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/extable.c |  6 +++---
 kernel/module.c  | 17 ++++-------------
 2 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8..384f0da 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -58,14 +58,14 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
-	return __module_text_address(addr) != NULL;
+	return is_module_text_address(addr);
 }
 
 int kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
-	return module_text_address(addr) != NULL;
+	return is_module_text_address(addr);
 }
 
 /*
@@ -81,5 +81,5 @@ int func_ptr_is_kernel_text(void *ptr)
 	addr = (unsigned long) dereference_function_descriptor(ptr);
 	if (core_kernel_text(addr))
 		return 1;
-	return module_text_address(addr) != NULL;
+	return is_module_text_address(addr);
 }
diff --git a/kernel/module.c b/kernel/module.c
index bd15a94..8ddca62 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -908,8 +908,10 @@ void symbol_put_addr(void *addr)
 	if (core_kernel_text((unsigned long)addr))
 		return;
 
-	if (!(modaddr = module_text_address((unsigned long)addr)))
-		BUG();
+	/* module_text_address is safe here: we're supposed to have reference
+	 * to module from symbol_get, so it can't go away. */
+	modaddr = __module_text_address((unsigned long)addr);
+	BUG_ON(!modaddr);
 	module_put(modaddr);
 }
 EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -2821,17 +2823,6 @@ struct module *__module_text_address(unsigned long addr)
 	return mod;
 }
 
-struct module *module_text_address(unsigned long addr)
-{
-	struct module *mod;
-
-	preempt_disable();
-	mod = __module_text_address(addr);
-	preempt_enable();
-
-	return mod;
-}
-
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
 {
-- 
cgit v1.1


From 75a66614db21007bcc8c37f9c5d5b922981387b9 Mon Sep 17 00:00:00 2001
From: Anders Kaseorg <andersk@mit.edu>
Date: Fri, 5 Dec 2008 19:03:58 -0500
Subject: Ksplice: Add functions for walking kallsyms symbols

Impact: New API

kallsyms_lookup_name only returns the first match that it finds.  Ksplice
needs information about all symbols with a given name in order to correctly
resolve local symbols.

kallsyms_on_each_symbol provides a generic mechanism for iterating over the
kallsyms table.

Cc: Jeff Arnold <jbarnold@mit.edu>
Cc: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/kallsyms.c | 19 +++++++++++++++++++
 kernel/module.c   | 19 +++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f2..374faf9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+				      unsigned long),
+			    void *data)
+{
+	char namebuf[KSYM_NAME_LEN];
+	unsigned long i;
+	unsigned int off;
+	int ret;
+
+	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+		off = kallsyms_expand_symbol(off, namebuf);
+		ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+		if (ret != 0)
+			return ret;
+	}
+	return module_kallsyms_on_each_symbol(fn, data);
+}
+EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
+
 static unsigned long get_symbol_pos(unsigned long addr,
 				    unsigned long *symbolsize,
 				    unsigned long *offset)
diff --git a/kernel/module.c b/kernel/module.c
index 8ddca62..dd4389b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2612,6 +2612,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	preempt_enable();
 	return ret;
 }
+
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data)
+{
+	struct module *mod;
+	unsigned int i;
+	int ret;
+
+	list_for_each_entry(mod, &modules, list) {
+		for (i = 0; i < mod->num_symtab; i++) {
+			ret = fn(data, mod->strtab + mod->symtab[i].st_name,
+				 mod, mod->symtab[i].st_value);
+			if (ret != 0)
+				return ret;
+		}
+	}
+	return 0;
+}
 #endif /* CONFIG_KALLSYMS */
 
 static char *module_flags(struct module *mod, char *buf)
-- 
cgit v1.1


From c6b37801911d7f4663c99cad8aa230bc934cea82 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@mit.edu>
Date: Fri, 5 Dec 2008 19:03:59 -0500
Subject: module: Export symbols needed for Ksplice

Impact: Expose some module.c symbols

Ksplice uses several functions from module.c in order to resolve
symbols and implement dependency handling.  Calling these functions
requires holding module_mutex, so it is exported.

(This is just the module part of a bigger add-exports patch from Tim).

Cc: Anders Kaseorg <andersk@mit.edu>
Cc: Jeff Arnold <jbarnold@mit.edu>
Signed-off-by: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index dd4389b..5fd0076 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -68,7 +68,8 @@
 
 /* List of modules, protected by module_mutex or preempt_disable
  * (delete uses stop_machine/add uses RCU list operations). */
-static DEFINE_MUTEX(module_mutex);
+DEFINE_MUTEX(module_mutex);
+EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
 
 /* Waiting for a module to finish initializing? */
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
-struct symsearch {
-	const struct kernel_symbol *start, *stop;
-	const unsigned long *crcs;
-	enum {
-		NOT_GPL_ONLY,
-		GPL_ONLY,
-		WILL_BE_GPL_ONLY,
-	} licence;
-	bool unused;
-};
-
 static bool each_symbol_in_section(const struct symsearch *arr,
 				   unsigned int arrsize,
 				   struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
 }
 
 /* Returns true as soon as fn returns true, otherwise false. */
-static bool each_symbol(bool (*fn)(const struct symsearch *arr,
-				   struct module *owner,
-				   unsigned int symnum, void *data),
-			void *data)
+bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
+			    unsigned int symnum, void *data), void *data)
 {
 	struct module *mod;
 	const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
 	}
 	return false;
 }
+EXPORT_SYMBOL_GPL(each_symbol);
 
 struct find_symbol_arg {
 	/* Input */
@@ -330,11 +319,11 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 
 /* Find a symbol and return it, along with, (optional) crc and
  * (optional) module which owns it */
-static const struct kernel_symbol *find_symbol(const char *name,
-					       struct module **owner,
-					       const unsigned long **crc,
-					       bool gplok,
-					       bool warn)
+const struct kernel_symbol *find_symbol(const char *name,
+					struct module **owner,
+					const unsigned long **crc,
+					bool gplok,
+					bool warn)
 {
 	struct find_symbol_arg fsa;
 
@@ -353,9 +342,10 @@ static const struct kernel_symbol *find_symbol(const char *name,
 	DEBUGP("Failed to find symbol %s\n", name);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(find_symbol);
 
 /* Search for module by name: must hold module_mutex. */
-static struct module *find_module(const char *name)
+struct module *find_module(const char *name)
 {
 	struct module *mod;
 
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(find_module);
 
 #ifdef CONFIG_SMP
 
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
 }
 
 /* Module a uses b */
-static int use_module(struct module *a, struct module *b)
+int use_module(struct module *a, struct module *b)
 {
 	struct module_use *use;
 	int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
 	no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
 	return 1;
 }
+EXPORT_SYMBOL_GPL(use_module);
 
 /* Clear the unload stuff of the module. */
 static void module_unload_free(struct module *mod)
@@ -951,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
 {
 }
 
-static inline int use_module(struct module *a, struct module *b)
+int use_module(struct module *a, struct module *b)
 {
 	return strong_try_module_get(b) == 0;
 }
+EXPORT_SYMBOL_GPL(use_module);
 
 static inline void module_unload_init(struct module *mod)
 {
@@ -2803,6 +2796,7 @@ __notrace_funcgraph struct module *__module_address(unsigned long addr)
 			return mod;
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(__module_address);
 
 /*
  * is_module_text_address - is this address inside module code?
@@ -2841,6 +2835,7 @@ struct module *__module_text_address(unsigned long addr)
 	}
 	return mod;
 }
+EXPORT_SYMBOL_GPL(__module_text_address);
 
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
-- 
cgit v1.1


From c6e665c8f0c18ab3686117905765b5139efd6ebd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:33 -0600
Subject: module: clarify the force-loading taint message.

Impact: Message cleanup

Two of three callers of try_to_force_load() are not because of a
missing version, so change the messages:

Old:
	<modname>: no version for "magic" found: kernel tainted.
New:
	<modname>: bad vermagic: kernel tainted.

Old:
	<modname>: no version for "nocrc" found: kernel tainted.
New:
	<modname>: no versions for exported symbols: kernel tainted.

Old:
	<modname>: no version for "<symname>" found: kernel tainted.
New:
	<modname>: <symname>: kernel tainted.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5fd0076..599fc85 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -990,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
 
 static const char vermagic[] = VERMAGIC_STRING;
 
-static int try_to_force_load(struct module *mod, const char *symname)
+static int try_to_force_load(struct module *mod, const char *reason)
 {
 #ifdef CONFIG_MODULE_FORCE_LOAD
 	if (!test_taint(TAINT_FORCED_MODULE))
-		printk("%s: no version for \"%s\" found: kernel tainted.\n",
-		       mod->name, symname);
+		printk(KERN_WARNING "%s: %s: kernel tainted.\n",
+		       mod->name, reason);
 	add_taint_module(mod, TAINT_FORCED_MODULE);
 	return 0;
 #else
@@ -2002,7 +2002,7 @@ static noinline struct module *load_module(void __user *umod,
 	modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
 	/* This is allowed: modprobe --force will invalidate it. */
 	if (!modmagic) {
-		err = try_to_force_load(mod, "magic");
+		err = try_to_force_load(mod, "bad vermagic");
 		if (err)
 			goto free_hdr;
 	} else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2191,8 +2191,8 @@ static noinline struct module *load_module(void __user *umod,
 	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
 #endif
 		) {
-		printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
-		err = try_to_force_load(mod, "nocrc");
+		err = try_to_force_load(mod,
+					"no versions for exported symbols");
 		if (err)
 			goto cleanup;
 	}
-- 
cgit v1.1


From 9cb610d8e35fe3ec95a2fe2030b02f85aeea83c1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:33 -0600
Subject: module: remove the SHF_ALLOC flag on the __versions section.

Impact: reduce kernel memory usage

This patch just takes off the SHF_ALLOC flag on __versions so we don't
keep them around after module load.

This saves about 7% of module memory if CONFIG_MODVERSIONS=y.

Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 599fc85..784bf6d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1961,6 +1961,9 @@ static noinline struct module *load_module(void __user *umod,
 		if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
 			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
+		/* Don't keep __versions around; it's just for loading. */
+		if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
+			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	}
 
 	modindex = find_sec(hdr, sechdrs, secstrings,
-- 
cgit v1.1


From 8c8ef42aee8fcfb4128bb94c50d55c9f80ade525 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:34 -0600
Subject: module: include other structures in module version check

With CONFIG_MODVERSIONS, we version 'struct module' using a dummy
export, but other things matter too:

1) 'struct modversion_info' determines the layout of the __versions section,
2) 'struct kernel_param' determines the layout of the __params section,
3) 'struct kernel_symbol' determines __ksymtab*.
4) 'struct marker' determines __markers.
5) 'struct tracepoint' determines __tracepoints.

So we rename 'struct_module' to 'module_layout' and include these in
the signature.  Now it's general we can add others later on without
confusion.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 784bf6d..e8cf636 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1052,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
-	if (!find_symbol("struct_module", NULL, &crc, true, false))
+	if (!find_symbol("module_layout", NULL, &crc, true, false))
 		BUG();
-	return check_version(sechdrs, versindex, "struct_module", mod, crc);
+	return check_version(sechdrs, versindex, "module_layout", mod, crc);
 }
 
 /* First part is kernel version, which we ignore if module has crcs. */
@@ -2858,9 +2858,17 @@ void print_modules(void)
 }
 
 #ifdef CONFIG_MODVERSIONS
-/* Generate the signature for struct module here, too, for modversions. */
-void struct_module(struct module *mod) { return; }
-EXPORT_SYMBOL(struct_module);
+/* Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module. */
+void module_layout(struct module *mod,
+		   struct modversion_info *ver,
+		   struct kernel_param *kp,
+		   struct kernel_symbol *ks,
+		   struct marker *marker,
+		   struct tracepoint *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
 #endif
 
 #ifdef CONFIG_MARKERS
-- 
cgit v1.1


From acae05156551fd7528fbb616271e672789388e3c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 8 Feb 2009 10:42:01 -0800
Subject: module: create a request_module_nowait()

There seems to be a common pattern in the kernel where drivers want to
call request_module() from inside a module_init() function. Currently
this would deadlock.

As a result, several drivers go through hoops like scheduling things via
kevent, or creating custom work queues (because kevent can deadlock on them).

This patch changes this to use a request_module_nowait() function macro instead,
which just fires the modprobe off but doesn't wait for it, and thus avoids the
original deadlock entirely.

On my laptop this already results in one less kernel thread running..

(Includes Jiri's patch to use enum umh_wait)

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (bool-ified)
Cc: Jiri Slaby <jirislaby@gmail.com>
---
 kernel/kmod.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index f0c8f54..b750675 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
 char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
 /**
- * request_module - try to load a kernel module
+ * __request_module - try to load a kernel module
+ * @wait: wait (or not) for the operation to complete
  * @fmt: printf style format string for the name of the module
  * @...: arguments as specified in the format string
  *
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
  */
-int request_module(const char *fmt, ...)
+int __request_module(bool wait, const char *fmt, ...)
 {
 	va_list args;
 	char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
 		return -ENOMEM;
 	}
 
-	ret = call_usermodehelper(modprobe_path, argv, envp, 1);
+	ret = call_usermodehelper(modprobe_path, argv, envp,
+			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 	atomic_dec(&kmod_concurrent);
 	return ret;
 }
-EXPORT_SYMBOL(request_module);
+EXPORT_SYMBOL(__request_module);
 #endif /* CONFIG_MODULES */
 
 struct subprocess_info {
-- 
cgit v1.1


From e91defa26c527ceeaff6266c55cdc7e17c9081a2 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:35 -0600
Subject: module: don't use stop_machine on module load

Kay Sievers <kay.sievers@vrfy.org> discovered that boot times are slowed
by about half a second because all the stop_machine_create() calls,
and he only probes about 40 modules (I have 125 loaded on this laptop).

We only do stop_machine_create() so we can unlink the module if
something goes wrong, but it's overkill (and buggy anyway: if
stop_machine_create() fails we still call stop_machine_destroy()).

Since we are only protecting against kallsyms (esp. oops) walking the
list, synchronize_sched() is sufficient (synchronize_rcu() is probably
sufficient, but we're not in a hurry).

Kay says of this patch:
	... no module takes more than 40 millisecs to link now, most of
	them are between 3 and 8 millisecs.

	That looks very different to the numbers without this patch
	and the otherwise same setup, where we get heavy noise in the
	traces and many delays of up to 200 millisecs until linking,
	most of them taking 30+ millisecs.

Tested-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index e8cf636..1a9a398 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1912,12 +1912,6 @@ static noinline struct module *load_module(void __user *umod,
 	if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	/* Create stop_machine threads since the error path relies on
-	 * a non-failing stop_machine call. */
-	err = stop_machine_create();
-	if (err)
-		goto free_hdr;
-
 	if (copy_from_user(hdr, umod, len) != 0) {
 		err = -EFAULT;
 		goto free_hdr;
@@ -2303,12 +2297,13 @@ static noinline struct module *load_module(void __user *umod,
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
-	stop_machine_destroy();
 	/* Done! */
 	return mod;
 
  unlink:
-	stop_machine(__unlink_module, mod, NULL);
+	/* Unlink carefully: kallsyms could be walking list. */
+	list_del_rcu(&mod->list);
+	synchronize_sched();
 	module_arch_cleanup(mod);
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
@@ -2331,7 +2326,6 @@ static noinline struct module *load_module(void __user *umod,
 	kfree(args);
  free_hdr:
 	vfree(hdr);
-	stop_machine_destroy();
 	return ERR_PTR(err);
 
  truncated:
-- 
cgit v1.1


From 49502677e11079c2e3e01867c922a894ce06a8be Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:36 -0600
Subject: module: use strstarts()

Impact: minor cleanup.

I'm not going to neaten anyone else's code, but I'm happy to clean up
my own.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1a9a398..f6e08b7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1673,8 +1673,7 @@ static void layout_sections(struct module *mod,
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL
-			    || strncmp(secstrings + s->sh_name,
-				       ".init", 5) == 0)
+			    || strstarts(secstrings + s->sh_name, ".init"))
 				continue;
 			s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
 			DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1691,8 +1690,7 @@ static void layout_sections(struct module *mod,
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL
-			    || strncmp(secstrings + s->sh_name,
-				       ".init", 5) != 0)
+			    || !strstarts(secstrings + s->sh_name, ".init"))
 				continue;
 			s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
 					 | INIT_OFFSET_MASK);
@@ -1825,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
 		else
 			return 'b';
 	}
-	if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
-		    ".debug", strlen(".debug")) == 0)
+	if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
 		return 'n';
 	return '?';
 }
@@ -1952,7 +1949,7 @@ static noinline struct module *load_module(void __user *umod,
 		}
 #ifndef CONFIG_MODULE_UNLOAD
 		/* Don't load .exit sections */
-		if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
+		if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
 			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
 		/* Don't keep __versions around; it's just for loading. */
-- 
cgit v1.1


From 7f1e2ca9f04b02794597f60e7b1d43f0a1317939 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:27 +0100
Subject: hrtimer: fix rq->lock inversion (again)

It appears I inadvertly introduced rq->lock recursion to the
hrtimer_start() path when I delegated running already expired
timers to softirq context.

This patch fixes it by introducing a __hrtimer_start_range_ns()
method that will not use raise_softirq_irqoff() but
__raise_softirq_irqoff() which avoids the wakeup.

It then also changes schedule() to check for pending softirqs and
do the wakeup then, I'm not quite sure I like this last bit, nor
am I convinced its really needed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
LKML-Reference: <20090313112301.096138802@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 55 ++++++++++++++++++++++++++++++++++---------------------
 kernel/sched.c   | 14 +++++++++++---
 kernel/softirq.c |  2 +-
 3 files changed, 46 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f394d2a..cb8a15c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
  * and expiry check is done in the hrtimer_interrupt or in the softirq.
  */
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-					    struct hrtimer_clock_base *base)
+					    struct hrtimer_clock_base *base,
+					    int wakeup)
 {
 	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
-		spin_unlock(&base->cpu_base->lock);
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-		spin_lock(&base->cpu_base->lock);
+		if (wakeup) {
+			spin_unlock(&base->cpu_base->lock);
+			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+			spin_lock(&base->cpu_base->lock);
+		} else
+			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+
 		return 1;
 	}
+
 	return 0;
 }
 
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
 static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-					    struct hrtimer_clock_base *base)
+					    struct hrtimer_clock_base *base,
+					    int wakeup)
 {
 	return 0;
 }
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 	return 0;
 }
 
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @delta_ns:	"slack" range for the timer
- * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
-			const enum hrtimer_mode mode)
+int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+		unsigned long delta_ns, const enum hrtimer_mode mode,
+		int wakeup)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 	 * XXX send_remote_softirq() ?
 	 */
 	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
-		hrtimer_enqueue_reprogram(timer, new_base);
+		hrtimer_enqueue_reprogram(timer, new_base, wakeup);
 
 	unlock_hrtimer_base(timer, &flags);
 
 	return ret;
 }
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+		unsigned long delta_ns, const enum hrtimer_mode mode)
+{
+	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
+}
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 int
 hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
-	return hrtimer_start_range_ns(timer, tim, 0, mode);
+	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 196d48b..63256e3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
 	spin_lock(&rt_b->rt_runtime_lock);
 	for (;;) {
+		unsigned long delta;
+		ktime_t soft, hard;
+
 		if (hrtimer_active(&rt_b->rt_period_timer))
 			break;
 
 		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-		hrtimer_start_expires(&rt_b->rt_period_timer,
-				HRTIMER_MODE_ABS);
+
+		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+				HRTIMER_MODE_ABS, 0);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
  */
 static void hrtick_start(struct rq *rq, u64 delay)
 {
-	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+			HRTIMER_MODE_REL, 0);
 }
 
 static inline void init_hrtick(void)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4877516..accc851 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
  * to the pending events, so lets the scheduler to balance
  * the softirq load for us.
  */
-static inline void wakeup_softirqd(void)
+void wakeup_softirqd(void)
 {
 	/* Interrupts are disabled: no need to stop preemption */
 	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
-- 
cgit v1.1


From eedeeabdeeadb016b8c783e3620d06b98d0cb4e1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 18 Mar 2009 12:38:47 +0100
Subject: lockdep: add stack dumps to asserts

Have a better idea about exactly which loc causes a lockdep
limit overflow. Often it's a bug or inefficiency in that
subsystem.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1237376327.5069.253.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 981cd48..a288ae1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -792,6 +792,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 		printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return NULL;
 	}
 	class = lock_classes + nr_lock_classes++;
@@ -855,6 +856,7 @@ static struct lock_list *alloc_list_entry(void)
 
 		printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return NULL;
 	}
 	return list_entries + nr_list_entries++;
@@ -1681,6 +1683,7 @@ cache_hit:
 
 		printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return 0;
 	}
 	chain = lock_chains + nr_lock_chains++;
@@ -2540,6 +2543,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		debug_locks_off();
 		printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return 0;
 	}
 
@@ -2636,6 +2640,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		debug_locks_off();
 		printk("BUG: MAX_LOCK_DEPTH too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return 0;
 	}
 
-- 
cgit v1.1


From 6c051ce0307526adec32a847f0daa1af2124f0a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 17:18:56 +0800
Subject: blktrace: fix timestamp in binary output

I found the timestamp is wrong:

 # echo bin > trace_option
 # echo blk > current_tracer
 # cat trace_pipe | blkparse -i -
 8,0    0        0     0.000000000   504  A   W ...
 ...
 8,7    1        0     0.008534097     0  C   R ...
            (should be 8.534097xxx)

user-space blkparse expects the timestamp to be nanosecond.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6fb274f..ee7a8bb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1168,7 +1168,7 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 	const int offset = offsetof(struct blk_io_trace, sector);
 	struct blk_io_trace old = {
 		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
-		.time     = ns2usecs(iter->ts),
+		.time     = iter->ts,
 	};
 
 	if (!trace_seq_putmem(s, &old, offset))
-- 
cgit v1.1


From b5230b56ee6caeb27cedb7753c0c319646383bb4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 17:19:33 +0800
Subject: blktrace: fix a race when creating blk_tree_root in debugfs

t1                                t2
------                            ------
do_blk_trace_setup()              do_blk_trace_setup()
  if (!blk_tree_root) {
                                    if (!blk_tree_root)
    blk_tree_root = create_dir()
                                      blk_tree_root = create_dir();
                                      (now blk_tree_root == NULL)
  ...
  dir = create_dir(name, blk_tree_root);

Due to this race, t1 will create 'dir' in /debugfs but not /debugfs/block.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ee7a8bb..95f89fa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -426,11 +426,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
 	ret = -ENOENT;
 
+	mutex_lock(&blk_tree_mutex);
 	if (!blk_tree_root) {
 		blk_tree_root = debugfs_create_dir("block", NULL);
-		if (!blk_tree_root)
+		if (!blk_tree_root) {
+			mutex_unlock(&blk_tree_mutex);
 			goto err;
+		}
 	}
+	mutex_unlock(&blk_tree_mutex);
 
 	dir = debugfs_create_dir(buts->name, blk_tree_root);
 
-- 
cgit v1.1


From 5554720482a631702146a959db22fe417740e0a6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 17:21:26 +0800
Subject: blktrace: fix the original blktrace

Currently the original blktrace, which is using relay and is used via
ioctl, is broken. You can use ftrace to see the output of blktrace,
but user-space blktrace is unusable.

It's broken by "blktrace: add ftrace plugin"
(c71a896154119f4ca9e89d6078f5f63ad60ef199)

 -	if (unlikely(bt->trace_state != Blktrace_running))
 +	if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled))
		return;

With this patch, both ioctl and ftrace can be used, but of course you
can't use both of them at the same time.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 95f89fa..a7f7ff5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -110,7 +110,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	unsigned long flags;
 	char *buf;
 
-	if (blk_tr) {
+	if (blk_tracer_enabled) {
 		va_start(args, fmt);
 		ftrace_vprintk(fmt, args);
 		va_end(args);
@@ -169,7 +169,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	pid_t pid;
 	int cpu, pc = 0;
 
-	if (unlikely(bt->trace_state != Blktrace_running ||
+	if (unlikely(bt->trace_state != Blktrace_running &&
 		     !blk_tracer_enabled))
 		return;
 
@@ -185,7 +185,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 	cpu = raw_smp_processor_id();
 
-	if (blk_tr) {
+	if (blk_tracer_enabled) {
 		tracing_record_cmdline(current);
 
 		pc = preempt_count();
@@ -235,7 +235,7 @@ record_it:
 		if (pdu_len)
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 
-		if (blk_tr) {
+		if (blk_tracer_enabled) {
 			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 			return;
 		}
@@ -267,8 +267,7 @@ int blk_trace_remove(struct request_queue *q)
 	if (!bt)
 		return -EINVAL;
 
-	if (bt->trace_state == Blktrace_setup ||
-	    bt->trace_state == Blktrace_stopped)
+	if (bt->trace_state != Blktrace_running)
 		blk_trace_cleanup(bt);
 
 	return 0;
@@ -1273,7 +1272,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	bt->dev = dev;
 	bt->act_mask = (u16)-1;
 	bt->end_lba = -1ULL;
-	bt->trace_state = Blktrace_running;
 
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
-- 
cgit v1.1


From eb08f8eb0673d9c1e62b69ad1b41593e73c40467 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:05:27 +0800
Subject: blktrace: fix off-by-one bug

'what' is used as the index of array what2act, so it can't >= the array size.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a7f7ff5..d43cdac 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1152,7 +1152,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 	if (!trace_print_context(iter))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
 	else {
 		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
@@ -1199,7 +1199,7 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 	t = (const struct blk_io_trace *)iter->ent;
 	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
 
-	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
 	else {
 		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-- 
cgit v1.1


From 35ac51bfe4c293b67ce9f85082ba0b9bc6123c40 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:19:46 +0800
Subject: blktrace: make classic output more classic

Impact: fix ftrace plugin timestamp output

In the classic user-space blktrace, the output timestamp is sec.nsec
not sec.usec.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d43cdac..5b28f0f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -994,8 +994,8 @@ static void get_pdu_remap(const struct trace_entry *ent,
 static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
 {
 	char rwbs[6];
-	unsigned long long ts  = ns2usecs(iter->ts);
-	unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
+	unsigned long long ts  = iter->ts;
+	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
 	unsigned secs	       = (unsigned long)ts;
 	const struct trace_entry *ent = iter->ent;
 	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
@@ -1003,9 +1003,9 @@ static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
 	fill_rwbs(rwbs, t);
 
 	return trace_seq_printf(&iter->seq,
-				"%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
+				"%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
 				MAJOR(t->device), MINOR(t->device), iter->cpu,
-				secs, usec_rem, ent->pid, act, rwbs);
+				secs, nsec_rem, ent->pid, act, rwbs);
 }
 
 static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
-- 
cgit v1.1


From 17ba97e347bec9bbc47a0877c7a098708982129d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:20:09 +0800
Subject: blktrace: fix blk_probes_ref chaos

Impact: fix mixed ioctl and ftrace-plugin blktrace use refcount bugs

ioctl-based blktrace allocates bt and registers tracepoints when
ioctl(BLKTRACESETUP), and do all cleanups when ioctl(BLKTRACETEARDOWN).

while ftrace-based blktrace allocates/frees bt when:
  # echo 1/0 > /sys/block/sda/sda1/trace/enable

and registers/unregisters tracepoints when:
  # echo blk/nop > /debugfs/tracing/current_tracer
or
  # echo 1/0 > /debugfs/tracing/tracing_enable

The separatation of allocation and registeration causes 2 problems:

  1. current user-space blktrace still calls ioctl(TEARDOWN) when
     ioctl(SETUP) failed:
       # echo 1 > /sys/block/sda/sda1/trace/enable
       # blktrace /dev/sda
         BLKTRACESETUP: Device or resource busy
         ^C
     and now blk_probes_ref == -1

  2. Another way to make blk_probes_ref == -1:
     # plugin sdb && mount sdb1
     # echo 1 > /sys/block/sdb/sdb1/trace/enable
     # remove sdb

This patch does the allocation and registeration when writing
sdaX/trace/enable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5b28f0f..8d6bd12 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -478,7 +478,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		goto err;
 	}
 
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
+	if (atomic_inc_return(&blk_probes_ref) == 1)
 		blk_register_tracepoints();
 
 	return 0;
@@ -1091,8 +1091,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		blk_register_tracepoints();
 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
@@ -1107,15 +1105,10 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
 {
-	if (!atomic_read(&blk_probes_ref))
-		return;
-
 	blk_tracer_enabled = false;
 	blk_tracer_stop(tr);
 }
@@ -1254,6 +1247,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (bt == NULL)
 		return -EINVAL;
 
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+
 	kfree(bt);
 	return 0;
 }
@@ -1280,6 +1276,9 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 		return -EBUSY;
 	}
 
+	if (atomic_inc_return(&blk_probes_ref) == 1)
+		blk_register_tracepoints();
+
 	return 0;
 }
 
-- 
cgit v1.1


From ad5dd5493a55e462796e42e50a49e76df76fdb05 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:20:24 +0800
Subject: blktrace: fix memory leak when freeing struct blk_io_trace

Impact: fix mixed ioctl and ftrace-plugin blktrace use memory leak

When mixing the use of ioctl-based blktrace and ftrace-based blktrace,
we can leak memory in this way:

  # btrace /dev/sda > /dev/null &
  # echo 0 > /sys/block/sda/sda1/trace/enable

now we leak bt->dropped_file, bt->msg_file, bt->rchan...

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8d6bd12..2f21d77 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -247,7 +247,7 @@ record_it:
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 
-static void blk_trace_cleanup(struct blk_trace *bt)
+static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
@@ -255,6 +255,11 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	blk_trace_free(bt);
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
 }
@@ -410,11 +415,11 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		if (buts->name[i] == '/')
 			buts->name[i] = '_';
 
-	ret = -ENOMEM;
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
-		goto err;
+		return -ENOMEM;
 
+	ret = -ENOMEM;
 	bt->sequence = alloc_percpu(unsigned long);
 	if (!bt->sequence)
 		goto err;
@@ -483,17 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
 	return 0;
 err:
-	if (bt) {
-		if (bt->msg_file)
-			debugfs_remove(bt->msg_file);
-		if (bt->dropped_file)
-			debugfs_remove(bt->dropped_file);
-		free_percpu(bt->sequence);
-		free_percpu(bt->msg_data);
-		if (bt->rchan)
-			relay_close(bt->rchan);
-		kfree(bt);
-	}
+	blk_trace_free(bt);
 	return ret;
 }
 
@@ -1091,6 +1086,7 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
+	blk_tracer_enabled = true;
 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
@@ -1098,18 +1094,17 @@ static int blk_tracer_init(struct trace_array *tr)
 {
 	blk_tr = tr;
 	blk_tracer_start(tr);
-	blk_tracer_enabled = true;
 	return 0;
 }
 
 static void blk_tracer_stop(struct trace_array *tr)
 {
+	blk_tracer_enabled = false;
 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
 {
-	blk_tracer_enabled = false;
 	blk_tracer_stop(tr);
 }
 
@@ -1250,7 +1245,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
 
-	kfree(bt);
+	blk_trace_free(bt);
 	return 0;
 }
 
-- 
cgit v1.1


From b6a4b0c3ad4c09c1d37b1040ac8e3ebd1016e10b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:21:23 +0800
Subject: blktrace: extract duplidate code

Impact: cleanup

blk_trace_event_print() and blk_tracer_print_line() share most of the code.

   text    data     bss     dec     hex filename
   8605     393      12    9010    2332 kernel/trace/blktrace.o.orig
   text    data     bss     dec     hex filename
   8555     393      12    8960    2300 kernel/trace/blktrace.o

This patch also prepares for the next patch, that prints out BLK_TN_MESSAGE.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 62 ++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2f21d77..c103b0c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -986,29 +986,31 @@ static void get_pdu_remap(const struct trace_entry *ent,
 	r->sector = be64_to_cpu(sector);
 }
 
-static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
+typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+
+static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 {
 	char rwbs[6];
 	unsigned long long ts  = iter->ts;
 	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
 	unsigned secs	       = (unsigned long)ts;
-	const struct trace_entry *ent = iter->ent;
-	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
+	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
 
 	fill_rwbs(rwbs, t);
 
 	return trace_seq_printf(&iter->seq,
 				"%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
 				MAJOR(t->device), MINOR(t->device), iter->cpu,
-				secs, nsec_rem, ent->pid, act, rwbs);
+				secs, nsec_rem, iter->ent->pid, act, rwbs);
 }
 
-static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
-			      const char *act)
+static int blk_log_action(struct trace_iterator *iter, const char *act)
 {
 	char rwbs[6];
+	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+
 	fill_rwbs(rwbs, t);
-	return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
+	return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
@@ -1129,22 +1131,25 @@ static const struct {
 	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
 };
 
-static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
-					       int flags)
+static enum print_line_t print_one_line(struct trace_iterator *iter,
+					bool classic)
 {
 	struct trace_seq *s = &iter->seq;
-	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
-	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	const struct blk_io_trace *t;
+	u16 what;
 	int ret;
+	bool long_act;
+	blk_log_action_t *log_action;
 
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
+	t	   = te_blk_io_trace(iter->ent);
+	what	   = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	long_act   = !!(trace_flags & TRACE_ITER_VERBOSE);
+	log_action = classic ? &blk_log_action_classic : &blk_log_action;
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
 	else {
-		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
+		ret = log_action(iter, what2act[what].act[long_act]);
 		if (ret)
 			ret = what2act[what].print(s, iter->ent);
 	}
@@ -1152,6 +1157,15 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 }
 
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+					       int flags)
+{
+	if (!trace_print_context(iter))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return print_one_line(iter, false);
+}
+
 static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1177,26 +1191,10 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
 
 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 {
-	const struct blk_io_trace *t;
-	u16 what;
-	int ret;
-
 	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
 		return TRACE_TYPE_UNHANDLED;
 
-	t = (const struct blk_io_trace *)iter->ent;
-	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
-
-	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
-	else {
-		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-		ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
-		if (ret)
-			ret = what2act[what].print(&iter->seq, iter->ent);
-	}
-
-	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+	return print_one_line(iter, true);
 }
 
 static struct tracer blk_tracer __read_mostly = {
-- 
cgit v1.1


From 18cea4591a98817697017bcb056a848bae1205df Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:21:54 +0800
Subject: blktrace: print out BLK_TN_MESSAGE properly

Impact: improve ftrace plugin output

Before this patch:

 # cat trace
         make-5383  [001]   741.240059:   8,7    P   N [make]
 __trace_note_message: cfq1074

 # echo 1 > options/blk_classic
 # cat trace
   8,7    1     0.692221252     0  C   W 130411392 + 1024 [0]
 Bad pc action 6361
 Bad pc action 283d

 # echo 0 > options/blk_classic
 # echo bin > trace_options
 # cat trace_pipe | blkparse -i -
 (can't parse messages generated by blk_add_trace_msg())

After this patch:
 # cat trace
      <idle>-0     [001]   187.600933:   8,7    C   W 145220224 + 8 [0]
      <idle>-0     [001]   187.600946:   8,7    m   N cfq1076 complete

 # echo 1 > options/blk_classic
 # cat trace
   8,7    1     0.256378996   238  I   W 113190728 + 8 [pdflush]
   8,7    1     0.256378998   238  m   N cfq1076 insert_request

 # echo 0 > options/blk_classic
 # echo bin > trace_options
 # cat trace_pipe | blkparse -i -
  8,7    1        0    22.973250293     0  C   W 102770576 + 8 [0]
  8,7    1        0    22.973259213     0  m   N cfq1076 complete

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 80 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c103b0c..947c5b3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -59,22 +59,39 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 		       const void *data, size_t len)
 {
 	struct blk_io_trace *t;
+	struct ring_buffer_event *event = NULL;
+	int pc = 0;
+	int cpu = smp_processor_id();
+	bool blk_tracer = blk_tracer_enabled;
+
+	if (blk_tracer) {
+		pc = preempt_count();
+		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+						  sizeof(*t) + len,
+						  0, pc);
+		if (!event)
+			return;
+		t = ring_buffer_event_data(event);
+		goto record_it;
+	}
 
 	if (!bt->rchan)
 		return;
 
 	t = relay_reserve(bt->rchan, sizeof(*t) + len);
 	if (t) {
-		const int cpu = smp_processor_id();
-
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 		t->time = ktime_to_ns(ktime_get());
+record_it:
 		t->device = bt->dev;
 		t->action = action;
 		t->pid = pid;
 		t->cpu = cpu;
 		t->pdu_len = len;
 		memcpy((void *) t + sizeof(*t), data, len);
+
+		if (blk_tracer)
+			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 	}
 }
 
@@ -110,14 +127,8 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	unsigned long flags;
 	char *buf;
 
-	if (blk_tracer_enabled) {
-		va_start(args, fmt);
-		ftrace_vprintk(fmt, args);
-		va_end(args);
-		return;
-	}
-
-	if (!bt->msg_data)
+	if (unlikely(bt->trace_state != Blktrace_running &&
+		     !blk_tracer_enabled))
 		return;
 
 	local_irq_save(flags);
@@ -168,9 +179,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	unsigned long *sequence;
 	pid_t pid;
 	int cpu, pc = 0;
+	bool blk_tracer = blk_tracer_enabled;
 
-	if (unlikely(bt->trace_state != Blktrace_running &&
-		     !blk_tracer_enabled))
+	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
 
 	what |= ddir_act[rw & WRITE];
@@ -185,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 	cpu = raw_smp_processor_id();
 
-	if (blk_tracer_enabled) {
+	if (blk_tracer) {
 		tracing_record_cmdline(current);
 
 		pc = preempt_count();
@@ -235,7 +246,7 @@ record_it:
 		if (pdu_len)
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 
-		if (blk_tracer_enabled) {
+		if (blk_tracer) {
 			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 			return;
 		}
@@ -922,6 +933,11 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 	int i = 0;
 	int tc = t->action >> BLK_TC_SHIFT;
 
+	if (t->action == BLK_TN_MESSAGE) {
+		rwbs[i++] = 'N';
+		goto out;
+	}
+
 	if (tc & BLK_TC_DISCARD)
 		rwbs[i++] = 'D';
 	else if (tc & BLK_TC_WRITE)
@@ -939,7 +955,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 		rwbs[i++] = 'S';
 	if (tc & BLK_TC_META)
 		rwbs[i++] = 'M';
-
+out:
 	rwbs[i] = '\0';
 }
 
@@ -1074,6 +1090,17 @@ static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
 				get_pdu_int(ent), cmd);
 }
 
+static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+{
+	int ret;
+	const struct blk_io_trace *t = te_blk_io_trace(ent);
+
+	ret = trace_seq_putmem(s, t + 1, t->pdu_len);
+	if (ret)
+		return trace_seq_putc(s, '\n');
+	return ret;
+}
+
 /*
  * struct tracer operations
  */
@@ -1146,6 +1173,13 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	long_act   = !!(trace_flags & TRACE_ITER_VERBOSE);
 	log_action = classic ? &blk_log_action_classic : &blk_log_action;
 
+	if (t->action == BLK_TN_MESSAGE) {
+		ret = log_action(iter, long_act ? "message" : "m");
+		if (ret)
+			ret = blk_log_msg(s, iter->ent);
+		goto out;
+	}
+
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
 	else {
@@ -1153,7 +1187,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 		if (ret)
 			ret = what2act[what].print(s, iter->ent);
 	}
-
+out:
 	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 }
 
@@ -1253,11 +1287,16 @@ static int blk_trace_remove_queue(struct request_queue *q)
 static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
+	int ret = -ENOMEM;
 
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
 		return -ENOMEM;
 
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
+	if (!bt->msg_data)
+		goto free_bt;
+
 	bt->dev = dev;
 	bt->act_mask = (u16)-1;
 	bt->end_lba = -1ULL;
@@ -1265,14 +1304,17 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
 		(void)xchg(&q->blk_trace, old_bt);
-		kfree(bt);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto free_bt;
 	}
 
 	if (atomic_inc_return(&blk_probes_ref) == 1)
 		blk_register_tracepoints();
-
 	return 0;
+
+free_bt:
+	blk_trace_free(bt);
+	return ret;
 }
 
 /*
-- 
cgit v1.1


From bdd6df6af98ce7e70702edfb5fd5dbbd8d1b0453 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:22 +0200
Subject: tracing: provide trace_seq_reserve()

trace_seq_reserve() allows a caller to reserve space in a trace_seq and
write directly into it. This makes it easier to export binary data to
userspace via the tracing interface, by simply filling in a struct.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 13 +++++++++++++
 kernel/trace/trace_output.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 19261fd..6595074 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -167,6 +167,19 @@ int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 	return trace_seq_putmem(s, hex, j);
 }
 
+void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+	void *ret;
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return NULL;
+
+	ret = s->buffer + s->len;
+	s->len += len;
+
+	return ret;
+}
+
 int trace_seq_path(struct trace_seq *s, struct path *path)
 {
 	unsigned char *p;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 35c422f..0ae20b8 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -33,6 +33,7 @@ int trace_seq_puts(struct trace_seq *s, const char *str);
 int trace_seq_putc(struct trace_seq *s, unsigned char c);
 int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
 int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+void *trace_seq_reserve(struct trace_seq *s, size_t len);
 int trace_seq_path(struct trace_seq *s, struct path *path);
 int seq_print_userip_objs(const struct userstack_entry *entry,
 			  struct trace_seq *s, unsigned long sym_flags);
-- 
cgit v1.1


From f285901bb21355bb47106658ef14eeb6b8ed538f Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:23 +0200
Subject: tracing: add missing 'extern' keywords to trace_output.h

Impact: cleanup

Many declarations within trace_output.h are missing the 'extern' keyword
in an inconsistent manner. This adds 'extern' where it should be.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 0ae20b8..46fb961 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -29,25 +29,26 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
-int trace_seq_puts(struct trace_seq *s, const char *str);
-int trace_seq_putc(struct trace_seq *s, unsigned char c);
-int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
-int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
-void *trace_seq_reserve(struct trace_seq *s, size_t len);
-int trace_seq_path(struct trace_seq *s, struct path *path);
-int seq_print_userip_objs(const struct userstack_entry *entry,
-			  struct trace_seq *s, unsigned long sym_flags);
-int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-		      unsigned long ip, unsigned long sym_flags);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+extern int seq_print_userip_objs(const struct userstack_entry *entry,
+				 struct trace_seq *s, unsigned long sym_flags);
+extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+			     unsigned long ip, unsigned long sym_flags);
 
-int trace_print_context(struct trace_iterator *iter);
-int trace_print_lat_context(struct trace_iterator *iter);
+extern int trace_print_context(struct trace_iterator *iter);
+extern int trace_print_lat_context(struct trace_iterator *iter);
 
-struct trace_event *ftrace_find_event(int type);
-int register_ftrace_event(struct trace_event *event);
-int unregister_ftrace_event(struct trace_event *event);
+extern struct trace_event *ftrace_find_event(int type);
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
 
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags);
+extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
+					 int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.1


From b14b70a6a4e394c9630bcde17e07d3bcdcbca27e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:21:00 +0800
Subject: trace: make argument 'mem' of trace_seq_putmem() const

Impact: fix build warning

I passed a const value to trace_seq_putmem(), and I got compile warning.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 6 +++---
 kernel/trace/trace_output.h | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6595074..d72b9a6 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -137,7 +137,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 	return 1;
 }
 
-int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
 	if (len > ((PAGE_SIZE - 1) - s->len))
 		return 0;
@@ -148,10 +148,10 @@ int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 	return len;
 }
 
-int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
 {
 	unsigned char hex[HEX_CHARS];
-	unsigned char *data = mem;
+	const unsigned char *data = mem;
 	int i, j;
 
 #ifdef __BIG_ENDIAN
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 46fb961..e0bde39 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -31,8 +31,9 @@ extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
 extern int trace_seq_puts(struct trace_seq *s, const char *str);
 extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				size_t len);
 extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
 extern int trace_seq_path(struct trace_seq *s, struct path *path);
 extern int seq_print_userip_objs(const struct userstack_entry *entry,
-- 
cgit v1.1


From a18b83b7ef3c98cd8b4bb885e4a649a8f30fb7b0 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Mon, 23 Mar 2009 10:02:53 +0530
Subject: cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when
 rcupreempt is used -v2

Impact: fix cgroups race under rcu-preempt

cpuacct_charge() obtains task's ca and does a hierarchy walk upwards.
This can race with the task's movement between cgroups. This race
can cause an access to freed ca pointer in cpuacct_charge() or access
to invalid cgroups pointer of the task. This will not happen with rcu or
tree rcu as cpuacct_charge() is called with preemption disabled. However if
rcupreempt is used, the race is seen. Thanks to Li Zefan for explaining this.

Fix this race by explicitly protecting ca and the hierarchy walk with
rcu_read_lock().

Changes for v2:

 - Update patch descrition (as per Li Zefan's review comments).

 - Remove comments in cpuacct_charge() which explained why rcu_read_lock()
   was needed (as per Peter Zijlstra's review comments).

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 186c6fd..cc397aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9654,12 +9654,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 		return;
 
 	cpu = task_cpu(tsk);
+
+	rcu_read_lock();
+
 	ca = task_ca(tsk);
 
 	for (; ca; ca = ca->parent) {
 		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
+
+	rcu_read_unlock();
 }
 
 struct cgroup_subsys cpuacct_subsys = {
-- 
cgit v1.1


From 11d06b2a1e5658f448a308aa3beb97bacd64a940 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Mar 2009 05:45:36 -0400
Subject: Kill unsharing fs_struct in __set_personality()

That's a rudiment of altroot support.  I.e. it should've been buried
a long time ago.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exec_domain.c | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 667c841..cb8e962 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -145,28 +145,6 @@ __set_personality(u_long personality)
 		return 0;
 	}
 
-	if (atomic_read(&current->fs->count) != 1) {
-		struct fs_struct *fsp, *ofsp;
-
-		fsp = copy_fs_struct(current->fs);
-		if (fsp == NULL) {
-			module_put(ep->module);
-			return -ENOMEM;
-		}
-
-		task_lock(current);
-		ofsp = current->fs;
-		current->fs = fsp;
-		task_unlock(current);
-
-		put_fs_struct(ofsp);
-	}
-
-	/*
-	 * At that point we are guaranteed to be the sole owner of
-	 * current->fs.
-	 */
-
 	current->personality = personality;
 	oep = current_thread_info()->exec_domain;
 	current_thread_info()->exec_domain = ep;
-- 
cgit v1.1


From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Mar 2009 19:00:13 -0400
Subject: Take fs_struct handling to new file (fs/fs_struct.c)

Pure code move; two new helper functions for nfsd and daemonize
(unshare_fs_struct() and daemonize_fs_struct() resp.; for now -
the same code as used to be in callers).  unshare_fs_struct()
exported (for nfsd, as copy_fs_struct()/exit_fs() used to be),
copy_fs_struct() and exit_fs() don't need exports anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 31 +------------------------------
 kernel/fork.c | 29 ++---------------------------
 2 files changed, 3 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3..ad83757 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -429,7 +429,6 @@ EXPORT_SYMBOL(disallow_signal);
 void daemonize(const char *name, ...)
 {
 	va_list args;
-	struct fs_struct *fs;
 	sigset_t blocked;
 
 	va_start(args, name);
@@ -462,11 +461,7 @@ void daemonize(const char *name, ...)
 
 	/* Become as one with the init task */
 
-	exit_fs(current);	/* current->fs->count--; */
-	fs = init_task.fs;
-	current->fs = fs;
-	atomic_inc(&fs->count);
-
+	daemonize_fs_struct();
 	exit_files(current);
 	current->files = init_task.files;
 	atomic_inc(&current->files->count);
@@ -565,30 +560,6 @@ void exit_files(struct task_struct *tsk)
 	}
 }
 
-void put_fs_struct(struct fs_struct *fs)
-{
-	/* No need to hold fs->lock if we are killing it */
-	if (atomic_dec_and_test(&fs->count)) {
-		path_put(&fs->root);
-		path_put(&fs->pwd);
-		kmem_cache_free(fs_cachep, fs);
-	}
-}
-
-void exit_fs(struct task_struct *tsk)
-{
-	struct fs_struct * fs = tsk->fs;
-
-	if (fs) {
-		task_lock(tsk);
-		tsk->fs = NULL;
-		task_unlock(tsk);
-		put_fs_struct(fs);
-	}
-}
-
-EXPORT_SYMBOL_GPL(exit_fs);
-
 #ifdef CONFIG_MM_OWNER
 /*
  * Task p is exiting and it owned mm, lets find a new owner for it
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c1584..05c02dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -681,38 +681,13 @@ fail_nomem:
 	return retval;
 }
 
-static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
-{
-	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
-	/* We don't need to lock fs - think why ;-) */
-	if (fs) {
-		atomic_set(&fs->count, 1);
-		rwlock_init(&fs->lock);
-		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
-	}
-	return fs;
-}
-
-struct fs_struct *copy_fs_struct(struct fs_struct *old)
-{
-	return __copy_fs_struct(old);
-}
-
-EXPORT_SYMBOL_GPL(copy_fs_struct);
-
 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 {
 	if (clone_flags & CLONE_FS) {
 		atomic_inc(&current->fs->count);
 		return 0;
 	}
-	tsk->fs = __copy_fs_struct(current->fs);
+	tsk->fs = copy_fs_struct(current->fs);
 	if (!tsk->fs)
 		return -ENOMEM;
 	return 0;
@@ -1545,7 +1520,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 
 	if ((unshare_flags & CLONE_FS) &&
 	    (fs && atomic_read(&fs->count) > 1)) {
-		*new_fsp = __copy_fs_struct(current->fs);
+		*new_fsp = copy_fs_struct(current->fs);
 		if (!*new_fsp)
 			return -ENOMEM;
 	}
-- 
cgit v1.1


From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Mar 2009 07:20:30 -0400
Subject: New locking/refcounting for fs_struct

* all changes of current->fs are done under task_lock and write_lock of
  old fs->lock
* refcount is not atomic anymore (same protection)
* its decrements are done when removing reference from current; at the
  same time we decide whether to free it.
* put_fs_struct() is gone
* new field - ->in_exec.  Set by check_unsafe_exec() if we are trying to do
  execve() and only subthreads share fs_struct.  Cleared when finishing exec
  (success and failure alike).  Makes CLONE_FS fail with -EAGAIN if set.
* check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread
  is in progress.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/fork.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 05c02dc..51f138a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -683,11 +683,19 @@ fail_nomem:
 
 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 {
+	struct fs_struct *fs = current->fs;
 	if (clone_flags & CLONE_FS) {
-		atomic_inc(&current->fs->count);
+		/* tsk->fs is already what we want */
+		write_lock(&fs->lock);
+		if (fs->in_exec) {
+			write_unlock(&fs->lock);
+			return -EAGAIN;
+		}
+		fs->users++;
+		write_unlock(&fs->lock);
 		return 0;
 	}
-	tsk->fs = copy_fs_struct(current->fs);
+	tsk->fs = copy_fs_struct(fs);
 	if (!tsk->fs)
 		return -ENOMEM;
 	return 0;
@@ -1518,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 {
 	struct fs_struct *fs = current->fs;
 
-	if ((unshare_flags & CLONE_FS) &&
-	    (fs && atomic_read(&fs->count) > 1)) {
-		*new_fsp = copy_fs_struct(current->fs);
-		if (!*new_fsp)
-			return -ENOMEM;
-	}
+	if (!(unshare_flags & CLONE_FS) || !fs)
+		return 0;
+
+	/* don't need lock here; in the worst case we'll do useless copy */
+	if (fs->users == 1)
+		return 0;
+
+	*new_fsp = copy_fs_struct(fs);
+	if (!*new_fsp)
+		return -ENOMEM;
 
 	return 0;
 }
@@ -1639,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 
 		if (new_fs) {
 			fs = current->fs;
+			write_lock(&fs->lock);
 			current->fs = new_fs;
-			new_fs = fs;
+			if (--fs->users)
+				new_fs = NULL;
+			else
+				new_fs = fs;
+			write_unlock(&fs->lock);
 		}
 
 		if (new_mm) {
@@ -1679,7 +1696,7 @@ bad_unshare_cleanup_sigh:
 
 bad_unshare_cleanup_fs:
 	if (new_fs)
-		put_fs_struct(new_fs);
+		free_fs_struct(new_fs);
 
 bad_unshare_cleanup_thread:
 bad_unshare_out:
-- 
cgit v1.1


From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Mar 2009 19:50:06 -0400
Subject: Get rid of indirect include of fs_struct.h

Don't pull it in sched.h; very few files actually need it and those
can include directly.  sched.h itself only needs forward declaration
of struct fs_struct;

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c     | 1 +
 kernel/exec_domain.c | 1 +
 kernel/exit.c        | 1 +
 kernel/fork.c        | 1 +
 kernel/sys.c         | 1 +
 5 files changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff..2bfc647 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
 #include <linux/syscalls.h>
 #include <linux/inotify.h>
 #include <linux/capability.h>
+#include <linux/fs_struct.h>
 
 #include "audit.h"
 
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index cb8e962..c35452c 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
 #include <linux/syscalls.h>
 #include <linux/sysctl.h>
 #include <linux/types.h>
+#include <linux/fs_struct.h>
 
 
 static void default_handler(int, struct pt_regs *);
diff --git a/kernel/exit.c b/kernel/exit.c
index ad83757..b5d6568 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
+#include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <trace/sched.h>
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 51f138a..e82a145 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -60,6 +60,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/fs_struct.h>
 #include <trace/sched.h>
 #include <linux/magic.h>
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e..ce182aa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
 #include <linux/ptrace.h>
+#include <linux/fs_struct.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
-- 
cgit v1.1


From 13b8bd0a5713bdf05659019badd7c0407984ece1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 25 Mar 2009 15:01:22 +1030
Subject: sched_rt: don't allocate cpumask in fastpath

Impact: cleanup

As pointed out by Steven Rostedt.  Since the arg in question is
unused, we simply change cpupri_find() to accept NULL.

Reported-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <200903251501.22664.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_cpupri.c |  5 +++--
 kernel/sched_rt.c     | 15 ++++-----------
 2 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfa..cdd3c89 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
  * cpupri_find - find the best (lowest-pri) CPU in the system
  * @cp: The cpupri context
  * @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
  *
  * Note: This function returns the recommended CPUs as calculated during the
  * current invokation.  By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
 
-		cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+		if (lowest_mask)
+			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
 		return 1;
 	}
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bac1061..fbec5a5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -805,20 +805,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-	cpumask_var_t mask;
-
 	if (rq->curr->rt.nr_cpus_allowed == 1)
 		return;
 
-	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
-		return;
-
 	if (p->rt.nr_cpus_allowed != 1
-	    && cpupri_find(&rq->rd->cpupri, p, mask))
-		goto free;
+	    && cpupri_find(&rq->rd->cpupri, p, NULL))
+		return;
 
-	if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
-		goto free;
+	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+		return;
 
 	/*
 	 * There appears to be other cpus that can accept
@@ -827,8 +822,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 */
 	requeue_task_rt(rq, p, 1);
 	resched_task(rq->curr);
-free:
-	free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.1


From 2aad1b76e6b0cc5a2e5d9b95a9f356ddddbfa8a9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 30 Mar 2009 11:11:28 -0400
Subject: function-graph: allow unregistering twice

Impact: fix to permanent disabling of function graph tracer

There should be nothing to prevent a tracer from unregistering a
function graph callback more than once. This can simplify error paths.

But currently, the counter does not account for mulitple unregistering
of the function graph callback. If it happens, the function graph
tracer will be permanently disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1752a63..f1ed080 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2719,6 +2719,9 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_lock);
 
+	if (!unlikely(atomic_read(&ftrace_graph_active)))
+		goto out;
+
 	atomic_dec(&ftrace_graph_active);
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
@@ -2726,6 +2729,7 @@ void unregister_ftrace_graph(void)
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 
+ out:
 	mutex_unlock(&ftrace_lock);
 }
 
-- 
cgit v1.1


From 2e572895bf3203e881356a4039ab0fa428ed2639 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 30 Mar 2009 14:03:19 -0400
Subject: ring-buffer: do not remove reader page from list on ring buffer free

Impact: prevent possible memory leak

The reader page of the ring buffer is special. Although it points
into the ring buffer, it is not part of the actual buffer. It is
a page used by the reader to swap with a page in the ring buffer.
Once the swap is made, the new reader page is again outside the
buffer.

Even though the reader page points into the buffer, it is really
pointing to residual data. Note, this data is used by the reader.

              reader page
                  |
                  v
       (prev)   +---+    (next)
     +----------|   |----------+
     |          +---+          |
     v                         v
   +---+        +---+        +---+
-->|   |------->|   |------->|   |--->
<--|   |<-------|   |<-------|   |<---
   +---+        +---+        +---+

     ^            ^            ^
      \           |            /
       ------- Buffer---------

If we perform a list_del_init() on the reader page we will actually remove
the last page the reader swapped with and not the reader page itself.
This will cause that page to not be freed, and thus is a memory leak.

Luckily, the only user of the ring buffer so far is ftrace. And ftrace
will not free its ring buffer after it allocates it. There is no current
possible memory leak. But once there are other users, or if ftrace
dynamically creates and frees its ring buffer, then this would be a
memory leak.

This patch fixes the leak for future cases.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edce2ff..960cbf4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -563,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	struct list_head *head = &cpu_buffer->pages;
 	struct buffer_page *bpage, *tmp;
 
-	list_del_init(&cpu_buffer->reader_page->list);
 	free_buffer_page(cpu_buffer->reader_page);
 
 	list_for_each_entry_safe(bpage, tmp, head, list) {
-- 
cgit v1.1


From c5f8d99585d7b5b7e857fabf8aefd0174903a98c Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 16:56:03 +0900
Subject: posixtimers, sched: Fix posix clock monotonicity

Impact: Regression fix (against clock_gettime() backwarding bug)

This patch re-introduces a couple of functions, task_sched_runtime
and thread_group_sched_runtime, which was once removed at the
time of 2.6.28-rc1.

These functions protect the sampling of thread/process clock with
rq lock.  This rq lock is required not to update rq->clock during
the sampling.

i.e.
  The clock_gettime() may return
   ((accounted runtime before update) + (delta after update))
  that is less than what it should be.

v2 -> v3:
	- Rename static helper function __task_delta_exec()
	  to do_task_delta_exec() since -tip tree already has
	  a __task_delta_exec() of different version.

v1 -> v2:
	- Revises comments of function and patch description.
	- Add note about accuracy of thread group's runtime.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org	[2.6.28.x][2.6.29.x]
LKML-Reference: <49D1CC93.4080401@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c |  7 ++---
 kernel/sched.c            | 65 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 61 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da9..4318c30 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = task_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -240,18 +240,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 {
 	struct task_cputime cputime;
 
-	thread_group_cputime(p, &cputime);
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
+		thread_group_cputime(p, &cputime);
 		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
+		thread_group_cputime(p, &cputime);
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = thread_group_sched_runtime(p);
 		break;
 	}
 	return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index cc397aa..c8d7f17 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4139,9 +4139,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
  * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
  */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	u64 ns = 0;
+
+	if (task_current(rq, p)) {
+		update_rq_clock(rq);
+		ns = rq->clock - p->se.exec_start;
+		if ((s64)ns < 0)
+			ns = 0;
+	}
+
+	return ns;
+}
+
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
@@ -4149,16 +4165,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
 	u64 ns = 0;
 
 	rq = task_rq_lock(p, &flags);
+	ns = do_task_delta_exec(p, rq);
+	task_rq_unlock(rq, &flags);
 
-	if (task_current(rq, p)) {
-		u64 delta_exec;
+	return ns;
+}
 
-		update_rq_clock(rq);
-		delta_exec = rq->clock - p->se.exec_start;
-		if ((s64)delta_exec > 0)
-			ns = delta_exec;
-	}
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns = 0;
+
+	rq = task_rq_lock(p, &flags);
+	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+	task_rq_unlock(rq, &flags);
+
+	return ns;
+}
 
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that have not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value not includes other pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+	struct task_cputime totals;
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+	rq = task_rq_lock(p, &flags);
+	thread_group_cputime(p, &totals);
+	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
 	task_rq_unlock(rq, &flags);
 
 	return ns;
-- 
cgit v1.1


From ef12fefabf94b6a902ad3abd3eb124b00560c445 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Tue, 31 Mar 2009 10:02:22 +0530
Subject: cpuacct: add per-cgroup utime/stime statistics

Add per-cgroup cpuacct controller statistics like the system and user
time consumed by the group of tasks.

Changelog:

v7
- Changed the name of the statistic from utime to user and from stime to
  system so that in future we could easily add other statistics like irq,
  softirq, steal times etc easily.

v6
- Fixed a bug in the error path of cpuacct_create() (pointed by Li Zefan).

v5
- In cpuacct_stats_show(), use cputime64_to_clock_t() since we are
  operating on a 64bit variable here.

v4
- Remove comments in cpuacct_update_stats() which explained why rcu_read_lock()
  was needed (as per Peter Zijlstra's review comments).
- Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt
  as per KAMEZAWA Hiroyuki's review comments.

v3
- Fix a small race in the cpuacct hierarchy walk.

v2
- stime and utime now exported in clock_t units instead of msecs.
- Addressed the code review comments from Balbir and Li Zefan.
- Moved to -tip tree.

v1
- Moved the stime/utime accounting to cpuacct controller.

Earlier versions
- http://lkml.org/lkml/2009/2/25/129

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Balaji Rao <balajirrao@gmail.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
LKML-Reference: <20090331043222.GA4093@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 81 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c8d7f17..8d1bdbe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		   struct rq_iterator *iterator);
 #endif
 
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+	CPUACCT_STAT_USER,	/* ... user mode */
+	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
+
+	CPUACCT_STAT_NSTATS,
+};
+
 #ifdef CONFIG_CGROUP_CPUACCT
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4236,6 +4248,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+
+	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
 	/* Account for user time used */
 	acct_update_integrals(p);
 }
@@ -4297,6 +4311,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	else
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
@@ -9539,6 +9555,7 @@ struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 	struct cpuacct *parent;
 };
 
@@ -9563,20 +9580,32 @@ static struct cgroup_subsys_state *cpuacct_create(
 	struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	int i;
 
 	if (!ca)
-		return ERR_PTR(-ENOMEM);
+		goto out;
 
 	ca->cpuusage = alloc_percpu(u64);
-	if (!ca->cpuusage) {
-		kfree(ca);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!ca->cpuusage)
+		goto out_free_ca;
+
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+		if (percpu_counter_init(&ca->cpustat[i], 0))
+			goto out_free_counters;
 
 	if (cgrp->parent)
 		ca->parent = cgroup_ca(cgrp->parent);
 
 	return &ca->css;
+
+out_free_counters:
+	while (--i >= 0)
+		percpu_counter_destroy(&ca->cpustat[i]);
+	free_percpu(ca->cpuusage);
+out_free_ca:
+	kfree(ca);
+out:
+	return ERR_PTR(-ENOMEM);
 }
 
 /* destroy an existing cpu accounting group */
@@ -9584,7 +9613,10 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
+	int i;
 
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+		percpu_counter_destroy(&ca->cpustat[i]);
 	free_percpu(ca->cpuusage);
 	kfree(ca);
 }
@@ -9671,6 +9703,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
 	return 0;
 }
 
+static const char *cpuacct_stat_desc[] = {
+	[CPUACCT_STAT_USER] = "user",
+	[CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+		struct cgroup_map_cb *cb)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+	int i;
+
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+		s64 val = percpu_counter_read(&ca->cpustat[i]);
+		val = cputime64_to_clock_t(val);
+		cb->fill(cb, cpuacct_stat_desc[i], val);
+	}
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
@@ -9681,7 +9732,10 @@ static struct cftype files[] = {
 		.name = "usage_percpu",
 		.read_seq_string = cpuacct_percpu_seq_read,
 	},
-
+	{
+		.name = "stat",
+		.read_map = cpuacct_stats_show,
+	},
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9716,6 +9770,27 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	rcu_read_unlock();
 }
 
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val)
+{
+	struct cpuacct *ca;
+
+	if (unlikely(!cpuacct_subsys.active))
+		return;
+
+	rcu_read_lock();
+	ca = task_ca(tsk);
+
+	do {
+		percpu_counter_add(&ca->cpustat[idx], val);
+		ca = ca->parent;
+	} while (ca);
+	rcu_read_unlock();
+}
+
 struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
 	.create = cpuacct_create,
-- 
cgit v1.1


From 46e0bb9c12f4bab539736f1714cbf16600f681ec Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Mon, 30 Mar 2009 10:25:20 +0530
Subject: sched: Print sched_group::__cpu_power in sched_domain_debug

Impact: extend debug info /proc/sched_debug

If the user changes the value of the sched_mc/smt_power_savings sysfs
tunable, it'll trigger a rebuilding of the whole sched_domain tree,
with the SD_POWERSAVINGS_BALANCE flag set at certain levels.

As a result, there would be a change in the __cpu_power of sched_groups
in the sched_domain hierarchy.

Print the __cpu_power values for each sched_group in sched_domain_debug
to help verify this change and correlate it with the change in the
load-balancing behavior.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090330045520.2869.24777.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8d1bdbe..6234d10 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6963,7 +6963,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-		printk(KERN_CONT " %s", str);
+		printk(KERN_CONT " %s (__cpu_power = %d)", str,
+						group->__cpu_power);
 
 		group = group->next;
 	} while (group != sd->groups);
-- 
cgit v1.1


From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:19:31 -0700
Subject: mm: introduce for_each_populated_zone() macro

Impact: cleanup

In almost cases, for_each_zone() is used with populated_zone().  It's
because almost function doesn't need memoryless node information.
Therefore, for_each_populated_zone() can help to make code simplify.

This patch has no functional change.

[akpm@linux-foundation.org: small cleanup]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/snapshot.c |  9 +++------
 kernel/power/swsusp.c   | 17 ++++++++---------
 2 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7..33e2e4a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 
 	INIT_LIST_HEAD(list);
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long zone_start, zone_end;
 		struct mem_extent *ext, *cur, *aux;
 
-		if (!populated_zone(zone))
-			continue;
-
 		zone_start = zone->zone_start_pfn;
 		zone_end = zone->zone_start_pfn + zone->spanned_pages;
 
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
 	struct zone *zone;
 	unsigned int cnt = 0;
 
-	for_each_zone(zone)
-		if (populated_zone(zone) && is_highmem(zone))
+	for_each_populated_zone(zone)
+		if (is_highmem(zone))
 			cnt += zone_page_state(zone, NR_FREE_PAGES);
 
 	return cnt;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c914..1ee66364 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -229,17 +229,16 @@ int swsusp_shrink_memory(void)
 		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
 		tmp = size;
 		size += highmem_size;
-		for_each_zone (zone)
-			if (populated_zone(zone)) {
-				tmp += snapshot_additional_pages(zone);
-				if (is_highmem(zone)) {
-					highmem_size -=
+		for_each_populated_zone(zone) {
+			tmp += snapshot_additional_pages(zone);
+			if (is_highmem(zone)) {
+				highmem_size -=
 					zone_page_state(zone, NR_FREE_PAGES);
-				} else {
-					tmp -= zone_page_state(zone, NR_FREE_PAGES);
-					tmp += zone->lowmem_reserve[ZONE_NORMAL];
-				}
+			} else {
+				tmp -= zone_page_state(zone, NR_FREE_PAGES);
+				tmp += zone->lowmem_reserve[ZONE_NORMAL];
 			}
+		}
 
 		if (highmem_size < 0)
 			highmem_size = 0;
-- 
cgit v1.1


From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:23:18 -0700
Subject: mm: fix proc_dointvec_userhz_jiffies "breakage"

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838

On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange
way from the user's point of view:

	# echo 500 >/proc/sys/vm/dirty_writeback_centisecs
	# cat /proc/sys/vm/dirty_writeback_centisecs
	499

So, we have 5000 jiffies converted to only 499 clock ticks and reported
back.

TICK_NSEC = 999848
ACTHZ = 256039

Keeping in-kernel variable in units passed from userspace will fix issue
of course, but this probably won't be right for every sysctl.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44f..2e490a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1010,7 +1010,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_userhz_jiffies,
+		.proc_handler	= &proc_dointvec,
 	},
 	{
 		.ctl_name	= VM_NR_PDFLUSH_THREADS,
-- 
cgit v1.1


From a8af78982ff4c0b3731527b0217d286a343a3089 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 31 Mar 2009 15:23:37 -0700
Subject: pm: rework includes, remove arch ifdefs

Make the following header file changes:

 - remove arch ifdefs and asm/suspend.h from linux/suspend.h
 - add asm/suspend.h to disk.c (for arch_prepare_suspend())
 - add linux/io.h to swsusp.c (for ioremap())
 - x86 32/64 bit compile fixes

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/disk.c   | 1 +
 kernel/power/swsusp.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e886d13..f3db382 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <asm/suspend.h>
 
 #include "power.h"
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 1ee66364..78c3504 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/rbtree.h>
+#include <linux/io.h>
 
 #include "power.h"
 
-- 
cgit v1.1


From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:20 -0700
Subject: epoll keyed wakeups: add __wake_up_locked_key() and
 __wake_up_sync_key()

This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.

The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to.  So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time.  This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.

The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.

By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.

Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).

Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled.  The former will gain some
efficiency though.

As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues.  I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.

Test program available here:

http://www.xmailserver.org/epoll_test.c

This patch:

Nothing revolutionary here.  Just using the available "key" that our
wakeup core already support.  The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().

The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both.  I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 196d48b..73513f4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
 	__wake_up_common(q, mode, 1, 0, NULL);
 }
 
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+	__wake_up_common(q, mode, 1, 0, key);
+}
+
 /**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
  *
  * The sync wakeup differs that the waker knows that it will schedule
  * away soon, so while the target thread will be woken up, it will not
@@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
  *
  * On UP it can prevent extra preemption.
  */
-void
-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
 {
 	unsigned long flags;
 	int sync = 1;
@@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 		sync = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	__wake_up_common(q, mode, nr_exclusive, sync, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
 /**
-- 
cgit v1.1


From 633fe795b80693a8198e7d82f66538a72d2bbba2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 1 Apr 2009 17:47:23 -0700
Subject: timers: add missing kernel-doc

Add missing kernel-doc parameter notation and change function
name to its new name:

  Warning(kernel/timer.c:543): No description found for parameter 'name'
  Warning(kernel/timer.c:543): No description found for parameter 'key'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm <akpm@linux-foundation.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
LKML-Reference: <20090401174723.f0bea0eb.randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/timer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 9b77fc9..3af9a0b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -524,10 +524,13 @@ static void __init_timer(struct timer_list *timer)
 }
 
 /**
- * init_timer - initialize a timer.
+ * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
+ * @name: name of the timer
+ * @key: lockdep class key of the fake lock used for tracking timer
+ *       sync lock dependencies
  *
- * init_timer() must be done to a timer prior calling *any* of the
+ * init_timer_key() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
 void init_timer(struct timer_list *timer)
-- 
cgit v1.1


From cd84a42f315e50edd454c27a3da3951ccd3d735a Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 2 Apr 2009 14:19:38 -0700
Subject: futex: comment requeue key reference semantics

We've tripped over the futex_requeue drop_count refering to key2
instead of key1.  The code is actually correct, but is non-intuitive.
This patch adds an explicit comment explaining the requeue.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a02..eef8cd2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -883,7 +883,12 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 
-	/* drop_futex_key_refs() must be called outside the spinlocks. */
+	/*
+	 * drop_futex_key_refs() must be called outside the spinlocks. During
+	 * the requeue we moved futex_q's from the hash bucket at key1 to the
+	 * one at key2 and updated their key pointer.  We no longer need to
+	 * hold the references to key1.
+	 */
 	while (--drop_count >= 0)
 		drop_futex_key_refs(&key1);
 
-- 
cgit v1.1


From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Apr 2009 16:56:32 -0700
Subject: nommu: fix a number of issues with the per-MM VMA patch

Fix a number of issues with the per-MM VMA patch:

 (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on
     a NOMMU system with more than 2G pages.  Makes no difference on a 32-bit
     system.

 (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value,
     lest it overflow.

 (3) Move the allocation of the vm_area_struct slab back for fork.c.

 (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs.

 (5) Use BUG_ON() rather than if () BUG().

 (6) Make the default validate_nommu_regions() a static inline rather than a
     #define.

 (7) Make free_page_series()'s objection to pages with a refcount != 1 more
     informative.

 (8) Adjust the __put_nommu_region() banner comment to indicate that the
     semaphore must be held for writing.

 (9) Limit the number of warnings about munmaps of non-mmapped regions.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 47c1584..51d1aa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1488,6 +1488,7 @@ void __init proc_caches_init(void)
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
 	mmap_init();
 }
 
-- 
cgit v1.1


From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 2 Apr 2009 16:56:59 -0700
Subject: Simplify copy_thread()

First argument unused since 2.3.11.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 51d1aa2..d7eb727 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+	retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-- 
cgit v1.1


From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001
From: Grzegorz Nosek <root@localdomain.pl>
Date: Thu, 2 Apr 2009 16:57:23 -0700
Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild
 cgroups

The ns_proxy cgroup allows moving processes to child cgroups only one
level deep at a time.  This commit relaxes this restriction and makes it
possible to attach tasks directly to grandchild cgroups, e.g.:

($pid is in the root cgroup)
echo $pid > /cgroup/CG1/CG2/tasks

Previously this operation would fail with -EPERM and would have to be
performed as two steps:
echo $pid > /cgroup/CG1/tasks
echo $pid > /cgroup/CG1/CG2/tasks

Also, the target cgroup no longer needs to be empty to move a task there.

Signed-off-by: Grzegorz Nosek <root@localdomain.pl>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c    | 11 ++++++-----
 kernel/ns_cgroup.c | 14 ++++----------
 2 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c500ca7..27792bc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3084,18 +3084,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 }
 
 /**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
+ * @task: the task in question
  *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
  *
  * If we are sending in dummytop, then presumably we are creating
  * the top cgroup in the subsystem.
  *
  * Called only by the ns (nsproxy) cgroup.
  */
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
 	int ret;
 	struct cgroup *target;
@@ -3105,7 +3106,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
 		return 1;
 
 	get_first_subsys(cgrp, NULL, &subsys_id);
-	target = task_cgroup(current, subsys_id);
+	target = task_cgroup(task, subsys_id);
 	while (cgrp != target && cgrp!= cgrp->top_cgroup)
 		cgrp = cgrp->parent;
 	ret = (cgrp == target);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fd..5aa854f 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 
 /*
  * Rules:
- *   1. you can only enter a cgroup which is a child of your current
+ *   1. you can only enter a cgroup which is a descendant of your current
  *     cgroup
  *   2. you can only place another process into a cgroup if
  *     a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 static int ns_can_attach(struct cgroup_subsys *ss,
 		struct cgroup *new_cgroup, struct task_struct *task)
 {
-	struct cgroup *orig;
-
 	if (current != task) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
-		if (!cgroup_is_descendant(new_cgroup))
+		if (!cgroup_is_descendant(new_cgroup, current))
 			return -EPERM;
 	}
 
-	if (atomic_read(&new_cgroup->count) != 0)
-		return -EPERM;
-
-	orig = task_cgroup(task, ns_subsys_id);
-	if (orig && orig != new_cgroup->parent)
+	if (!cgroup_is_descendant(new_cgroup, task))
 		return -EPERM;
 
 	return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
-	if (!cgroup_is_descendant(cgroup))
+	if (!cgroup_is_descendant(cgroup, current))
 		return ERR_PTR(-EPERM);
 
 	ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
-- 
cgit v1.1


From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:25 -0700
Subject: cgroup: CSS ID support

Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code.

This patch attaches unique ID to each css and provides following.

 - css_lookup(subsys, id)
   returns pointer to struct cgroup_subysys_state of id.
 - css_get_next(subsys, id, rootid, depth, foundid)
   returns the next css under "root" by scanning

When cgroup_subsys->use_id is set, an id for css is maintained.

The cgroup framework only parepares
	- css_id of root css for subsys
	- id is automatically attached at creation of css.
	- id is *not* freed automatically. Because the cgroup framework
	  don't know lifetime of cgroup_subsys_state.
	  free_css_id() function is provided. This must be called by subsys.

There are several reasons to develop this.
	- Saving space .... For example, memcg's swap_cgroup is array of
	  pointers to cgroup. But it is not necessary to be very fast.
	  By replacing pointers(8bytes per ent) to ID (2byes per ent), we can
	  reduce much amount of memory usage.

	- Scanning without lock.
	  CSS_ID provides "scan id under this ROOT" function. By this, scanning
	  css under root can be written without locks.
	  ex)
	  do {
		rcu_read_lock();
		next = cgroup_get_next(subsys, id, root, &found);
		/* check sanity of next here */
		css_tryget();
		rcu_read_unlock();
		id = found + 1
	 } while(...)

Characteristics:
	- Each css has unique ID under subsys.
	- Lifetime of ID is controlled by subsys.
	- css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy
	- Allowed ID is 1-65535, ID 0 is UNUSED ID.

Design Choices:
	- scan-by-ID v.s. scan-by-tree-walk.
	  As /proc's pid scan does, scan-by-ID is robust when scanning is done
	  by following kind of routine.
	  scan -> rest a while(release a lock) -> conitunue from interrupted
	  memcg's hierarchical reclaim does this.

	- When subsys->use_id is set, # of css in the system is limited to
	  65535.

[bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 285 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 27792bc..d3c5211 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
 	char release_agent_path[PATH_MAX];
 };
 
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX	(65535)
+struct css_id {
+	/*
+	 * The css to which this ID points. This pointer is set to valid value
+	 * after cgroup is populated. If cgroup is removed, this will be NULL.
+	 * This pointer is expected to be RCU-safe because destroy()
+	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
+	 * css_tryget() should be used for avoiding race.
+	 */
+	struct cgroup_subsys_state *css;
+	/*
+	 * ID of this css.
+	 */
+	unsigned short id;
+	/*
+	 * Depth in hierarchy which this ID belongs to.
+	 */
+	unsigned short depth;
+	/*
+	 * ID is freed by RCU. (and lookup routine is RCU safe.)
+	 */
+	struct rcu_head rcu_head;
+	/*
+	 * Hierarchy of CSS ID belongs to.
+	 */
+	unsigned short stack[0]; /* Array of Length (depth+1) */
+};
+
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set.  Nests outside task->alloc_lock
  * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
+static int alloc_css_id(struct cgroup_subsys *ss,
+			struct cgroup *parent, struct cgroup *child);
+
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
 	}
+	/* This cgroup is ready now */
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+		/*
+		 * Update id->css pointer and make this css visible from
+		 * CSS ID functions. This pointer will be dereferened
+		 * from RCU-read-side without locks.
+		 */
+		if (css->id)
+			rcu_assign_pointer(css->id->css, css);
+	}
 
 	return 0;
 }
@@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->cgroup = cgrp;
 	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
+	css->id = NULL;
 	if (cgrp == dummytop)
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
+		if (ss->use_id)
+			if (alloc_css_id(ss, parent, cgrp))
+				goto err_destroy;
+		/* At error, ->destroy() callback has to free assigned ID. */
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -2708,6 +2761,8 @@ int __init cgroup_init(void)
 		struct cgroup_subsys *ss = subsys[i];
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
+		if (ss->use_id)
+			cgroup_subsys_init_idr(ss);
 	}
 
 	/* Add init_css_set to the hash table */
@@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str)
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * Functons for CSS ID.
+ */
+
+/*
+ *To get ID other than 0, this should be called when !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->id;
+	return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->depth;
+	return 0;
+}
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+		    struct cgroup_subsys_state *root)
+{
+	struct css_id *child_id = rcu_dereference(child->id);
+	struct css_id *root_id = rcu_dereference(root->id);
+
+	if (!child_id || !root_id || (child_id->depth < root_id->depth))
+		return false;
+	return child_id->stack[root_id->depth] == root_id->id;
+}
+
+static void __free_css_id_cb(struct rcu_head *head)
+{
+	struct css_id *id;
+
+	id = container_of(head, struct css_id, rcu_head);
+	kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+	struct css_id *id = css->id;
+	/* When this is called before css_id initialization, id can be NULL */
+	if (!id)
+		return;
+
+	BUG_ON(!ss->use_id);
+
+	rcu_assign_pointer(id->css, NULL);
+	rcu_assign_pointer(css->id, NULL);
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, id->id);
+	spin_unlock(&ss->id_lock);
+	call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+/*
+ * This is called by init or create(). Then, calls to this function are
+ * always serialized (By cgroup_mutex() at create()).
+ */
+
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+	struct css_id *newid;
+	int myid, error, size;
+
+	BUG_ON(!ss->use_id);
+
+	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+	newid = kzalloc(size, GFP_KERNEL);
+	if (!newid)
+		return ERR_PTR(-ENOMEM);
+	/* get id */
+	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+		error = -ENOMEM;
+		goto err_out;
+	}
+	spin_lock(&ss->id_lock);
+	/* Don't use 0. allocates an ID of 1-65535 */
+	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+	spin_unlock(&ss->id_lock);
+
+	/* Returns error when there are no free spaces for new ID.*/
+	if (error) {
+		error = -ENOSPC;
+		goto err_out;
+	}
+	if (myid > CSS_ID_MAX)
+		goto remove_idr;
+
+	newid->id = myid;
+	newid->depth = depth;
+	return newid;
+remove_idr:
+	error = -ENOSPC;
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, myid);
+	spin_unlock(&ss->id_lock);
+err_out:
+	kfree(newid);
+	return ERR_PTR(error);
+
+}
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+	struct css_id *newid;
+	struct cgroup_subsys_state *rootcss;
+
+	spin_lock_init(&ss->id_lock);
+	idr_init(&ss->idr);
+
+	rootcss = init_css_set.subsys[ss->subsys_id];
+	newid = get_new_cssid(ss, 0);
+	if (IS_ERR(newid))
+		return PTR_ERR(newid);
+
+	newid->stack[0] = newid->id;
+	newid->css = rootcss;
+	rootcss->id = newid;
+	return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
+			struct cgroup *child)
+{
+	int subsys_id, i, depth = 0;
+	struct cgroup_subsys_state *parent_css, *child_css;
+	struct css_id *child_id, *parent_id = NULL;
+
+	subsys_id = ss->subsys_id;
+	parent_css = parent->subsys[subsys_id];
+	child_css = child->subsys[subsys_id];
+	depth = css_depth(parent_css) + 1;
+	parent_id = parent_css->id;
+
+	child_id = get_new_cssid(ss, depth);
+	if (IS_ERR(child_id))
+		return PTR_ERR(child_id);
+
+	for (i = 0; i < depth; i++)
+		child_id->stack[i] = parent_id->stack[i];
+	child_id->stack[depth] = child_id->id;
+	/*
+	 * child_id->css pointer will be set after this cgroup is available
+	 * see cgroup_populate_dir()
+	 */
+	rcu_assign_pointer(child_css->id, child_id);
+
+	return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns pointer to cgroup_subsys_state if there is valid one with id.
+ * NULL if not. Should be called under rcu_read_lock()
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+	struct css_id *cssid = NULL;
+
+	BUG_ON(!ss->use_id);
+	cssid = idr_find(&ss->idr, id);
+
+	if (unlikely(!cssid))
+		return NULL;
+
+	return rcu_dereference(cssid->css);
+}
+
+/**
+ * css_get_next - lookup next cgroup under specified hierarchy.
+ * @ss: pointer to subsystem
+ * @id: current position of iteration.
+ * @root: pointer to css. search tree under this.
+ * @foundid: position of found object.
+ *
+ * Search next css under the specified hierarchy of rootid. Calling under
+ * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss, int id,
+	     struct cgroup_subsys_state *root, int *foundid)
+{
+	struct cgroup_subsys_state *ret = NULL;
+	struct css_id *tmp;
+	int tmpid;
+	int rootid = css_id(root);
+	int depth = css_depth(root);
+
+	if (!rootid)
+		return NULL;
+
+	BUG_ON(!ss->use_id);
+	/* fill start point for scan */
+	tmpid = id;
+	while (1) {
+		/*
+		 * scan next entry from bitmap(tree), tmpid is updated after
+		 * idr_get_next().
+		 */
+		spin_lock(&ss->id_lock);
+		tmp = idr_get_next(&ss->idr, &tmpid);
+		spin_unlock(&ss->id_lock);
+
+		if (!tmp)
+			break;
+		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
+			ret = rcu_dereference(tmp->css);
+			if (ret) {
+				*foundid = tmpid;
+				break;
+			}
+		}
+		/* continue to scan from next id */
+		tmpid = tmpid + 1;
+	}
+	return ret;
+}
+
-- 
cgit v1.1


From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:26 -0700
Subject: cgroup: fix frequent -EBUSY at rmdir

In following situation, with memory subsystem,

	/groupA use_hierarchy==1
		/01 some tasks
		/02 some tasks
		/03 some tasks
		/04 empty

When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim
is triggered and the kernel walks tree under groupA. In this case,
rmdir /groupA/04 fails with -EBUSY frequently because of temporal
refcnt from the kernel.

In general. cgroup can be rmdir'd if there are no children groups and
no tasks. Frequent fails of rmdir() is not useful to users.
(And the reason for -EBUSY is unknown to users.....in most cases)

This patch tries to modify above behavior, by
	- retries if css_refcnt is got by someone.
	- add "return value" to pre_destroy() and allows subsystem to
	  say "we're really busy!"

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3c5211..fc5e4a4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
+	int ret = 0;
+
 	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy)
-			ss->pre_destroy(ss, cgrp);
-	return;
+		if (ss->pre_destroy) {
+			ret = ss->pre_destroy(ss, cgrp);
+			if (ret)
+				break;
+		}
+	return ret;
 }
 
 static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 	remove_dir(dentry);
 }
 
+/*
+ * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to goes down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+		wake_up_all(&cgroup_rmdir_waitq);
+}
+
 static int rebind_subsystems(struct cgroupfs_root *root,
 			      unsigned long final_bits)
 {
@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
 	put_css_set(cg);
+
+	/*
+	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
+	 * is no longer empty.
+	 */
+	cgroup_wakeup_rmdir_waiters(cgrp);
 	return 0;
 }
 
@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
+	DEFINE_WAIT(wait);
+	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
-
+again:
 	mutex_lock(&cgroup_mutex);
 	if (atomic_read(&cgrp->count) != 0) {
 		mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
-	cgroup_call_pre_destroy(cgrp);
+	ret = cgroup_call_pre_destroy(cgrp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
-
-	if (atomic_read(&cgrp->count)
-	    || !list_empty(&cgrp->children)
-	    || !cgroup_clear_css_refs(cgrp)) {
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
+	/*
+	 * css_put/get is provided for subsys to grab refcnt to css. In typical
+	 * case, subsystem has no reference after pre_destroy(). But, under
+	 * hierarchy management, some *temporal* refcnt can be hold.
+	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
+	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
+	 * is called when css_put() is called and refcnt goes down to 0.
+	 */
+	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+	if (!cgroup_clear_css_refs(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		schedule();
+		finish_wait(&cgroup_rmdir_waitq, &wait);
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+		if (signal_pending(current))
+			return -EINTR;
+		goto again;
+	}
+	/* NO css_tryget() can success after here. */
+	finish_wait(&cgroup_rmdir_waitq, &wait);
+	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	rcu_read_lock();
-	if ((atomic_dec_return(&css->refcnt) == 1) &&
-	    notify_on_release(cgrp)) {
-		set_bit(CGRP_RELEASABLE, &cgrp->flags);
-		check_for_release(cgrp);
+	if (atomic_dec_return(&css->refcnt) == 1) {
+		if (notify_on_release(cgrp)) {
+			set_bit(CGRP_RELEASABLE, &cgrp->flags);
+			check_for_release(cgrp);
+		}
+		cgroup_wakeup_rmdir_waiters(cgrp);
 	}
 	rcu_read_unlock();
 }
-- 
cgit v1.1


From 66bdc9cfc77ba89a9ee6c82d28375b646ab4bb1d Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 2 Apr 2009 16:57:27 -0700
Subject: kernel/cgroup.c: kfree(NULL) is legal

Reduces object file size a bit:

Before:
$ size kernel/cgroup.o
   text    data     bss     dec     hex filename
  21593    7804    4924   34321    8611 kernel/cgroup.o
After:
$ size kernel/cgroup.o
   text    data     bss     dec     hex filename
  21537    7744    4924   34205    859d kernel/cgroup.o

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fc5e4a4..9a6c2bf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -923,8 +923,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
-	if (opts.release_agent)
-		kfree(opts.release_agent);
+	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -1027,15 +1026,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return ret;
 	}
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return -ENOMEM;
 	}
 
-- 
cgit v1.1


From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:29 -0700
Subject: cgroups: show correct file mode

We have some read-only files and write-only files, but currently they are
all set to 0644, which is counter-intuitive and cause trouble for some
cgroup tools like libcgroup.

This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's
own files' file mode, and for the most cases cft->mode can be default to 0
and cgroup will figure out proper mode.

Acked-by: Paul Menage <menage@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 38 ++++++++++++++++++++++++++++++++++----
 kernel/cpuset.c |  1 +
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9a6c2bf..fea11c5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1686,7 +1686,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 				struct super_block *sb)
 {
 	static const struct dentry_operations cgroup_dops = {
@@ -1732,7 +1732,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
  * @mode: mode to set on new directory.
  */
 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-				int mode)
+				mode_t mode)
 {
 	struct dentry *parent;
 	int error = 0;
@@ -1750,6 +1750,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
 	return error;
 }
 
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+	mode_t mode = 0;
+
+	if (cft->mode)
+		return cft->mode;
+
+	if (cft->read || cft->read_u64 || cft->read_s64 ||
+	    cft->read_map || cft->read_seq_string)
+		mode |= S_IRUGO;
+
+	if (cft->write || cft->write_u64 || cft->write_s64 ||
+	    cft->write_string || cft->trigger)
+		mode |= S_IWUSR;
+
+	return mode;
+}
+
 int cgroup_add_file(struct cgroup *cgrp,
 		       struct cgroup_subsys *subsys,
 		       const struct cftype *cft)
@@ -1757,6 +1784,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
+	mode_t mode;
 
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1767,7 +1795,8 @@ int cgroup_add_file(struct cgroup *cgrp,
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
 	dentry = lookup_one_len(name, dir, strlen(name));
 	if (!IS_ERR(dentry)) {
-		error = cgroup_create_file(dentry, 0644 | S_IFREG,
+		mode = cgroup_file_mode(cft);
+		error = cgroup_create_file(dentry, mode | S_IFREG,
 						cgrp->root->sb);
 		if (!error)
 			dentry->d_fsdata = (void *)cft;
@@ -2349,6 +2378,7 @@ static struct cftype files[] = {
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 
 	{
@@ -2449,7 +2479,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  * Must be called with the mutex on the parent inode held
  */
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-			     int mode)
+			     mode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root = parent->root;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9d..ee5ec38 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1706,6 +1706,7 @@ static struct cftype files[] = {
 		.read_u64 = cpuset_read_u64,
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_MEMORY_PRESSURE,
+		.mode = S_IRUGO,
 	},
 
 	{
-- 
cgit v1.1


From 0670e08bdfc67272f8c3087030417465629b8073 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:30 -0700
Subject: cgroups: don't change release_agent when remount failed

Remount can fail in either case:
  - wrong mount options is specified, or option 'noprefix' is changed.
  - a to-be-added subsys is already mounted/active.

When using remount to change 'release_agent', for the above former failure
case, remount will return errno with release_agent unchanged, but for the
latter case, remount will return EBUSY with relase_agent changed, which is
unexpected I think:

 # mount -t cgroup -o cpu xxx /cgrp1
 # mount -t cgroup -o cpuset,release_agent=agent1 yyy /cgrp2
 # cat /cgrp2/release_agent
 agent1
 # mount -t cgroup -o remount,cpuset,noprefix,release_agent=agent2 yyy /cgrp2
 mount: /cgrp2 not mounted already, or bad option
 # cat /cgrp2/release_agent
 agent1     <-- ok
 # mount -t cgroup -o remount,cpu,cpuset,release_agent=agent2 yyy /cgrp2
 mount: /cgrp2 is busy
 # cat /cgrp2/release_agent
 agent2     <-- unexpected!

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fea11c5..f2a3f5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -915,10 +915,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
+	if (ret)
+		goto out_unlock;
 
 	/* (re)populate subsystem files */
-	if (!ret)
-		cgroup_populate_dir(cgrp);
+	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
-- 
cgit v1.1


From d969fbe69e07fcceb0558b35d4c75eb046041c5e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:31 -0700
Subject: debug cgroup: remove unneeded cgroup_lock

Since we are in cgroup write handler, so the cgrp is valid, so we don't
have to hold cgroup_mutex when calling cgroup_task_count().  One similar
example is in cgroup_tasks_open().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup_debug.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca620..0c92d79 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
 {
 	u64 count;
 
-	cgroup_lock();
 	count = cgroup_task_count(cont);
-	cgroup_unlock();
 	return count;
 }
 
-- 
cgit v1.1


From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:38 -0700
Subject: memcg: fix OOM killer under memcg

This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.

But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup

	/groupA/	hierarchy=1, limit=1G,
		01	nolimit
		02	nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.

This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.

To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f2a3f5c..382109b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 }
 
 bool css_is_ancestor(struct cgroup_subsys_state *child,
-		    struct cgroup_subsys_state *root)
+		    const struct cgroup_subsys_state *root)
 {
 	struct css_id *child_id = rcu_dereference(child->id);
 	struct css_id *root_id = rcu_dereference(root->id);
-- 
cgit v1.1


From 0b4217b3fdddc4a58939720d3ed809537577d48b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:49 -0700
Subject: cpuset: fix possible races in cpu/memory hotplug

Change to cpuset->cpus_allowed and cpuset->mems_allowed should be protected
by callback_mutex, otherwise the reader may read wrong cpus/mems. This is
cpuset's lock rule.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee5ec38..3173795 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2070,7 +2070,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 	}
 
 	cgroup_lock();
+	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+	mutex_unlock(&callback_mutex);
 	scan_for_empty_cpusets(&top_cpuset);
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
@@ -2093,11 +2095,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
-		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-		break;
 	case MEM_OFFLINE:
+		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-		scan_for_empty_cpusets(&top_cpuset);
+		mutex_unlock(&callback_mutex);
+		if (action == MEM_OFFLINE)
+			scan_for_empty_cpusets(&top_cpuset);
 		break;
 	default:
 		break;
-- 
cgit v1.1


From 3b6766fe668b83c8a03c6ed01bcc2ac77cbae848 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:51 -0700
Subject: cpuset: rewrite update_tasks_nodemask()

This patch uses cgroup_scan_tasks() to rebind tasks' vmas to new cpuset's
mems_allowed.

Not only simplify the code largely, but also avoid allocating an array to
hold mm pointers of all the tasks in the cpuset.  This array can be big
(size > PAGESIZE) if we have lots of tasks in that cpuset, thus has a
chance to fail the allocation when under memory stress.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 109 ++++++++++++++++++++------------------------------------
 1 file changed, 39 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3173795..dca455e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 
+/*
+ * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
+ * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+				   struct cgroup_scanner *scan)
+{
+	struct mm_struct *mm;
+	struct cpuset *cs;
+	int migrate;
+	const nodemask_t *oldmem = scan->data;
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return;
+
+	cs = cgroup_cs(scan->cg);
+	migrate = is_memory_migrate(cs);
+
+	mpol_rebind_mm(mm, &cs->mems_allowed);
+	if (migrate)
+		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+	mmput(mm);
+}
+
 static void *cpuset_being_rebound;
 
 /**
@@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
  */
 static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-	struct task_struct *p;
-	struct mm_struct **mmarray;
-	int i, n, ntasks;
-	int migrate;
-	int fudge;
-	struct cgroup_iter it;
 	int retval;
+	struct cgroup_scanner scan;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
-	fudge = 10;				/* spare mmarray[] slots */
-	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
-	retval = -ENOMEM;
-
-	/*
-	 * Allocate mmarray[] to hold mm reference for each task
-	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
-	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
-	 * few more lines of code, we can retry until we get a big
-	 * enough mmarray[] w/o using GFP_ATOMIC.
-	 */
-	while (1) {
-		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
-		ntasks += fudge;
-		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
-		if (!mmarray)
-			goto done;
-		read_lock(&tasklist_lock);		/* block fork */
-		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
-			break;				/* got enough */
-		read_unlock(&tasklist_lock);		/* try again */
-		kfree(mmarray);
-	}
-
-	n = 0;
-
-	/* Load up mmarray[] with mm reference for each task in cpuset. */
-	cgroup_iter_start(cs->css.cgroup, &it);
-	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
-		struct mm_struct *mm;
-
-		if (n >= ntasks) {
-			printk(KERN_WARNING
-				"Cpuset mempolicy rebind incomplete.\n");
-			break;
-		}
-		mm = get_task_mm(p);
-		if (!mm)
-			continue;
-		mmarray[n++] = mm;
-	}
-	cgroup_iter_end(cs->css.cgroup, &it);
-	read_unlock(&tasklist_lock);
+	scan.cg = cs->css.cgroup;
+	scan.test_task = NULL;
+	scan.process_task = cpuset_change_nodemask;
+	scan.heap = NULL;
+	scan.data = (nodemask_t *)oldmem;
 
 	/*
-	 * Now that we've dropped the tasklist spinlock, we can
-	 * rebind the vma mempolicies of each mm in mmarray[] to their
-	 * new cpuset, and release that mm.  The mpol_rebind_mm()
-	 * call takes mmap_sem, which we couldn't take while holding
-	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
-	 * cpuset_being_rebound check will catch such forks, and rebind
-	 * their vma mempolicies too.  Because we still hold the global
-	 * cgroup_mutex, we know that no other rebind effort will
-	 * be contending for the global variable cpuset_being_rebound.
+	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+	 * take while holding tasklist_lock.  Forks can happen - the
+	 * mpol_dup() cpuset_being_rebound check will catch such forks,
+	 * and rebind their vma mempolicies too.  Because we still hold
+	 * the global cgroup_mutex, we know that no other rebind effort
+	 * will be contending for the global variable cpuset_being_rebound.
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	migrate = is_memory_migrate(cs);
-	for (i = 0; i < n; i++) {
-		struct mm_struct *mm = mmarray[i];
-
-		mpol_rebind_mm(mm, &cs->mems_allowed);
-		if (migrate)
-			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
-		mmput(mm);
-	}
+	retval = cgroup_scan_tasks(&scan);
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
-	kfree(mmarray);
 	cpuset_being_rebound = NULL;
-	retval = 0;
-done:
+
 	return retval;
 }
 
-- 
cgit v1.1


From 010cfac4ca0f9e85f54ba2117a372e72f4fb9a60 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:52 -0700
Subject: cpuset: avoid changing cpuset's mems when errno returned

When writing to cpuset.mems, cpuset has to update its mems_allowed before
calling update_tasks_nodemask(), but this function might return -ENOMEM.

To avoid this rare case, we allocate the memory before changing
mems_allowed, and then pass to update_tasks_nodemask().  Similar to what
update_cpumask() does.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index dca455e..3778a21 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1057,13 +1057,15 @@ static void *cpuset_being_rebound;
  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
  * @oldmem: old mems_allowed of cpuset cs
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
  * Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
  */
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
+				 struct ptr_heap *heap)
 {
-	int retval;
 	struct cgroup_scanner scan;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
@@ -1071,7 +1073,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	scan.cg = cs->css.cgroup;
 	scan.test_task = NULL;
 	scan.process_task = cpuset_change_nodemask;
-	scan.heap = NULL;
+	scan.heap = heap;
 	scan.data = (nodemask_t *)oldmem;
 
 	/*
@@ -1084,12 +1086,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	retval = cgroup_scan_tasks(&scan);
+	cgroup_scan_tasks(&scan);
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
 	cpuset_being_rebound = NULL;
-
-	return retval;
 }
 
 /*
@@ -1110,6 +1110,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 {
 	nodemask_t oldmem;
 	int retval;
+	struct ptr_heap heap;
 
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1144,12 +1145,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval < 0)
+		goto done;
+
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
 	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
-	retval = update_tasks_nodemask(cs, &oldmem);
+	update_tasks_nodemask(cs, &oldmem, &heap);
+
+	heap_free(&heap);
 done:
 	return retval;
 }
@@ -2003,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			remove_tasks_in_empty_cpuset(cp);
 		else {
 			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, &oldmems);
+			update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
 }
-- 
cgit v1.1


From 7f81b1ae18416b457e4d5ff23f0bd598e8a42224 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:53 -0700
Subject: cpuset: remove struct cpuset_hotplug_scanner

Use cgroup_scanner.data, instead of introducing cpuset_hotplug_scanner.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3778a21..0619f10 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-struct cpuset_hotplug_scanner {
-	struct cgroup_scanner scan;
-	struct cgroup *to;
-};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -1890,10 +1886,9 @@ int __init cpuset_init(void)
 static void cpuset_do_move_task(struct task_struct *tsk,
 				struct cgroup_scanner *scan)
 {
-	struct cpuset_hotplug_scanner *chsp;
+	struct cgroup *new_cgroup = scan->data;
 
-	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
-	cgroup_attach_task(chsp->to, tsk);
+	cgroup_attach_task(new_cgroup, tsk);
 }
 
 /**
@@ -1909,15 +1904,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
  */
 static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 {
-	struct cpuset_hotplug_scanner scan;
+	struct cgroup_scanner scan;
 
-	scan.scan.cg = from->css.cgroup;
-	scan.scan.test_task = NULL; /* select all tasks in cgroup */
-	scan.scan.process_task = cpuset_do_move_task;
-	scan.scan.heap = NULL;
-	scan.to = to->css.cgroup;
+	scan.cg = from->css.cgroup;
+	scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.process_task = cpuset_do_move_task;
+	scan.heap = NULL;
+	scan.data = to->css.cgroup;
 
-	if (cgroup_scan_tasks(&scan.scan))
+	if (cgroup_scan_tasks(&scan))
 		printk(KERN_ERR "move_member_tasks_to_cpuset: "
 				"cgroup_scan_tasks failed\n");
 }
-- 
cgit v1.1


From a1bc5a4eee990a1f290735c8694d0aebdad095fa Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 2 Apr 2009 16:57:54 -0700
Subject: cpusets: replace zone allowed functions with node allowed

The cpuset_zone_allowed() variants are actually only a function of the
zone's node.

Cc: Paul Menage <menage@google.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 59 ++++++++++++++++++++++++---------------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0619f10..3ff910e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2181,26 +2181,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 }
 
 /**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate.  If
- * __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.  If it's not a
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
- * If the task has been OOM killed and has access to memory reserves
- * as specified by the TIF_MEMDIE flag, yes.
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
+ * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
- * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
- * from an enclosing cpuset.
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
  *
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
- * hardwall cpusets, and never sleeps.
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
  *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
@@ -2239,20 +2237,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  *
  * Rule:
- *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
  *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
  *    the code that might scan up ancestor cpusets and sleep.
  */
-
-int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
-	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
@@ -2281,15 +2276,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 }
 
 /*
- * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate.
- * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.   If the task has been
- * OOM killed and has access to memory reserves as specified by the
- * TIF_MEMDIE flag, yes.  Otherwise, no.
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
+ * yes.  If the task has been OOM killed and has access to memory reserves as
+ * specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
  *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
@@ -2297,20 +2292,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
  * any node on the zonelist except the first.  By the time any such
  * calls get to this routine, we should just shut up and say 'yes'.
  *
- * Unlike the cpuset_zone_allowed_softwall() variant, above,
- * this variant requires that the zone be in the current tasks
+ * Unlike the cpuset_node_allowed_softwall() variant, above,
+ * this variant requires that the node be in the current task's
  * mems_allowed or that we're in interrupt.  It does not scan up the
  * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
  * It never sleeps.
  */
-
-int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
-	int node;			/* node that zone z is on */
-
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = zone_to_nid(z);
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	/*
-- 
cgit v1.1


From db7f47cf4805e30decb0841764b21b7c4000f7dc Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Thu, 2 Apr 2009 16:57:55 -0700
Subject: cpusets: allow cpusets to be configured/built on non-SMP systems

Allow cpusets to be configured/built on non-SMP systems

Currently it's impossible to build cpusets under UML on x86-64, since
cpusets depends on SMP and x86-64 UML doesn't support SMP.

There's code in cpusets that doesn't depend on SMP.  This patch surrounds
the minimum amount of cpusets code with #ifdef CONFIG_SMP in order to
allow cpusets to build/run on UP systems (for testing purposes under UML).

Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3ff910e..2b93b50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -517,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	return 0;
 }
 
+#ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
  * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -811,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 
 	put_online_cpus();
 }
+#else /* !CONFIG_SMP */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+}
+
+static int generate_sched_domains(struct cpumask **domains,
+			struct sched_domain_attr **attributes)
+{
+	*domains = NULL;
+	return 1;
+}
+#endif /* CONFIG_SMP */
 
 static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
 
@@ -1164,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
+#ifdef CONFIG_SMP
 	if (val < -1 || val >= SD_LV_MAX)
 		return -EINVAL;
+#endif
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-- 
cgit v1.1


From 6d7b2f5f9e88902b19f91d0c8a7ef58a5455f1a2 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 2 Apr 2009 16:57:57 -0700
Subject: cpusets: prevent PF_THREAD_BOUND tasks from attaching to non-root
 cpusets

Kthreads that have the PF_THREAD_BOUND bit set in their flags are bound to a
specific cpu.  Thus, their set of allowed cpus shall not change.

This patch prevents such threads from attaching to non-root cpusets.  They do
not have mempolicies that restrict them to a subset of system nodes and, since
their cpumask may never change, they cannot use any of the features of
cpusets.

The tasks will forever be a member of the root cpuset and will be returned
when listing the tasks attached to that cpuset.

Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2b93b50..026facc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1342,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 			     struct cgroup *cont, struct task_struct *tsk)
 {
 	struct cpuset *cs = cgroup_cs(cont);
-	int ret = 0;
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
 
-	if (tsk->flags & PF_THREAD_BOUND) {
-		mutex_lock(&callback_mutex);
-		if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
-			ret = -EINVAL;
-		mutex_unlock(&callback_mutex);
-	}
+	/*
+	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
+	 * cannot change their cpu affinity and isolating such threads by their
+	 * set of allowed nodes is unnecessary.  Thus, cpusets are not
+	 * applicable for such threads.  This prevents checking for success of
+	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
+	 * be changed.
+	 */
+	if (tsk->flags & PF_THREAD_BOUND)
+		return -EINVAL;
 
-	return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
+	return security_task_setscheduler(tsk, 0, NULL);
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss,
-- 
cgit v1.1


From 90bc8d8b1a38f1ab131a2399a202e1889db95de8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:57:58 -0700
Subject: do_wait: fix waiting for the group stop with the dead leader

do_wait(WSTOPPED) assumes that p->state must be == TASK_STOPPED, this is
not true if the leader is already dead.  Check SIGNAL_STOP_STOPPED instead
and use signal->group_exit_code.

Trivial test-case:

	void *tfunc(void *arg)
	{
		pause();
		return NULL;
	}

	int main(void)
	{
		pthread_t thr;
		pthread_create(&thr, NULL, tfunc, NULL);
		pthread_exit(NULL);
		return 0;
	}

It doesn't react to ^Z (and then to ^C or ^\). The task is stopped, but
bash can't see this.

The bug is very old, and it was reported multiple times. This patch was sent
more than a year ago (http://marc.info/?t=119713920000003) but it was ignored.

This change also fixes other oddities (but not all) in this area.  For
example, before this patch:

	$ sleep 100
	^Z
	[1]+  Stopped                 sleep 100
	$ strace -p `pidof sleep`
	Process 11442 attached - interrupt to quit

strace hangs in do_wait(), because ->exit_code was already consumed by
bash.  After this patch, strace happily proceeds:

	--- SIGTSTP (Stopped) @ 0 (0) ---
	restart_syscall(<... resuming interrupted call ...>

To me, this looks much more "natural" and correct.

Another example.  Let's suppose we have the main thread M and sub-thread
T, the process is stopped, and its parent did wait(WSTOPPED).  Now we can
ptrace T but not M.  This looks at least strange to me.

Imho, do_wait() should not confuse the per-thread ptrace stops with the
per-process job control stops.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Kaz Kylheku <kkylheku@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3..0c06b9e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1417,6 +1417,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	return retval;
 }
 
+static int *task_stopped_code(struct task_struct *p, bool ptrace)
+{
+	if (ptrace) {
+		if (task_is_stopped_or_traced(p))
+			return &p->exit_code;
+	} else {
+		if (p->signal->flags & SIGNAL_STOP_STOPPED)
+			return &p->signal->group_exit_code;
+	}
+	return NULL;
+}
+
 /*
  * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
  * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
@@ -1427,7 +1439,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
-	int retval, exit_code, why;
+	int retval, exit_code, *p_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
@@ -1437,22 +1449,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
-	if (unlikely(!task_is_stopped_or_traced(p)))
-		goto unlock_sig;
-
-	if (!ptrace && p->signal->group_stop_count > 0)
-		/*
-		 * A group stop is in progress and this is the group leader.
-		 * We won't report until all threads have stopped.
-		 */
+	p_code = task_stopped_code(p, ptrace);
+	if (unlikely(!p_code))
 		goto unlock_sig;
 
-	exit_code = p->exit_code;
+	exit_code = *p_code;
 	if (!exit_code)
 		goto unlock_sig;
 
 	if (!unlikely(options & WNOWAIT))
-		p->exit_code = 0;
+		*p_code = 0;
 
 	/* don't need the RCU readlock here as we're holding a spinlock */
 	uid = __task_cred(p)->uid;
@@ -1608,7 +1614,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 	 */
 	*notask_error = 0;
 
-	if (task_is_stopped_or_traced(p))
+	if (task_stopped_code(p, ptrace))
 		return wait_task_stopped(ptrace, p, options,
 					 infop, stat_addr, ru);
 
-- 
cgit v1.1


From 43918f2bf4806675943416d539d9d5e4d585ebff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:00 -0700
Subject: signals: remove 'handler' parameter to tracehook functions

Container-init must behave like global-init to processes within the
container and hence it must be immune to unhandled fatal signals from
within the container (i.e SIG_DFL signals that terminate the process).

But the same container-init must behave like a normal process to processes
in ancestor namespaces and so if it receives the same fatal signal from a
process in ancestor namespace, the signal must be processed.

Implementing these semantics requires that send_signal() determine pid
namespace of the sender but since signals can originate from workqueues/
interrupt-handlers, determining pid namespace of sender may not always be
possible or safe.

This patchset implements the design/simplified semantics suggested by
Oleg Nesterov.  The simplified semantics for container-init are:

	- container-init must never be terminated by a signal from a
	  descendant process.

	- container-init must never be immune to SIGKILL from an ancestor
	  namespace (so a process in parent namespace must always be able
	  to terminate a descendant container).

	- container-init may be immune to unhandled fatal signals (like
	  SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP
	  are the only reliable signals to a container-init from ancestor
	  namespace.

This patch:

Based on an earlier patch submitted by Oleg Nesterov and comments from
Roland McGrath (http://lkml.org/lkml/2008/11/19/258).

The handler parameter is currently unused in the tracehook functions.
Besides, the tracehook functions are called with siglock held, so the
functions can check the handler if they later need to.

Removing the parameter simiplifies changes to sig_ignored() in a follow-on
patch.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 1c88144..92a1ab0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -74,7 +74,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	/*
 	 * Tracers may want to know about even ignored signals.
 	 */
-	return !tracehook_consider_ignored_signal(t, sig, handler);
+	return !tracehook_consider_ignored_signal(t, sig);
 }
 
 /*
@@ -318,7 +318,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
 		return 1;
 	if (handler != SIG_IGN && handler != SIG_DFL)
 		return 0;
-	return !tracehook_consider_fatal_signal(tsk, sig, handler);
+	return !tracehook_consider_fatal_signal(tsk, sig);
 }
 
 
@@ -777,7 +777,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
 	    (sig == SIGKILL ||
-	     !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
+	     !tracehook_consider_fatal_signal(t, sig))) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
-- 
cgit v1.1


From f008faff0e2777c8b3fe853891b774ca465938d8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:02 -0700
Subject: signals: protect init from unwanted signals more

(This is a modified version of the patch submitted by Oleg Nesterov
http://lkml.org/lkml/2008/11/18/249 and tries to address comments that
came up in that discussion)

init ignores the SIG_DFL signals but we queue them anyway, including
SIGKILL.  This is mostly OK, the signal will be dropped silently when
dequeued, but the pending SIGKILL has 2 bad implications:

        - it implies fatal_signal_pending(), so we confuse things
          like wait_for_completion_killable/lock_page_killable.

        - for the sub-namespace inits, the pending SIGKILL can
          mask (legacy_queue) the subsequent SIGKILL from the
          parent namespace which must kill cinit reliably.
          (preparation, cinits don't have SIGNAL_UNKILLABLE yet)

The patch can't help when init is ptraced, but ptracing of init is not
"safe" anyway.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 92a1ab0..8bf7a40 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,10 +55,21 @@ static int sig_handler_ignored(void __user *handler, int sig)
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
-static int sig_ignored(struct task_struct *t, int sig)
+static int sig_task_ignored(struct task_struct *t, int sig)
 {
 	void __user *handler;
 
+	handler = sig_handler(t, sig);
+
+	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
+			handler == SIG_DFL)
+		return 1;
+
+	return sig_handler_ignored(handler, sig);
+}
+
+static int sig_ignored(struct task_struct *t, int sig)
+{
 	/*
 	 * Blocked signals are never ignored, since the
 	 * signal handler may change by the time it is
@@ -67,8 +78,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	handler = sig_handler(t, sig);
-	if (!sig_handler_ignored(handler, sig))
+	if (!sig_task_ignored(t, sig))
 		return 0;
 
 	/*
-- 
cgit v1.1


From 7978b567d31555fc828b8f945c605ad29e117b22 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:04 -0700
Subject: signals: add from_ancestor_ns parameter to send_signal()

send_signal() (or its helper) needs to determine the pid namespace of the
sender.  But a signal sent via kill_pid_info_as_uid() comes from within
the kernel and send_signal() does not need to determine the pid namespace
of the sender.  So define a helper for send_signal() which takes an
additional parameter, 'from_ancestor_ns' and have kill_pid_info_as_uid()
use that helper directly.

The 'from_ancestor_ns' parameter will be used in a follow-on patch.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8bf7a40..7b6de96 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -823,8 +823,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
-static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-			int group)
+static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			int group, int from_ancestor_ns)
 {
 	struct sigpending *pending;
 	struct sigqueue *q;
@@ -899,6 +899,12 @@ out_set:
 	return 0;
 }
 
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			int group)
+{
+	return __send_signal(sig, info, t, group, 0);
+}
+
 int print_fatal_signals;
 
 static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1143,7 +1149,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 	if (sig && p->sighand) {
 		unsigned long flags;
 		spin_lock_irqsave(&p->sighand->siglock, flags);
-		ret = __group_send_sig_info(sig, info, p);
+		ret = __send_signal(sig, info, p, 1, 0);
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 out_unlock:
-- 
cgit v1.1


From 921cf9f63089c7442d44083477620132f4cea066 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:05 -0700
Subject: signals: protect cinit from unblocked SIG_DFL signals

Drop early any SIG_DFL or SIG_IGN signals to container-init from within
the same container.  But queue SIGSTOP and SIGKILL to the container-init
if they are from an ancestor container.

Blocked, fatal signals (i.e when SIG_DFL is to terminate) from within the
container can still terminate the container-init.  That will be addressed
in the next patch.

Note:	To be bisect-safe, SIGNAL_UNKILLABLE will be set for container-inits
   	in a follow-on patch. Until then, this patch is just a preparatory
	step.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 7b6de96..fb19aae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,20 +55,21 @@ static int sig_handler_ignored(void __user *handler, int sig)
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
-static int sig_task_ignored(struct task_struct *t, int sig)
+static int sig_task_ignored(struct task_struct *t, int sig,
+		int from_ancestor_ns)
 {
 	void __user *handler;
 
 	handler = sig_handler(t, sig);
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
-			handler == SIG_DFL)
+			handler == SIG_DFL && !from_ancestor_ns)
 		return 1;
 
 	return sig_handler_ignored(handler, sig);
 }
 
-static int sig_ignored(struct task_struct *t, int sig)
+static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
 {
 	/*
 	 * Blocked signals are never ignored, since the
@@ -78,7 +79,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	if (!sig_task_ignored(t, sig))
+	if (!sig_task_ignored(t, sig, from_ancestor_ns))
 		return 0;
 
 	/*
@@ -634,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
  * Returns true if the signal should be actually delivered, otherwise
  * it should be dropped.
  */
-static int prepare_signal(int sig, struct task_struct *p)
+static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 {
 	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
@@ -718,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
 		}
 	}
 
-	return !sig_ignored(p, sig);
+	return !sig_ignored(p, sig, from_ancestor_ns);
 }
 
 /*
@@ -832,7 +833,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	trace_sched_signal_send(sig, t);
 
 	assert_spin_locked(&t->sighand->siglock);
-	if (!prepare_signal(sig, t))
+
+	if (!prepare_signal(sig, t, from_ancestor_ns))
 		return 0;
 
 	pending = group ? &t->signal->shared_pending : &t->pending;
@@ -902,7 +904,15 @@ out_set:
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			int group)
 {
-	return __send_signal(sig, info, t, group, 0);
+	int from_ancestor_ns = 0;
+
+#ifdef CONFIG_PID_NS
+	if (!is_si_special(info) && SI_FROMUSER(info) &&
+			task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
+		from_ancestor_ns = 1;
+#endif
+
+	return __send_signal(sig, info, t, group, from_ancestor_ns);
 }
 
 int print_fatal_signals;
@@ -1336,7 +1346,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 		goto ret;
 
 	ret = 1; /* the signal is ignored */
-	if (!prepare_signal(sig, t))
+	if (!prepare_signal(sig, t, 0))
 		goto out;
 
 	ret = 0;
-- 
cgit v1.1


From e4da026f980df125a4918c3bb9fe93185c7ef12a Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:06 -0700
Subject: signals: zap_pid_ns_process() should use force_sig()

send_signal() assumes that signals with SEND_SIG_PRIV are generated from
within the same namespace.  So any nested container-init processes become
immune to the SIGKILL generated by kill_proc_info() in
zap_pid_ns_processes().

Use force_sig() in zap_pid_ns_processes() instead - force_sig() clears the
SIGNAL_UNKILLABLE flag ensuring the signal is processed by
container-inits.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid_namespace.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea8..2d1001b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 {
 	int nr;
 	int rc;
+	struct task_struct *task;
 
 	/*
 	 * The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	read_lock(&tasklist_lock);
 	nr = next_pidmap(pid_ns, 1);
 	while (nr > 0) {
-		kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
+		rcu_read_lock();
+
+		/*
+		 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
+		 * any nested-container's init processes don't ignore the
+		 * signal
+		 */
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (task)
+			force_sig(SIGKILL, task);
+
+		rcu_read_unlock();
+
 		nr = next_pidmap(pid_ns, nr);
 	}
 	read_unlock(&tasklist_lock);
-- 
cgit v1.1


From b3bfa0cba867f23365b81658b47efd906830879b Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:08 -0700
Subject: signals: protect cinit from blocked fatal signals

Normally SIG_DFL signals to global and container-init are dropped early.
But if a signal is blocked when it is posted, we cannot drop the signal
since the receiver may install a handler before unblocking the signal.
Once this signal is queued however, the receiver container-init has no way
of knowing if the signal was sent from an ancestor or descendant
namespace.  This patch ensures that contianer-init drops all SIG_DFL
signals in get_signal_to_deliver() except SIGKILL/SIGSTOP.

If SIGSTOP/SIGKILL originate from a descendant of container-init they are
never queued (i.e dropped in sig_ignored() in an earler patch).

If SIGSTOP/SIGKILL originate from parent namespace, the signal is queued
and container-init processes the signal.

IOW, if get_signal_to_deliver() sees a sig_kernel_only() signal for global
or container-init, the signal must have been generated internally or must
have come from an ancestor ns and we process the signal.

Further, the signal_group_exit() check was needed to cover the case of a
multi-threaded init sending SIGKILL to other threads when doing an exit()
or exec().  But since the new sig_kernel_only() check covers the SIGKILL,
the signal_group_exit() check is no longer needed and can be removed.

Finally, now that we have all pieces in place, set SIGNAL_UNKILLABLE for
container-inits.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c   | 2 ++
 kernel/signal.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d7eb727..adbea16 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,6 +841,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	atomic_set(&sig->live, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
 	sig->flags = 0;
+	if (clone_flags & CLONE_NEWPID)
+		sig->flags |= SIGNAL_UNKILLABLE;
 	sig->group_exit_code = 0;
 	sig->group_exit_task = NULL;
 	sig->group_stop_count = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index fb19aae..ba3da25 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1870,9 +1870,16 @@ relock:
 
 		/*
 		 * Global init gets no signals it doesn't want.
+		 * Container-init gets no signals it doesn't want from same
+		 * container.
+		 *
+		 * Note that if global/container-init sees a sig_kernel_only()
+		 * signal here, the signal must have been generated internally
+		 * or must have come from an ancestor namespace. In either
+		 * case, the signal cannot be dropped.
 		 */
 		if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
-		    !signal_group_exit(signal))
+				!sig_kernel_only(signr))
 			continue;
 
 		if (sig_kernel_stop(signr)) {
-- 
cgit v1.1


From 6588c1e3ff01418acafd938db0740e3477dc8cb7 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:09 -0700
Subject: signals: SI_USER: Masquerade si_pid when crossing pid ns boundary

When sending a signal to a descendant namespace, set ->si_pid to 0 since
the sender does not have a pid in the receiver's namespace.

Note:
	- If rt_sigqueueinfo() sets si_code to SI_USER when sending a
	  signal across a pid namespace boundary, the value in ->si_pid
	  will be cleared to 0.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ba3da25..d803473 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -883,6 +883,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			break;
 		default:
 			copy_siginfo(&q->info, info);
+			if (from_ancestor_ns)
+				q->info.si_pid = 0;
 			break;
 		}
 	} else if (!is_si_special(info)) {
-- 
cgit v1.1


From 95c3eb76dc07fd81289888ffc42948196b34b444 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:11 -0700
Subject: ptrace: kill __ptrace_detach(), fix ->exit_state check

Move the code from __ptrace_detach() to its single caller and kill this
helper.

Also, fix the ->exit_state check, we shouldn't wake up EXIT_DEAD tasks.
Actually, I think task_is_stopped_or_traced() makes more sense, but this
needs another patch.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b..f62a568 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -235,16 +235,6 @@ out:
 	return retval;
 }
 
-static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
-{
-	child->exit_code = data;
-	/* .. re-parent .. */
-	__ptrace_unlink(child);
-	/* .. and wake it up. */
-	if (child->exit_state != EXIT_ZOMBIE)
-		wake_up_process(child);
-}
-
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	if (!valid_signal(data))
@@ -254,10 +244,16 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	ptrace_disable(child);
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
-	write_lock_irq(&tasklist_lock);
 	/* protect against de_thread()->release_task() */
-	if (child->ptrace)
-		__ptrace_detach(child, data);
+	write_lock_irq(&tasklist_lock);
+	if (child->ptrace) {
+		child->exit_code = data;
+
+		__ptrace_unlink(child);
+
+		if (!child->exit_state)
+			wake_up_process(child);
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	return 0;
-- 
cgit v1.1


From 6d69cb87f05eef3b02370b2f7bae608ad2301a00 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:12 -0700
Subject: ptrace: simplify ptrace_exit()->ignoring_children() path

ignoring_children() takes parent->sighand->siglock and checks
k_sigaction[SIGCHLD] atomically.  But this buys nothing, we can't get the
"really" wrong result even if we race with sigaction(SIGCHLD).  If we read
the "stale" sa_handler/sa_flags we can pretend it was changed right after
the check.

Remove spin_lock(->siglock), and kill "int ign" which caches the result of
ignoring_children() which becomes rather trivial.

Perhaps it makes sense to export this helper, do_notify_parent() can use
it too.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 0c06b9e..7a83114 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -732,19 +732,15 @@ static void exit_mm(struct task_struct * tsk)
 }
 
 /*
- * Return nonzero if @parent's children should reap themselves.
- *
- * Called with write_lock_irq(&tasklist_lock) held.
+ * Called with irqs disabled, returns true if childs should reap themselves.
  */
-static int ignoring_children(struct task_struct *parent)
+static int ignoring_children(struct sighand_struct *sigh)
 {
 	int ret;
-	struct sighand_struct *psig = parent->sighand;
-	unsigned long flags;
-	spin_lock_irqsave(&psig->siglock, flags);
-	ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
-	       (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
-	spin_unlock_irqrestore(&psig->siglock, flags);
+	spin_lock(&sigh->siglock);
+	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
+	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
+	spin_unlock(&sigh->siglock);
 	return ret;
 }
 
@@ -757,7 +753,6 @@ static int ignoring_children(struct task_struct *parent)
 static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
 	struct task_struct *p, *n;
-	int ign = -1;
 
 	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
 		__ptrace_unlink(p);
@@ -779,12 +774,8 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 		if (!task_detached(p) && thread_group_empty(p)) {
 			if (!same_thread_group(p->real_parent, parent))
 				do_notify_parent(p, p->exit_signal);
-			else {
-				if (ign < 0)
-					ign = ignoring_children(parent);
-				if (ign)
-					p->exit_signal = -1;
-			}
+			else if (ignoring_children(parent->sighand))
+				p->exit_signal = -1;
 		}
 
 		if (task_detached(p)) {
-- 
cgit v1.1


From b1b4c6799fb59e710454bfe0ab477cb8523a8667 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:13 -0700
Subject: ptrace: reintroduce __ptrace_detach() as a callee of ptrace_exit()

No functional changes, preparation for the next patch.

Move the "should we release this child" logic into the separate handler,
__ptrace_detach().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 62 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 7a83114..576eae2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -744,6 +744,38 @@ static int ignoring_children(struct sighand_struct *sigh)
 	return ret;
 }
 
+/* Returns nonzero if the tracee should be released. */
+int __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
+{
+	__ptrace_unlink(p);
+
+	if (p->exit_state != EXIT_ZOMBIE)
+		return 0;
+	/*
+	 * If it's a zombie, our attachedness prevented normal
+	 * parent notification or self-reaping.  Do notification
+	 * now if it would have happened earlier.  If it should
+	 * reap itself we return true.
+	 *
+	 * If it's our own child, there is no notification to do.
+	 * But if our normal children self-reap, then this child
+	 * was prevented by ptrace and we must reap it now.
+	 */
+	if (!task_detached(p) && thread_group_empty(p)) {
+		if (!same_thread_group(p->real_parent, tracer))
+			do_notify_parent(p, p->exit_signal);
+		else if (ignoring_children(tracer->sighand))
+			p->exit_signal = -1;
+	}
+
+	if (!task_detached(p))
+		return 0;
+
+	/* Mark it as in the process of being reaped. */
+	p->exit_state = EXIT_DEAD;
+	return 1;
+}
+
 /*
  * Detach all tasks we were using ptrace on.
  * Any that need to be release_task'd are put on the @dead list.
@@ -755,36 +787,8 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 	struct task_struct *p, *n;
 
 	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
-		__ptrace_unlink(p);
-
-		if (p->exit_state != EXIT_ZOMBIE)
-			continue;
-
-		/*
-		 * If it's a zombie, our attachedness prevented normal
-		 * parent notification or self-reaping.  Do notification
-		 * now if it would have happened earlier.  If it should
-		 * reap itself, add it to the @dead list.  We can't call
-		 * release_task() here because we already hold tasklist_lock.
-		 *
-		 * If it's our own child, there is no notification to do.
-		 * But if our normal children self-reap, then this child
-		 * was prevented by ptrace and we must reap it now.
-		 */
-		if (!task_detached(p) && thread_group_empty(p)) {
-			if (!same_thread_group(p->real_parent, parent))
-				do_notify_parent(p, p->exit_signal);
-			else if (ignoring_children(parent->sighand))
-				p->exit_signal = -1;
-		}
-
-		if (task_detached(p)) {
-			/*
-			 * Mark it as in the process of being reaped.
-			 */
-			p->exit_state = EXIT_DEAD;
+		if (__ptrace_detach(parent, p))
 			list_add(&p->ptrace_entry, dead);
-		}
 	}
 }
 
-- 
cgit v1.1


From 4576145c1ecdaaea9ef8976a48335206aa1ebf91 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:14 -0700
Subject: ptrace: fix possible zombie leak on PTRACE_DETACH

When ptrace_detach() takes tasklist, the tracee can be SIGKILL'ed.  If it
has already passed exit_notify() we can leak a zombie, because a) ptracing
disables the auto-reaping logic, and b) ->real_parent was not notified
about the child's death.

ptrace_detach() should follow the ptrace_exit's logic, change the code
accordingly.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Tested-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f62a568..ee553b6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -237,6 +237,8 @@ out:
 
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
+	int dead = 0;
+
 	if (!valid_signal(data))
 		return -EIO;
 
@@ -244,18 +246,21 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	ptrace_disable(child);
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
-	/* protect against de_thread()->release_task() */
 	write_lock_irq(&tasklist_lock);
+	/* protect against de_thread()->release_task() */
 	if (child->ptrace) {
 		child->exit_code = data;
 
-		__ptrace_unlink(child);
+		dead = __ptrace_detach(current, child);
 
 		if (!child->exit_state)
 			wake_up_process(child);
 	}
 	write_unlock_irq(&tasklist_lock);
 
+	if (unlikely(dead))
+		release_task(child);
+
 	return 0;
 }
 
-- 
cgit v1.1


From 0a967a044a777e8b9c739120927114ddc0094298 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:15 -0700
Subject: reparent_thread: don't call kill_orphaned_pgrp() if task_detached()

If task_detached(p) == T, then either

  a) p is not the main thread, we will find the group leader on the
     ->children list.

or

  b) p is the group leader but its ->exit_state = EXIT_DEAD.  This
     can only happen when the last sub-thread has died, but in that case
     that thread has already called kill_orphaned_pgrp() from
     exit_notify().

In both cases kill_orphaned_pgrp() looks bogus.

Move the task_detached() check up and simplify the code, this is also
right from the "common sense" pov: we should do nothing with the detached
childs, except move them to the new parent's ->children list.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 576eae2..405e687 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -818,6 +818,8 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 
 	list_move_tail(&p->sibling, &p->real_parent->children);
 
+	if (task_detached(p))
+		return;
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
@@ -825,15 +827,13 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 		return;
 
 	/* We don't want people slaying init.  */
-	if (!task_detached(p))
-		p->exit_signal = SIGCHLD;
+	p->exit_signal = SIGCHLD;
 
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
 	if (!ptrace_reparented(p) &&
-	    p->exit_state == EXIT_ZOMBIE &&
-	    !task_detached(p) && thread_group_empty(p))
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
 	kill_orphaned_pgrp(p, father);
-- 
cgit v1.1


From b1442b055c154699a6a2c436f3352f71b6beede3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:16 -0700
Subject: reparent_thread: fix the "is it traced" check

reparent_thread() uses ptrace_reparented() to check whether this thread is
ptraced, in that case we should not notify the new parent.

But ptrace_reparented() is not exactly correct when the reparented thread
is traced by /sbin/init, because forget_original_parent() has already
changed ->real_parent.

Currently, the only problem is the false notification.  But with the next
patch the kernel crash in this (yes, pathological) case.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 405e687..5be0a40 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -832,7 +832,7 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!ptrace_reparented(p) &&
+	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
-- 
cgit v1.1


From 7f5d3652d469cdf9eb2365dfea7ce3fb9e1409cc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:17 -0700
Subject: reparent_thread: fix a zombie leak if /sbin/init ignores SIGCHLD

If /sbin/init ignores SIGCHLD and we re-parent a zombie, it is leaked.
reparent_thread() does do_notify_parent() which sets ->exit_signal = -1 in
this case.  This means that nobody except us can reap it, the detached
task is not visible to do_wait().

Change reparent_thread() to return a boolean (like __pthread_detach) to
indicate that the thread is dead and must be released.  Also change
forget_original_parent() to add the child to ptrace_dead list in this
case.

The naming becomes insane, the next patch does the cleanup.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 5be0a40..3e09b7c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -810,8 +810,11 @@ static void ptrace_exit_finish(struct task_struct *parent,
 	}
 }
 
-static void reparent_thread(struct task_struct *p, struct task_struct *father)
+/* Returns nonzero if the child should be released. */
+static int reparent_thread(struct task_struct *p, struct task_struct *father)
 {
+	int dead;
+
 	if (p->pdeath_signal)
 		/* We already hold the tasklist_lock here.  */
 		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
@@ -819,12 +822,12 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	if (task_detached(p))
-		return;
+		return 0;
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
 	if (same_thread_group(p->real_parent, father))
-		return;
+		return 0;
 
 	/* We don't want people slaying init.  */
 	p->exit_signal = SIGCHLD;
@@ -832,11 +835,19 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
+	dead = 0;
 	if (!p->ptrace &&
-	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p))
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		do_notify_parent(p, p->exit_signal);
+		if (task_detached(p)) {
+			p->exit_state = EXIT_DEAD;
+			dead = 1;
+		}
+	}
 
 	kill_orphaned_pgrp(p, father);
+
+	return dead;
 }
 
 /*
@@ -896,7 +907,8 @@ static void forget_original_parent(struct task_struct *father)
 			BUG_ON(p->ptrace);
 			p->parent = p->real_parent;
 		}
-		reparent_thread(p, father);
+		if (reparent_thread(p, father))
+			list_add(&p->ptrace_entry, &ptrace_dead);;
 	}
 
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.1


From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:18 -0700
Subject: forget_original_parent: split out the un-ptrace part

By discussion with Roland.

- Rename ptrace_exit() to exit_ptrace(), and change it to do all the
  necessary work with ->ptraced list by its own.

- Move this code from exit.c to ptrace.c

- Update the comment in ptrace_detach() to explain the rechecking of
  the child->ptrace.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Metzger, Markus T" <markus.t.metzger@intel.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c   | 95 ++++-----------------------------------------------------
 kernel/ptrace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 82 insertions(+), 91 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3e09b7c..506693d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -61,11 +61,6 @@ DEFINE_TRACE(sched_process_wait);
 
 static void exit_mm(struct task_struct * tsk);
 
-static inline int task_detached(struct task_struct *p)
-{
-	return p->exit_signal == -1;
-}
-
 static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
@@ -731,85 +726,6 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-/*
- * Called with irqs disabled, returns true if childs should reap themselves.
- */
-static int ignoring_children(struct sighand_struct *sigh)
-{
-	int ret;
-	spin_lock(&sigh->siglock);
-	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
-	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
-	spin_unlock(&sigh->siglock);
-	return ret;
-}
-
-/* Returns nonzero if the tracee should be released. */
-int __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
-{
-	__ptrace_unlink(p);
-
-	if (p->exit_state != EXIT_ZOMBIE)
-		return 0;
-	/*
-	 * If it's a zombie, our attachedness prevented normal
-	 * parent notification or self-reaping.  Do notification
-	 * now if it would have happened earlier.  If it should
-	 * reap itself we return true.
-	 *
-	 * If it's our own child, there is no notification to do.
-	 * But if our normal children self-reap, then this child
-	 * was prevented by ptrace and we must reap it now.
-	 */
-	if (!task_detached(p) && thread_group_empty(p)) {
-		if (!same_thread_group(p->real_parent, tracer))
-			do_notify_parent(p, p->exit_signal);
-		else if (ignoring_children(tracer->sighand))
-			p->exit_signal = -1;
-	}
-
-	if (!task_detached(p))
-		return 0;
-
-	/* Mark it as in the process of being reaped. */
-	p->exit_state = EXIT_DEAD;
-	return 1;
-}
-
-/*
- * Detach all tasks we were using ptrace on.
- * Any that need to be release_task'd are put on the @dead list.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
-{
-	struct task_struct *p, *n;
-
-	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
-		if (__ptrace_detach(parent, p))
-			list_add(&p->ptrace_entry, dead);
-	}
-}
-
-/*
- * Finish up exit-time ptrace cleanup.
- *
- * Called without locks.
- */
-static void ptrace_exit_finish(struct task_struct *parent,
-			       struct list_head *dead)
-{
-	struct task_struct *p, *n;
-
-	BUG_ON(!list_empty(&parent->ptraced));
-
-	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
-		list_del_init(&p->ptrace_entry);
-		release_task(p);
-	}
-}
-
 /* Returns nonzero if the child should be released. */
 static int reparent_thread(struct task_struct *p, struct task_struct *father)
 {
@@ -894,12 +810,10 @@ static void forget_original_parent(struct task_struct *father)
 	struct task_struct *p, *n, *reaper;
 	LIST_HEAD(ptrace_dead);
 
+	exit_ptrace(father);
+
 	write_lock_irq(&tasklist_lock);
 	reaper = find_new_reaper(father);
-	/*
-	 * First clean up ptrace if we were using it.
-	 */
-	ptrace_exit(father, &ptrace_dead);
 
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
 		p->real_parent = reaper;
@@ -914,7 +828,10 @@ static void forget_original_parent(struct task_struct *father)
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
 
-	ptrace_exit_finish(father, &ptrace_dead);
+	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
 }
 
 /*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee553b6..f5a9fa5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -235,9 +235,57 @@ out:
 	return retval;
 }
 
+/*
+ * Called with irqs disabled, returns true if childs should reap themselves.
+ */
+static int ignoring_children(struct sighand_struct *sigh)
+{
+	int ret;
+	spin_lock(&sigh->siglock);
+	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
+	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
+	spin_unlock(&sigh->siglock);
+	return ret;
+}
+
+/*
+ * Called with tasklist_lock held for writing.
+ * Unlink a traced task, and clean it up if it was a traced zombie.
+ * Return true if it needs to be reaped with release_task().
+ * (We can't call release_task() here because we already hold tasklist_lock.)
+ *
+ * If it's a zombie, our attachedness prevented normal parent notification
+ * or self-reaping.  Do notification now if it would have happened earlier.
+ * If it should reap itself, return true.
+ *
+ * If it's our own child, there is no notification to do.
+ * But if our normal children self-reap, then this child
+ * was prevented by ptrace and we must reap it now.
+ */
+static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
+{
+	__ptrace_unlink(p);
+
+	if (p->exit_state == EXIT_ZOMBIE) {
+		if (!task_detached(p) && thread_group_empty(p)) {
+			if (!same_thread_group(p->real_parent, tracer))
+				do_notify_parent(p, p->exit_signal);
+			else if (ignoring_children(tracer->sighand))
+				p->exit_signal = -1;
+		}
+		if (task_detached(p)) {
+			/* Mark it as in the process of being reaped. */
+			p->exit_state = EXIT_DEAD;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
-	int dead = 0;
+	bool dead = false;
 
 	if (!valid_signal(data))
 		return -EIO;
@@ -247,7 +295,10 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
 	write_lock_irq(&tasklist_lock);
-	/* protect against de_thread()->release_task() */
+	/*
+	 * This child can be already killed. Make sure de_thread() or
+	 * our sub-thread doing do_wait() didn't do release_task() yet.
+	 */
 	if (child->ptrace) {
 		child->exit_code = data;
 
@@ -264,6 +315,29 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	return 0;
 }
 
+/*
+ * Detach all tasks we were using ptrace on.
+ */
+void exit_ptrace(struct task_struct *tracer)
+{
+	struct task_struct *p, *n;
+	LIST_HEAD(ptrace_dead);
+
+	write_lock_irq(&tasklist_lock);
+	list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+		if (__ptrace_detach(tracer, p))
+			list_add(&p->ptrace_entry, &ptrace_dead);
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	BUG_ON(!list_empty(&tracer->ptraced));
+
+	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
+}
+
 int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
 {
 	int copied = 0;
-- 
cgit v1.1


From 5dfc80be73dd0c212d2e6dd8dbf5afa07e680bbe Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:19 -0700
Subject: forget_original_parent: do not abuse child->ptrace_entry

By discussion with Roland.

- Use ->sibling instead of ->ptrace_entry to chain the need to be
  release_task'd childs. Nobody else can use ->sibling, this task
  is EXIT_DEAD and nobody can find it on its own list.

- rename ptrace_dead to dead_childs.

- Now that we don't have the "parallel" untrace code, change back
  reparent_thread() to return void, pass dead_childs as an argument.

Actually, I don't understand why do we notify /sbin/init when we
reparent a zombie, probably it is better to reap it unconditionally.

[akpm@linux-foundation.org: s/childs/children/]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Metzger, Markus T" <markus.t.metzger@intel.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 87 ++++++++++++++++++++++++++++-------------------------------
 1 file changed, 41 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 506693d..029415d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -726,46 +726,6 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-/* Returns nonzero if the child should be released. */
-static int reparent_thread(struct task_struct *p, struct task_struct *father)
-{
-	int dead;
-
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
-
-	list_move_tail(&p->sibling, &p->real_parent->children);
-
-	if (task_detached(p))
-		return 0;
-	/* If this is a threaded reparent there is no need to
-	 * notify anyone anything has happened.
-	 */
-	if (same_thread_group(p->real_parent, father))
-		return 0;
-
-	/* We don't want people slaying init.  */
-	p->exit_signal = SIGCHLD;
-
-	/* If we'd notified the old parent about this child's death,
-	 * also notify the new parent.
-	 */
-	dead = 0;
-	if (!p->ptrace &&
-	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
-		do_notify_parent(p, p->exit_signal);
-		if (task_detached(p)) {
-			p->exit_state = EXIT_DEAD;
-			dead = 1;
-		}
-	}
-
-	kill_orphaned_pgrp(p, father);
-
-	return dead;
-}
-
 /*
  * When we die, we re-parent all our children.
  * Try to give them to another thread in our thread
@@ -805,10 +765,46 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 	return pid_ns->child_reaper;
 }
 
+/*
+* Any that need to be release_task'd are put on the @dead list.
+ */
+static void reparent_thread(struct task_struct *father, struct task_struct *p,
+				struct list_head *dead)
+{
+	if (p->pdeath_signal)
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
+
+	if (task_detached(p))
+		return;
+	/*
+	 * If this is a threaded reparent there is no need to
+	 * notify anyone anything has happened.
+	 */
+	if (same_thread_group(p->real_parent, father))
+		return;
+
+	/* We don't want people slaying init.  */
+	p->exit_signal = SIGCHLD;
+
+	/* If it has exited notify the new parent about this child's death. */
+	if (!p->ptrace &&
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+		do_notify_parent(p, p->exit_signal);
+		if (task_detached(p)) {
+			p->exit_state = EXIT_DEAD;
+			list_move_tail(&p->sibling, dead);
+		}
+	}
+
+	kill_orphaned_pgrp(p, father);
+}
+
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper;
-	LIST_HEAD(ptrace_dead);
+	LIST_HEAD(dead_children);
 
 	exit_ptrace(father);
 
@@ -821,15 +817,14 @@ static void forget_original_parent(struct task_struct *father)
 			BUG_ON(p->ptrace);
 			p->parent = p->real_parent;
 		}
-		if (reparent_thread(p, father))
-			list_add(&p->ptrace_entry, &ptrace_dead);;
+		reparent_thread(father, p, &dead_children);
 	}
-
 	write_unlock_irq(&tasklist_lock);
+
 	BUG_ON(!list_empty(&father->children));
 
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
-		list_del_init(&p->ptrace_entry);
+	list_for_each_entry_safe(p, n, &dead_children, sibling) {
+		list_del_init(&p->sibling);
 		release_task(p);
 	}
 }
-- 
cgit v1.1


From 95a3540da9c81a5987be810e1d9a83640a366bd5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:21 -0700
Subject: ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic

Another ancient bug. Consider this trivial test-case,

	int main(void)
	{
		int pid = fork();

		if (pid) {
			ptrace(PTRACE_ATTACH, pid, NULL, NULL);
			wait(NULL);
			ptrace(PTRACE_DETACH, pid, NULL, NULL);
		} else {
			pause();
			printf("WE HAVE A KERNEL BUG!!!\n");
		}

		return 0;
	}

the child must not "escape" for sys_pause(), but it can and this was seen
in practice.

This is because ptrace_detach does:

	if (!child->exit_state)
		wake_up_process(child);

this wakeup can happen after this child has already restarted sys_pause(),
because it gets another wakeup from ptrace_untrace().

With or without this patch, perhaps sys_pause() needs a fix.  But this
wakeup also breaks the SIGNAL_STOP_STOPPED logic in ptrace_untrace().

Remove this wakeup.  The caller saw this task in TASK_TRACED state, and
unless it was SIGKILL'ed in between __ptrace_unlink()->ptrace_untrace()
should handle this case correctly.  If it was SIGKILL'ed, we don't need to
wakup the dying tracee too.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f5a9fa5..296e810 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -301,11 +301,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	 */
 	if (child->ptrace) {
 		child->exit_code = data;
-
 		dead = __ptrace_detach(current, child);
-
-		if (!child->exit_state)
-			wake_up_process(child);
 	}
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v1.1


From 1ee1184485df9c9a3503d3a684b911fb7c73d259 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:23 -0700
Subject: ptrace_untrace: fix the SIGNAL_STOP_STOPPED check

This bug is ancient too. ptrace_untrace() must not resume the task
if the group stop in progress, we should set TASK_STOPPED instead.

Unfortunately, we still have problems here:

	- if the process/thread was traced, SIGNAL_STOP_STOPPED
	  does not necessary means this thread group is stopped.

	- ptrace breaks the bookkeeping of ->group_stop_count.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 296e810..5105f5a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
 {
 	spin_lock(&child->sighand->siglock);
 	if (task_is_traced(child)) {
-		if (child->signal->flags & SIGNAL_STOP_STOPPED) {
+		/*
+		 * If the group stop is completed or in progress,
+		 * this thread was already counted as stopped.
+		 */
+		if (child->signal->flags & SIGNAL_STOP_STOPPED ||
+		    child->signal->group_stop_count)
 			__set_task_state(child, TASK_STOPPED);
-		} else {
+		else
 			signal_wake_up(child, 1);
-		}
 	}
 	spin_unlock(&child->sighand->siglock);
 }
-- 
cgit v1.1


From 2355b70fd59cb5be7de2052a9edeee7afb7ff099 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:58:24 -0700
Subject: workqueue: avoid recursion in run_workqueue()

1) lockdep will complain when run_workqueue() performs recursion.

2) The recursive implementation of run_workqueue() means that
   flush_workqueue() and its documentation are inconsistent.  This may
   hide deadlocks and other bugs.

3) The recursion in run_workqueue() will poison cwq->current_work, but
   flush_work() and __cancel_work_timer(), etcetera need a reliable
   cwq->current_work.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9aedd9f..32f8e0d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,8 +48,6 @@ struct cpu_workqueue_struct {
 
 	struct workqueue_struct *wq;
 	struct task_struct *thread;
-
-	int run_depth;		/* Detect run_workqueue() recursion depth */
 } ____cacheline_aligned;
 
 /*
@@ -262,13 +260,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
-	cwq->run_depth++;
-	if (cwq->run_depth > 3) {
-		/* morton gets to eat his hat */
-		printk("%s: recursion depth exceeded: %d\n",
-			__func__, cwq->run_depth);
-		dump_stack();
-	}
 	while (!list_empty(&cwq->worklist)) {
 		struct work_struct *work = list_entry(cwq->worklist.next,
 						struct work_struct, entry);
@@ -311,7 +302,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		spin_lock_irq(&cwq->lock);
 		cwq->current_work = NULL;
 	}
-	cwq->run_depth--;
 	spin_unlock_irq(&cwq->lock);
 }
 
@@ -368,29 +358,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 {
-	int active;
+	int active = 0;
+	struct wq_barrier barr;
 
-	if (cwq->thread == current) {
-		/*
-		 * Probably keventd trying to flush its own queue. So simply run
-		 * it by hand rather than deadlocking.
-		 */
-		run_workqueue(cwq);
-		active = 1;
-	} else {
-		struct wq_barrier barr;
+	WARN_ON(cwq->thread == current);
 
-		active = 0;
-		spin_lock_irq(&cwq->lock);
-		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-			insert_wq_barrier(cwq, &barr, &cwq->worklist);
-			active = 1;
-		}
-		spin_unlock_irq(&cwq->lock);
-
-		if (active)
-			wait_for_completion(&barr.done);
+	spin_lock_irq(&cwq->lock);
+	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+		insert_wq_barrier(cwq, &barr, &cwq->worklist);
+		active = 1;
 	}
+	spin_unlock_irq(&cwq->lock);
+
+	if (active)
+		wait_for_completion(&barr.done);
 
 	return active;
 }
-- 
cgit v1.1


From 11dea1900931ac73184b2f5163a13d24a4e572ea Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 2 Apr 2009 16:58:27 -0700
Subject: proc_sysctl: use CONFIG_PROC_SYSCTL around ipc and utsname
 proc_handlers

As pointed out by Cedric Le Goater (in response to Alexey's original
comment wrt mqns), ipc_sysctl.c and utsname_sysctl.c are using
CONFIG_PROC_FS, not CONFIG_PROC_SYSCTL, to determine whether to define
the proc_handlers.  Change that.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Acked-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/utsname_sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b35..92359cc 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
 		up_write(&uts_sem);
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 /*
  *	Special case of dostring for the UTS structure. This has locks
  *	to observe. Should this be in kernel/sys.c ????
-- 
cgit v1.1


From 8e654fba4a376f436bdfe361fc5cdbc87ac09b35 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 2 Apr 2009 16:58:33 -0700
Subject: sysctl: fix suid_dumpable and lease-break-time sysctls

Arne de Bruijn points out that commit
76fdbb25f963de5dc1e308325f0578a2f92b1c2d ("coredump masking: bound
suid_dumpable sysctl") mistakenly limits lease-break-time instead of
suid_dumpable.

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Reported-by: Arne de Bruijn <kernelbt@arbruijn.dds.nl>
Cc: Kawai, Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2e490a3..5ec4543 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,12 +95,9 @@ static int sixty = 60;
 static int neg_one = -1;
 #endif
 
-#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
-static int two = 2;
-#endif
-
 static int zero;
 static int one = 1;
+static int two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
@@ -1373,10 +1370,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &lease_break_time,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-		.extra2		= &two,
+		.proc_handler	= &proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_AIO
@@ -1417,7 +1411,10 @@ static struct ctl_table fs_table[] = {
 		.data		= &suid_dumpable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 	{
-- 
cgit v1.1


From 2ae448efc87df6d328f5835969076c7f9fce59c3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:36 -0700
Subject: pids: improve get_task_pid() to fix the unsafe
 sys_wait4()->task_pgrp()

sys_wait4() does get_pid(task_pgrp(current)), this is not safe.  We can
add rcu lock/unlock around, but we already have get_task_pid() which can
be improved to handle the special pids in more reliable manner.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 kernel/pid.c  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 029415d..384f09c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1737,7 +1737,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		pid = find_get_pid(-upid);
 	} else if (upid == 0) {
 		type = PIDTYPE_PGID;
-		pid = get_pid(task_pgrp(current));
+		pid = get_task_pid(current, PIDTYPE_PGID);
 	} else /* upid > 0 */ {
 		type = PIDTYPE_PID;
 		pid = find_get_pid(upid);
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586f..6628abc 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
 	struct pid *pid;
 	rcu_read_lock();
+	if (type != PIDTYPE_PID)
+		task = task->group_leader;
 	pid = get_pid(task->pids[type].pid);
 	rcu_read_unlock();
 	return pid;
-- 
cgit v1.1


From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:38 -0700
Subject: pids: refactor vnr/nr_ns helpers to make them safe

Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.

task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.

As for "special" pids, vnr/nr_ns helpers always need rcu.  However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.

And almost every helper has a callsite which needs a fix.

Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".

This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.

After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 6628abc..b2e5f78 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -452,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
 }
 EXPORT_SYMBOL_GPL(pid_vnr);
 
-pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+			struct pid_namespace *ns)
 {
-	return pid_nr_ns(task_pid(tsk), ns);
+	pid_t nr = 0;
+
+	rcu_read_lock();
+	if (!ns)
+		ns = current->nsproxy->pid_ns;
+	if (likely(pid_alive(task))) {
+		if (type != PIDTYPE_PID)
+			task = task->group_leader;
+		nr = pid_nr_ns(task->pids[type].pid, ns);
+	}
+	rcu_read_unlock();
+
+	return nr;
 }
-EXPORT_SYMBOL(task_pid_nr_ns);
+EXPORT_SYMBOL(__task_pid_nr_ns);
 
 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
@@ -464,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 }
 EXPORT_SYMBOL(task_tgid_nr_ns);
 
-pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return pid_nr_ns(task_pgrp(tsk), ns);
-}
-EXPORT_SYMBOL(task_pgrp_nr_ns);
-
-pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return pid_nr_ns(task_session(tsk), ns);
-}
-EXPORT_SYMBOL(task_session_nr_ns);
-
 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
 {
 	return ns_of_pid(task_pid(tsk));
-- 
cgit v1.1


From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:39 -0700
Subject: pids: kill signal_struct-> __pgrp/__session and friends

We are wasting 2 words in signal_struct without any reason to implement
task_pgrp_nr() and task_session_nr().

task_session_nr() has no callers since
2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it.

task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and
fs/coda.

This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills
__pgrp/__session and the related helpers.

The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense
anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Alan Cox <number6@the-village.bc.nu>		[tty parts]
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 10 +++-------
 kernel/fork.c |  2 --
 kernel/sys.c  |  4 +---
 3 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 384f09c..3bec141 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void)
 void __set_special_pids(struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
-	pid_t nr = pid_nr(pid);
 
-	if (task_session(curr) != pid) {
+	if (task_session(curr) != pid)
 		change_pid(curr, PIDTYPE_SID, pid);
-		set_task_session(curr, nr);
-	}
-	if (task_pgrp(curr) != pid) {
+
+	if (task_pgrp(curr) != pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
-		set_task_pgrp(curr, nr);
-	}
 }
 
 static void set_special_pids(struct pid *pid)
diff --git a/kernel/fork.c b/kernel/fork.c
index adbea16..f744582 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			p->signal->leader_pid = pid;
 			tty_kref_put(p->signal->tty);
 			p->signal->tty = tty_kref_get(current->signal->tty);
-			set_task_pgrp(p, task_pgrp_nr(current));
-			set_task_session(p, task_session_nr(current));
 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e..742cefa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 	if (err)
 		goto out;
 
-	if (task_pgrp(p) != pgrp) {
+	if (task_pgrp(p) != pgrp)
 		change_pid(p, PIDTYPE_PGID, pgrp);
-		set_task_pgrp(p, pid_nr(pgrp));
-	}
 
 	err = 0;
 out:
-- 
cgit v1.1


From 04d491ab2a53008a1aa98ac09561768c7f3adda3 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 2 Apr 2009 16:58:57 -0700
Subject: kexec: add dmesg log symbols to /proc/vmcoreinfo lists

It would be nice to be able to extract the dmesg log from a vmcore file
without needing to keep the debug symbols for the running kernel handy all
the time.  We have a facility to do this in /proc/vmcore.  This patch adds
the log_buf and log_end symbols to the vmcoreinfo area so that tools (like
makedumpfile) can easily extract the dmesg logs from a vmcore image.

[akpm@linux-foundation.org: several fixes and cleanups]
[akpm@linux-foundation.org: fix unused log_buf_kexec_setup()]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Simon Horman <horms@verge.net.au>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c  |  1 +
 kernel/printk.c | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 93eed85..589832a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(list_head, prev);
 	VMCOREINFO_OFFSET(vm_struct, addr);
 	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+	log_buf_kexec_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
 	VMCOREINFO_NUMBER(PG_lru);
diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0..a5f61a9 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
+#include <linux/kexec.h>
 
 #include <asm/uaccess.h>
 
@@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
 static int log_buf_len = __LOG_BUF_LEN;
 static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
 
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcoreinfo
+ *
+ * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
+ * obtain access to symbols that are otherwise very difficult to locate.  These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+	VMCOREINFO_SYMBOL(log_buf);
+	VMCOREINFO_SYMBOL(log_end);
+	VMCOREINFO_SYMBOL(log_buf_len);
+	VMCOREINFO_SYMBOL(logged_chars);
+}
+#endif
+
 static int __init log_buf_len_setup(char *str)
 {
 	unsigned size = memparse(str, &str);
-- 
cgit v1.1


From edb79a213223488735fae1d408f4c136e9ed25d6 Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Thu, 2 Apr 2009 16:58:58 -0700
Subject: kexec: vmcoreinfo_data[] can become static

The vmcoreinfo_data[] array is not used outside of kernel/kexec.c, and
can therefore become static. This patch adds the relevant keyword to the
definition of the array.

Noticed by sparse.

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 589832a..5a758c6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
 note_buf_t* crash_notes;
 
 /* vmcoreinfo stuff */
-unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-- 
cgit v1.1


From 2c53d9109f077900e140edb8b766132ad93b81cc Mon Sep 17 00:00:00 2001
From: Aravind Srinivasan <raa.aars@gmail.com>
Date: Thu, 2 Apr 2009 16:58:59 -0700
Subject: relay: fix for possible loss/corruption of produced subbufs

Fix possible loss/corruption of produced subbufs in
relay_subbufs_consumed().

When buf->subbufs_produced wraps around after UINT_MAX and
buf->subbufs_consumed is still < UINT_MAX, the condition

	if (buf->subbufs_consumed > buf->subbufs_produced)

will be true even for certain valid values of subbufs_consumed.  This may
lead to loss or corruption of produced subbufs.

Signed-off-by: Aravind Srinivasan <raa.aars@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 8f2179c..e92db8c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -797,13 +797,15 @@ void relay_subbufs_consumed(struct rchan *chan,
 	if (!chan)
 		return;
 
-	if (cpu >= NR_CPUS || !chan->buf[cpu])
+	if (cpu >= NR_CPUS || !chan->buf[cpu] ||
+					subbufs_consumed > chan->n_subbufs)
 		return;
 
 	buf = chan->buf[cpu];
-	buf->subbufs_consumed += subbufs_consumed;
-	if (buf->subbufs_consumed > buf->subbufs_produced)
+	if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
 		buf->subbufs_consumed = buf->subbufs_produced;
+	else
+		buf->subbufs_consumed += subbufs_consumed;
 }
 EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
 
-- 
cgit v1.1


From e8c158bb313c1df421eab7dc4299cd39cbbf5895 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Thu, 2 Apr 2009 16:59:45 -0700
Subject: Factor out #ifdefs from kernel/spinlock.c to LOCK_CONTENDED_FLAGS

SGI has observed that on large systems, interrupts are not serviced for a
long period of time when waiting for a rwlock.  The following patch series
re-enables irqs while waiting for the lock, resembling the code which is
already there for spinlocks.

I only made the ia64 version, because the patch adds some overhead to the
fast path.  I assume there is currently no demand to have this for other
architectures, because the systems are not so large.  Of course, the
possibility to implement raw_{read|write}_lock_flags for any architecture
is still there.

This patch:

The new macro LOCK_CONTENDED_FLAGS expands to the correct implementation
depending on the config options, so that IRQ's are re-enabled when
possible, but they remain disabled if CONFIG_LOCKDEP is set.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Signed-off-by: Robin Holt <holt@sgi.com>
Cc: <linux-arch@vger.kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/spinlock.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab207..7283c6d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -299,16 +299,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
 	local_irq_save(flags);
 	preempt_disable();
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-	/*
-	 * On lockdep we dont want the hand-coded irq-enable of
-	 * _raw_spin_lock_flags() code, because lockdep assumes
-	 * that interrupts are not re-enabled during lock-acquire:
-	 */
-#ifdef CONFIG_LOCKDEP
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-#else
-	_raw_spin_lock_flags(lock, &flags);
-#endif
+	LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
+				_raw_spin_lock_flags, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_spin_lock_irqsave_nested);
-- 
cgit v1.1


From f5f7eac41db827a47b2163330eecd7bb55ae9f12 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Thu, 2 Apr 2009 16:59:46 -0700
Subject: Allow rwlocks to re-enable interrupts

Pass the original flags to rwlock arch-code, so that it can re-enable
interrupts if implemented for that architecture.

Initially, make __raw_read_lock_flags and __raw_write_lock_flags stubs
which just do the same thing as non-flags variants.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Signed-off-by: Robin Holt <holt@sgi.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <linux-arch@vger.kernel.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/spinlock.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7283c6d..7932653 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+	LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
+			     _raw_read_lock_flags, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+	LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
+			     _raw_write_lock_flags, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_write_lock_irqsave);
-- 
cgit v1.1


From b1f77b0581b8fd837acb4a973f7d5496cae6efee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 03:20:49 +0100
Subject: kmemtrace, rcu: fix linux/rcutree.h and linux/rcuclassic.h
 dependencies

Impact: build fix for all non-x86 architectures

We want to remove percpu.h from rcuclassic.h/rcutree.h (for upcoming
kmemtrace changes) but that would break the DECLARE_PER_CPU based
declarations in these files.

Move the quiescent counter management functions to their respective
RCU implementation .c files - they were slightly above the inlining
limit anyway.

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1237898630.25315.83.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 23 +++++++++++++++++++++--
 kernel/rcutree.c    | 28 ++++++++++++++++++++++++----
 2 files changed, 45 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 654c640..0f2b0b3 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
 	.cpumask = CPU_BITS_NONE,
 };
+
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cpumask = CPU_BITS_NONE,
 };
 
-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+}
+
+void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+}
 
 static int blimit = 10;
 static int qhimark = 10000;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 97ce315..a2015ed 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -72,11 +72,31 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 	.n_force_qs_ngp = 0, \
 }
 
-struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
 
 #ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-- 
cgit v1.1


From 6258c4fb59e77d748f7efc2c137ad420372edd07 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Mar 2009 16:42:24 +0100
Subject: kmemtrace, rcu: fix rcu_tree_trace.c data structure dependencies

Impact: cleanup

We want to remove rcutree internals from the public rcutree.h file for
upcoming kmemtrace changes - but kernel/rcutree_trace.c depends on them.

Introduce kernel/rcutree.h for internal definitions. (Probably all
the other data types from include/linux/rcutree.h could be
moved here too - except rcu_data.)

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1237898630.25315.83.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c       |  8 ++++----
 kernel/rcutree.h       | 10 ++++++++++
 kernel/rcutree_trace.c |  2 ++
 3 files changed, 16 insertions(+), 4 deletions(-)
 create mode 100644 kernel/rcutree.h

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a2015ed..7f32669 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -72,11 +72,11 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 	.n_force_qs_ngp = 0, \
 }
 
-static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
 
-static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
-static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 /*
  * Increment the quiescent state counter.
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 0000000..5e872bb
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
+
+/*
+ * RCU implementation internal declarations:
+ */
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e83..4ee954f 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,8 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#include "rcutree.h"
+
 static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 {
 	if (!rdp->beenonline)
-- 
cgit v1.1


From a979241c532f07c201fe94e0a632107268f02578 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Mar 2009 16:42:24 +0100
Subject: kmemtrace, rcu: fix rcupreempt.c data structure dependencies

Impact: cleanup

We want to remove percpu.h from rcupreempt.h, but if we do that
the percpu primitives there wont build anymore. Move them to the
.c file instead.

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1237898630.25315.83.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5d59e85..ce97a4d 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
 	wait_queue_head_t sched_wq;	/* Place for rcu_sched to sleep. */
 };
 
+struct rcu_dyntick_sched {
+	int dynticks;
+	int dynticks_snap;
+	int sched_qs;
+	int sched_qs_snap;
+	int sched_dynticks_snap;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+	.dynticks = 1,
+};
+
+void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_qs++;
+}
+
+#ifdef CONFIG_NO_HZ
+
+void rcu_enter_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	__get_cpu_var(rcu_dyntick_sched).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
+}
+
+void rcu_exit_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	__get_cpu_var(rcu_dyntick_sched).dynticks++;
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
+				&rs);
+}
+
+#endif /* CONFIG_NO_HZ */
+
+
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 	.completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
 	}
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
-	.dynticks = 1,
-};
-
 #ifdef CONFIG_NO_HZ
 static DEFINE_PER_CPU(int, rcu_update_flag);
 
-- 
cgit v1.1


From ca2b84cb3c4a0d4d2143b46ec072cdff5d1b3b87 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:24 +0200
Subject: kmemtrace: use tracepoints

kmemtrace now uses tracepoints instead of markers. We no longer need to
use format specifiers to pass arguments.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
[ folded: Use the new TP_PROTO and TP_ARGS to fix the build.     ]
[ folded: fix build when CONFIG_KMEMTRACE is disabled.           ]
[ folded: define tracepoints when CONFIG_TRACEPOINTS is enabled. ]
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <ae61c0f37156db8ec8dc0d5778018edde60a92e3.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 206 +++++++++++++++++++++++++++++++++--------------
 kernel/trace/trace.h     |   6 ++
 2 files changed, 151 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae201b3..4f7b5db 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/tracepoint.h>
 #include <trace/kmemtrace.h>
 
 #include "trace.h"
@@ -29,10 +30,150 @@ static struct tracer_flags kmem_tracer_flags = {
 	.opts = kmem_opts
 };
 
-
-static bool kmem_tracing_enabled __read_mostly;
 static struct trace_array *kmemtrace_array;
 
+/* Trace allocations */
+static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
+				   unsigned long call_site,
+				   const void *ptr,
+				   size_t bytes_req,
+				   size_t bytes_alloc,
+				   gfp_t gfp_flags,
+				   int node)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_alloc_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_ALLOC;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+	entry->bytes_req = bytes_req;
+	entry->bytes_alloc = bytes_alloc;
+	entry->gfp_flags = gfp_flags;
+	entry->node	=	node;
+
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+	trace_wake_up();
+}
+
+static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
+				  unsigned long call_site,
+				  const void *ptr)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_free_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_FREE;
+	entry->type_id	= type_id;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+	trace_wake_up();
+}
+
+static void kmemtrace_kmalloc(unsigned long call_site,
+			      const void *ptr,
+			      size_t bytes_req,
+			      size_t bytes_alloc,
+			      gfp_t gfp_flags)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+				       const void *ptr,
+				       size_t bytes_req,
+				       size_t bytes_alloc,
+				       gfp_t gfp_flags)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmalloc_node(unsigned long call_site,
+				   const void *ptr,
+				   size_t bytes_req,
+				   size_t bytes_alloc,
+				   gfp_t gfp_flags,
+				   int node)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+					    const void *ptr,
+					    size_t bytes_req,
+					    size_t bytes_alloc,
+					    gfp_t gfp_flags,
+					    int node)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+{
+	kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
+}
+
+static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+{
+	kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
+}
+
+static int kmemtrace_start_probes(void)
+{
+	int err;
+
+	err = register_trace_kmalloc(kmemtrace_kmalloc);
+	if (err)
+		return err;
+	err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+	if (err)
+		return err;
+	err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+	if (err)
+		return err;
+	err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+	if (err)
+		return err;
+	err = register_trace_kfree(kmemtrace_kfree);
+	if (err)
+		return err;
+	err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+
+	return err;
+}
+
+static void kmemtrace_stop_probes(void)
+{
+	unregister_trace_kmalloc(kmemtrace_kmalloc);
+	unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+	unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
+	unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+	unregister_trace_kfree(kmemtrace_kfree);
+	unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+}
+
 static int kmem_trace_init(struct trace_array *tr)
 {
 	int cpu;
@@ -41,14 +182,14 @@ static int kmem_trace_init(struct trace_array *tr)
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 
-	kmem_tracing_enabled = true;
+	kmemtrace_start_probes();
 
 	return 0;
 }
 
 static void kmem_trace_reset(struct trace_array *tr)
 {
-	kmem_tracing_enabled = false;
+	kmemtrace_stop_probes();
 }
 
 static void kmemtrace_headers(struct seq_file *s)
@@ -260,63 +401,6 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 	}
 }
 
-/* Trace allocations */
-void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
-			     unsigned long call_site,
-			     const void *ptr,
-			     size_t bytes_req,
-			     size_t bytes_alloc,
-			     gfp_t gfp_flags,
-			     int node)
-{
-	struct ring_buffer_event *event;
-	struct kmemtrace_alloc_entry *entry;
-	struct trace_array *tr = kmemtrace_array;
-
-	if (!kmem_tracing_enabled)
-		return;
-
-	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-
-	entry->call_site = call_site;
-	entry->ptr = ptr;
-	entry->bytes_req = bytes_req;
-	entry->bytes_alloc = bytes_alloc;
-	entry->gfp_flags = gfp_flags;
-	entry->node	=	node;
-
-	trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
-
-void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
-		       unsigned long call_site,
-		       const void *ptr)
-{
-	struct ring_buffer_event *event;
-	struct kmemtrace_free_entry *entry;
-	struct trace_array *tr = kmemtrace_array;
-
-	if (!kmem_tracing_enabled)
-		return;
-
-	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	entry->type_id	= type_id;
-	entry->call_site = call_site;
-	entry->ptr = ptr;
-
-	trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-EXPORT_SYMBOL(kmemtrace_mark_free);
-
 static struct tracer kmem_tracer __read_mostly = {
 	.name		= "kmemtrace",
 	.init		= kmem_trace_init,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb0ce3f..cbc168f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -182,6 +182,12 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
+enum kmemtrace_type_id {
+	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
+	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
+	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
+};
+
 struct kmemtrace_alloc_entry {
 	struct trace_entry	ent;
 	enum kmemtrace_type_id type_id;
-- 
cgit v1.1


From da2635a9854423b4aa3a5f0e4e6efcc39ac99004 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:25 +0200
Subject: kmemtrace: kmemtrace_alloc() must fill type_id

Impact: fix trace output

kmemtrace_alloc() was not filling type_id, which allowed garbage to make
it into tracing data.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
LKML-Reference: <284dba2732a144849d5aa82258fe0de2ad8dcb0b.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 4f7b5db..ae259f0 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -52,6 +52,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 
 	entry->ent.type = TRACE_KMEM_ALLOC;
+	entry->type_id = type_id;
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 	entry->bytes_req = bytes_req;
-- 
cgit v1.1


From 42af9054c0eeed09ec58d13ec8bf52d225ebcfcc Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:26 +0200
Subject: kmemtrace: restore original tracing data binary format, improve ABI

When kmemtrace was ported to ftrace, the marker strings were taken as
an indication of how the traced data was being exposed to the userspace.
However, the actual format had been binary, not text.

This restores the original binary format, while also adding an origin CPU
field (since ftrace doesn't expose the data per-CPU to userspace), and
re-adding the timestamp field. It also drops arch-independent field
sizing where it didn't make sense, so pointers won't always be 64 bits
wide like they used to.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
LKML-Reference: <161be9ca8a27b432c4a6ab79f47788c4521652ae.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 82 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae259f0..d8c2d0c 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -208,47 +208,81 @@ static void kmemtrace_headers(struct seq_file *s)
 }
 
 /*
- * The two following functions give the original output from kmemtrace,
- * or something close to....perhaps they need some missing things
+ * The following functions give the original output from kmemtrace,
+ * plus the origin CPU, since reordering occurs in-kernel now.
  */
+
+#define KMEMTRACE_USER_ALLOC	0
+#define KMEMTRACE_USER_FREE	1
+
+struct kmemtrace_user_event {
+	u8		event_id;
+	u8		type_id;
+	u16		event_size;
+	u32		cpu;
+	u64		timestamp;
+	unsigned long	call_site;
+	unsigned long	ptr;
+};
+
+struct kmemtrace_user_event_alloc {
+	size_t		bytes_req;
+	size_t		bytes_alloc;
+	unsigned	gfp_flags;
+	int		node;
+};
+
 static enum print_line_t
-kmemtrace_print_alloc_original(struct trace_iterator *iter,
-				struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc_user(struct trace_iterator *iter,
+			   struct kmemtrace_alloc_entry *entry)
 {
 	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	/* Taken from the old linux/kmemtrace.h */
-	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
-	  "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
-	   entry->type_id, entry->call_site, (unsigned long) entry->ptr,
-	   (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
-	   (unsigned long) entry->gfp_flags, entry->node);
+	struct kmemtrace_user_event *ev;
+	struct kmemtrace_user_event_alloc *ev_alloc;
 
-	if (!ret)
+	ev = trace_seq_reserve(s, sizeof(*ev));
+	if (!ev)
+		return TRACE_TYPE_PARTIAL_LINE;
+	ev->event_id = KMEMTRACE_USER_ALLOC;
+	ev->type_id = entry->type_id;
+	ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
+	ev->cpu = iter->cpu;
+	ev->timestamp = iter->ts;
+	ev->call_site = entry->call_site; 
+	ev->ptr = (unsigned long) entry->ptr;
+
+	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
+	if (!ev_alloc)
 		return TRACE_TYPE_PARTIAL_LINE;
+	ev_alloc->bytes_req = entry->bytes_req;
+	ev_alloc->bytes_alloc = entry->bytes_alloc;
+	ev_alloc->gfp_flags = entry->gfp_flags;
+	ev_alloc->node = entry->node;
 
 	return TRACE_TYPE_HANDLED;
 }
 
 static enum print_line_t
-kmemtrace_print_free_original(struct trace_iterator *iter,
-				struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_user(struct trace_iterator *iter,
+			  struct kmemtrace_free_entry *entry)
 {
 	struct trace_seq *s = &iter->seq;
-	int ret;
+	struct kmemtrace_user_event *ev;
 
-	/* Taken from the old linux/kmemtrace.h */
-	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
-	   entry->type_id, entry->call_site, (unsigned long) entry->ptr);
-
-	if (!ret)
+	ev = trace_seq_reserve(s, sizeof(*ev));
+	if (!ev)
 		return TRACE_TYPE_PARTIAL_LINE;
+	ev->event_id = KMEMTRACE_USER_FREE;
+	ev->type_id = entry->type_id;
+	ev->event_size = sizeof(*ev);
+	ev->cpu = iter->cpu;
+	ev->timestamp = iter->ts;
+	ev->call_site = entry->call_site; 
+	ev->ptr = (unsigned long) entry->ptr;
 
 	return TRACE_TYPE_HANDLED;
 }
 
-
 /* The two other following provide a more minimalistic output */
 static enum print_line_t
 kmemtrace_print_alloc_compress(struct trace_iterator *iter,
@@ -385,7 +419,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_alloc_compress(iter, field);
 		else
-			return kmemtrace_print_alloc_original(iter, field);
+			return kmemtrace_print_alloc_user(iter, field);
 	}
 
 	case TRACE_KMEM_FREE: {
@@ -394,7 +428,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_free_compress(iter, field);
 		else
-			return kmemtrace_print_free_original(iter, field);
+			return kmemtrace_print_free_user(iter, field);
 	}
 
 	default:
-- 
cgit v1.1


From c826e3cd0c931d60d548f2468122da570d145556 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 16:14:13 +0100
Subject: kmemtrace: small cleanups

Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
LKML-Reference: <161be9ca8a27b432c4a6ab79f47788c4521652ae.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 128 +++++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index d8c2d0c..5011f4d 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -6,15 +6,16 @@
  * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
  */
 
-#include <linux/dcache.h>
+#include <linux/tracepoint.h>
+#include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/dcache.h>
 #include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/tracepoint.h>
+
 #include <trace/kmemtrace.h>
 
-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_KMEM_OPT_MINIMAL	0x1
@@ -26,8 +27,8 @@ static struct tracer_opt kmem_opts[] = {
 };
 
 static struct tracer_flags kmem_tracer_flags = {
-	.val = 0,
-	.opts = kmem_opts
+	.val			= 0,
+	.opts			= kmem_opts
 };
 
 static struct trace_array *kmemtrace_array;
@@ -41,24 +42,25 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 				   gfp_t gfp_flags,
 				   int node)
 {
-	struct ring_buffer_event *event;
-	struct kmemtrace_alloc_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
+	struct kmemtrace_alloc_entry *entry;
+	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
-	entry	= ring_buffer_event_data(event);
+
+	entry = ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 
-	entry->ent.type = TRACE_KMEM_ALLOC;
-	entry->type_id = type_id;
-	entry->call_site = call_site;
-	entry->ptr = ptr;
-	entry->bytes_req = bytes_req;
-	entry->bytes_alloc = bytes_alloc;
-	entry->gfp_flags = gfp_flags;
-	entry->node	=	node;
+	entry->ent.type		= TRACE_KMEM_ALLOC;
+	entry->type_id		= type_id;
+	entry->call_site	= call_site;
+	entry->ptr		= ptr;
+	entry->bytes_req	= bytes_req;
+	entry->bytes_alloc	= bytes_alloc;
+	entry->gfp_flags	= gfp_flags;
+	entry->node		= node;
 
 	ring_buffer_unlock_commit(tr->buffer, event);
 
@@ -69,9 +71,9 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 				  unsigned long call_site,
 				  const void *ptr)
 {
-	struct ring_buffer_event *event;
-	struct kmemtrace_free_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
+	struct kmemtrace_free_entry *entry;
+	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
@@ -79,10 +81,10 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 
-	entry->ent.type = TRACE_KMEM_FREE;
-	entry->type_id	= type_id;
-	entry->call_site = call_site;
-	entry->ptr = ptr;
+	entry->ent.type		= TRACE_KMEM_FREE;
+	entry->type_id		= type_id;
+	entry->call_site	= call_site;
+	entry->ptr		= ptr;
 
 	ring_buffer_unlock_commit(tr->buffer, event);
 
@@ -216,48 +218,50 @@ static void kmemtrace_headers(struct seq_file *s)
 #define KMEMTRACE_USER_FREE	1
 
 struct kmemtrace_user_event {
-	u8		event_id;
-	u8		type_id;
-	u16		event_size;
-	u32		cpu;
-	u64		timestamp;
-	unsigned long	call_site;
-	unsigned long	ptr;
+	u8			event_id;
+	u8			type_id;
+	u16			event_size;
+	u32			cpu;
+	u64			timestamp;
+	unsigned long		call_site;
+	unsigned long		ptr;
 };
 
 struct kmemtrace_user_event_alloc {
-	size_t		bytes_req;
-	size_t		bytes_alloc;
-	unsigned	gfp_flags;
-	int		node;
+	size_t			bytes_req;
+	size_t			bytes_alloc;
+	unsigned		gfp_flags;
+	int			node;
 };
 
 static enum print_line_t
 kmemtrace_print_alloc_user(struct trace_iterator *iter,
 			   struct kmemtrace_alloc_entry *entry)
 {
+	struct kmemtrace_user_event_alloc *ev_alloc;
 	struct trace_seq *s = &iter->seq;
 	struct kmemtrace_user_event *ev;
-	struct kmemtrace_user_event_alloc *ev_alloc;
 
 	ev = trace_seq_reserve(s, sizeof(*ev));
 	if (!ev)
 		return TRACE_TYPE_PARTIAL_LINE;
-	ev->event_id = KMEMTRACE_USER_ALLOC;
-	ev->type_id = entry->type_id;
-	ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
-	ev->cpu = iter->cpu;
-	ev->timestamp = iter->ts;
-	ev->call_site = entry->call_site; 
-	ev->ptr = (unsigned long) entry->ptr;
+
+	ev->event_id		= KMEMTRACE_USER_ALLOC;
+	ev->type_id		= entry->type_id;
+	ev->event_size		= sizeof(*ev) + sizeof(*ev_alloc);
+	ev->cpu			= iter->cpu;
+	ev->timestamp		= iter->ts;
+	ev->call_site		= entry->call_site;
+	ev->ptr			= (unsigned long)entry->ptr;
 
 	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
 	if (!ev_alloc)
 		return TRACE_TYPE_PARTIAL_LINE;
-	ev_alloc->bytes_req = entry->bytes_req;
-	ev_alloc->bytes_alloc = entry->bytes_alloc;
-	ev_alloc->gfp_flags = entry->gfp_flags;
-	ev_alloc->node = entry->node;
+
+	ev_alloc->bytes_req	= entry->bytes_req;
+	ev_alloc->bytes_alloc	= entry->bytes_alloc;
+	ev_alloc->gfp_flags	= entry->gfp_flags;
+	ev_alloc->node		= entry->node;
 
 	return TRACE_TYPE_HANDLED;
 }
@@ -272,13 +276,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
 	ev = trace_seq_reserve(s, sizeof(*ev));
 	if (!ev)
 		return TRACE_TYPE_PARTIAL_LINE;
-	ev->event_id = KMEMTRACE_USER_FREE;
-	ev->type_id = entry->type_id;
-	ev->event_size = sizeof(*ev);
-	ev->cpu = iter->cpu;
-	ev->timestamp = iter->ts;
-	ev->call_site = entry->call_site; 
-	ev->ptr = (unsigned long) entry->ptr;
+
+	ev->event_id		= KMEMTRACE_USER_FREE;
+	ev->type_id		= entry->type_id;
+	ev->event_size		= sizeof(*ev);
+	ev->cpu			= iter->cpu;
+	ev->timestamp		= iter->ts;
+	ev->call_site		= entry->call_site;
+	ev->ptr			= (unsigned long)entry->ptr;
 
 	return TRACE_TYPE_HANDLED;
 }
@@ -354,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 
 static enum print_line_t
 kmemtrace_print_free_compress(struct trace_iterator *iter,
-				struct kmemtrace_free_entry *entry)
+			      struct kmemtrace_free_entry *entry)
 {
 	struct trace_seq *s = &iter->seq;
 	int ret;
@@ -415,6 +420,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 	switch (entry->type) {
 	case TRACE_KMEM_ALLOC: {
 		struct kmemtrace_alloc_entry *field;
+
 		trace_assign_type(field, entry);
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_alloc_compress(iter, field);
@@ -424,6 +430,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 
 	case TRACE_KMEM_FREE: {
 		struct kmemtrace_free_entry *field;
+
 		trace_assign_type(field, entry);
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_free_compress(iter, field);
@@ -437,12 +444,12 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 }
 
 static struct tracer kmem_tracer __read_mostly = {
-	.name		= "kmemtrace",
-	.init		= kmem_trace_init,
-	.reset		= kmem_trace_reset,
-	.print_line	= kmemtrace_print_line,
-	.print_header = kmemtrace_headers,
-	.flags		= &kmem_tracer_flags
+	.name			= "kmemtrace",
+	.init			= kmem_trace_init,
+	.reset			= kmem_trace_reset,
+	.print_line		= kmemtrace_print_line,
+	.print_header		= kmemtrace_headers,
+	.flags			= &kmem_tracer_flags
 };
 
 void kmemtrace_init(void)
@@ -454,5 +461,4 @@ static int __init init_kmem_tracer(void)
 {
 	return register_tracer(&kmem_tracer);
 }
-
 device_initcall(init_kmem_tracer);
-- 
cgit v1.1


From a4b3ada83d06554d307dd54abdc62b2e5648264a Mon Sep 17 00:00:00 2001
From: Carl Henrik Lunde <chlunde@ping.uio.no>
Date: Fri, 3 Apr 2009 14:27:15 +0200
Subject: blktrace: NUL-terminate user space messages

Impact: fix corrupted blkparse output

Make sure messages from user space are NUL-terminated strings,
otherwise we could dump random memory to the block trace file.

Additionally, I've limited the message to BLK_TN_MAX_MSG-1
characters, because the last character would be stripped by
vscnprintf anyway.

Signed-off-by: Carl Henrik Lunde <chlunde@ping.uio.no>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Alan D. Brunelle" <alan.brunelle@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090403122714.GT5178@kernel.dk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 947c5b3..a400b86 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 	char *msg;
 	struct blk_trace *bt;
 
-	if (count > BLK_TN_MAX_MSG)
+	if (count > BLK_TN_MAX_MSG - 1)
 		return -EINVAL;
 
-	msg = kmalloc(count, GFP_KERNEL);
+	msg = kmalloc(count + 1, GFP_KERNEL);
 	if (msg == NULL)
 		return -ENOMEM;
 
@@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 		return -EFAULT;
 	}
 
+	msg[count] = '\0';
 	bt = filp->private_data;
 	__trace_note_message(bt, "%s", msg);
 	kfree(msg);
-- 
cgit v1.1


From 7635b03adf3d7b84da7649b81efa91e6ebf11b85 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 3 Apr 2009 15:31:34 +0800
Subject: blktrace: small cleanup in blk_msg_write()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Alan D. Brunelle" <alan.brunelle@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49D5BB56.7000807@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a400b86..73d7860 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,7 +327,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 	char *msg;
 	struct blk_trace *bt;
 
-	if (count > BLK_TN_MAX_MSG - 1)
+	if (count >= BLK_TN_MAX_MSG)
 		return -EINVAL;
 
 	msg = kmalloc(count + 1, GFP_KERNEL);
-- 
cgit v1.1


From e2494e1b42ebac402324105d57646489d19e2b01 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 13:43:26 +0800
Subject: blktrace: fix pdu_len when tracing packet command requests

Impact: output all of packet commands - not just the first 4 / 8 bytes

Since commit d7e3c3249ef23b4617393c69fe464765b4ff1645 ("block: add
large command support"), struct request->cmd has been changed from
unsinged char cmd[BLK_MAX_CDB] to unsigned char *cmd.

v1 -> v2: by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>

- make sure rq->cmd_len is always intialized, and then we can use
  rq->cmd_len instead of BLK_MAX_CDB.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49D4507E.2060602@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 73d7860..b32ff44 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -643,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (blk_pc_request(rq)) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
-				sizeof(rq->cmd), rq->cmd);
+				rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
 		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-- 
cgit v1.1


From 07fe7cb7c7c179f473fd9c823348fd3eb5dad369 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Create a dynamically sized pool of threads for doing very slow work
 items

Create a dynamically sized pool of threads for doing very slow work items, such
as invoking mkdir() or rmdir() - things that may take a long time and may
sleep, holding mutexes/semaphores and hogging a thread, and are thus unsuitable
for workqueues.

The number of threads is always at least a settable minimum, but more are
started when there's more work to do, up to a limit.  Because of the nature of
the load, it's not suitable for a 1-thread-per-CPU type pool.  A system with
one CPU may well want several threads.

This is used by FS-Cache to do slow caching operations in the background, such
as looking up, creating or deleting cache objects.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 kernel/Makefile    |   1 +
 kernel/slow-work.c | 388 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 389 insertions(+)
 create mode 100644 kernel/slow-work.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3..bab1dff 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 0000000..5a73927
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,388 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <asm/system.h>
+
+/*
+ * The pool of threads has at least min threads in it as long as someone is
+ * using the facility, and may have as many as max.
+ *
+ * A portion of the pool may be processing very slow operations.
+ */
+static unsigned slow_work_min_threads = 2;
+static unsigned slow_work_max_threads = 4;
+static unsigned vslow_work_proportion = 50; /* % of threads that may process
+					     * very slow work */
+static atomic_t slow_work_thread_count;
+static atomic_t vslow_work_executing_count;
+
+/*
+ * The queues of work items and the lock governing access to them.  These are
+ * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
+ * as the number of threads bears no relation to the number of CPUs.
+ *
+ * There are two queues of work items: one for slow work items, and one for
+ * very slow work items.
+ */
+static LIST_HEAD(slow_work_queue);
+static LIST_HEAD(vslow_work_queue);
+static DEFINE_SPINLOCK(slow_work_queue_lock);
+
+/*
+ * The thread controls.  A variable used to signal to the threads that they
+ * should exit when the queue is empty, a waitqueue used by the threads to wait
+ * for signals, and a completion set by the last thread to exit.
+ */
+static bool slow_work_threads_should_exit;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
+static DECLARE_COMPLETION(slow_work_last_thread_exited);
+
+/*
+ * The number of users of the thread pool and its lock.  Whilst this is zero we
+ * have no threads hanging around, and when this reaches zero, we wait for all
+ * active or queued work items to complete and kill all the threads we do have.
+ */
+static int slow_work_user_count;
+static DEFINE_MUTEX(slow_work_user_lock);
+
+/*
+ * Calculate the maximum number of active threads in the pool that are
+ * permitted to process very slow work items.
+ *
+ * The answer is rounded up to at least 1, but may not equal or exceed the
+ * maximum number of the threads in the pool.  This means we always have at
+ * least one thread that can process slow work items, and we always have at
+ * least one thread that won't get tied up doing so.
+ */
+static unsigned slow_work_calc_vsmax(void)
+{
+	unsigned vsmax;
+
+	vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
+	vsmax /= 100;
+	vsmax = max(vsmax, 1U);
+	return min(vsmax, slow_work_max_threads - 1);
+}
+
+/*
+ * Attempt to execute stuff queued on a slow thread.  Return true if we managed
+ * it, false if there was nothing to do.
+ */
+static bool slow_work_execute(void)
+{
+	struct slow_work *work = NULL;
+	unsigned vsmax;
+	bool very_slow;
+
+	vsmax = slow_work_calc_vsmax();
+
+	/* find something to execute */
+	spin_lock_irq(&slow_work_queue_lock);
+	if (!list_empty(&vslow_work_queue) &&
+	    atomic_read(&vslow_work_executing_count) < vsmax) {
+		work = list_entry(vslow_work_queue.next,
+				  struct slow_work, link);
+		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+			BUG();
+		list_del_init(&work->link);
+		atomic_inc(&vslow_work_executing_count);
+		very_slow = true;
+	} else if (!list_empty(&slow_work_queue)) {
+		work = list_entry(slow_work_queue.next,
+				  struct slow_work, link);
+		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+			BUG();
+		list_del_init(&work->link);
+		very_slow = false;
+	} else {
+		very_slow = false; /* avoid the compiler warning */
+	}
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	if (!work)
+		return false;
+
+	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
+		BUG();
+
+	work->ops->execute(work);
+
+	if (very_slow)
+		atomic_dec(&vslow_work_executing_count);
+	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
+
+	/* if someone tried to enqueue the item whilst we were executing it,
+	 * then it'll be left unenqueued to avoid multiple threads trying to
+	 * execute it simultaneously
+	 *
+	 * there is, however, a race between us testing the pending flag and
+	 * getting the spinlock, and between the enqueuer setting the pending
+	 * flag and getting the spinlock, so we use a deferral bit to tell us
+	 * if the enqueuer got there first
+	 */
+	if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irq(&slow_work_queue_lock);
+
+		if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
+		    test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
+			goto auto_requeue;
+
+		spin_unlock_irq(&slow_work_queue_lock);
+	}
+
+	work->ops->put_ref(work);
+	return true;
+
+auto_requeue:
+	/* we must complete the enqueue operation
+	 * - we transfer our ref on the item back to the appropriate queue
+	 * - don't wake another thread up as we're awake already
+	 */
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+		list_add_tail(&work->link, &vslow_work_queue);
+	else
+		list_add_tail(&work->link, &slow_work_queue);
+	spin_unlock_irq(&slow_work_queue_lock);
+	return true;
+}
+
+/**
+ * slow_work_enqueue - Schedule a slow work item for processing
+ * @work: The work item to queue
+ *
+ * Schedule a slow work item for processing.  If the item is already undergoing
+ * execution, this guarantees not to re-enter the execution routine until the
+ * first execution finishes.
+ *
+ * The item is pinned by this function as it retains a reference to it, managed
+ * through the item operations.  The item is unpinned once it has been
+ * executed.
+ *
+ * An item may hog the thread that is running it for a relatively large amount
+ * of time, sufficient, for example, to perform several lookup, mkdir, create
+ * and setxattr operations.  It may sleep on I/O and may sleep to obtain locks.
+ *
+ * Conversely, if a number of items are awaiting processing, it may take some
+ * time before any given item is given attention.  The number of threads in the
+ * pool may be increased to deal with demand, but only up to a limit.
+ *
+ * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
+ * the very slow queue, from which only a portion of the threads will be
+ * allowed to pick items to execute.  This ensures that very slow items won't
+ * overly block ones that are just ordinarily slow.
+ *
+ * Returns 0 if successful, -EAGAIN if not.
+ */
+int slow_work_enqueue(struct slow_work *work)
+{
+	unsigned long flags;
+
+	BUG_ON(slow_work_user_count <= 0);
+	BUG_ON(!work);
+	BUG_ON(!work->ops);
+	BUG_ON(!work->ops->get_ref);
+
+	/* when honouring an enqueue request, we only promise that we will run
+	 * the work function in the future; we do not promise to run it once
+	 * per enqueue request
+	 *
+	 * we use the PENDING bit to merge together repeat requests without
+	 * having to disable IRQs and take the spinlock, whilst still
+	 * maintaining our promise
+	 */
+	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+		/* we promise that we will not attempt to execute the work
+		 * function in more than one thread simultaneously
+		 *
+		 * this, however, leaves us with a problem if we're asked to
+		 * enqueue the work whilst someone is executing the work
+		 * function as simply queueing the work immediately means that
+		 * another thread may try executing it whilst it is already
+		 * under execution
+		 *
+		 * to deal with this, we set the ENQ_DEFERRED bit instead of
+		 * enqueueing, and the thread currently executing the work
+		 * function will enqueue the work item when the work function
+		 * returns and it has cleared the EXECUTING bit
+		 */
+		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+		} else {
+			if (work->ops->get_ref(work) < 0)
+				goto cant_get_ref;
+			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+				list_add_tail(&work->link, &vslow_work_queue);
+			else
+				list_add_tail(&work->link, &slow_work_queue);
+			wake_up(&slow_work_thread_wq);
+		}
+
+		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	}
+	return 0;
+
+cant_get_ref:
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(slow_work_enqueue);
+
+/*
+ * Determine if there is slow work available for dispatch
+ */
+static inline bool slow_work_available(int vsmax)
+{
+	return !list_empty(&slow_work_queue) ||
+		(!list_empty(&vslow_work_queue) &&
+		 atomic_read(&vslow_work_executing_count) < vsmax);
+}
+
+/*
+ * Worker thread dispatcher
+ */
+static int slow_work_thread(void *_data)
+{
+	int vsmax;
+
+	DEFINE_WAIT(wait);
+
+	set_freezable();
+	set_user_nice(current, -5);
+
+	for (;;) {
+		vsmax = vslow_work_proportion;
+		vsmax *= atomic_read(&slow_work_thread_count);
+		vsmax /= 100;
+
+		prepare_to_wait(&slow_work_thread_wq, &wait,
+				TASK_INTERRUPTIBLE);
+		if (!freezing(current) &&
+		    !slow_work_threads_should_exit &&
+		    !slow_work_available(vsmax))
+			schedule();
+		finish_wait(&slow_work_thread_wq, &wait);
+
+		try_to_freeze();
+
+		vsmax = vslow_work_proportion;
+		vsmax *= atomic_read(&slow_work_thread_count);
+		vsmax /= 100;
+
+		if (slow_work_available(vsmax) && slow_work_execute()) {
+			cond_resched();
+			continue;
+		}
+
+		if (slow_work_threads_should_exit)
+			break;
+	}
+
+	if (atomic_dec_and_test(&slow_work_thread_count))
+		complete_and_exit(&slow_work_last_thread_exited, 0);
+	return 0;
+}
+
+/**
+ * slow_work_register_user - Register a user of the facility
+ *
+ * Register a user of the facility, starting up the initial threads if there
+ * aren't any other users at this point.  This will return 0 if successful, or
+ * an error if not.
+ */
+int slow_work_register_user(void)
+{
+	struct task_struct *p;
+	int loop;
+
+	mutex_lock(&slow_work_user_lock);
+
+	if (slow_work_user_count == 0) {
+		printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
+		init_completion(&slow_work_last_thread_exited);
+
+		slow_work_threads_should_exit = false;
+
+		/* start the minimum number of threads */
+		for (loop = 0; loop < slow_work_min_threads; loop++) {
+			atomic_inc(&slow_work_thread_count);
+			p = kthread_run(slow_work_thread, NULL, "kslowd");
+			if (IS_ERR(p))
+				goto error;
+		}
+		printk(KERN_NOTICE "Slow work thread pool: Ready\n");
+	}
+
+	slow_work_user_count++;
+	mutex_unlock(&slow_work_user_lock);
+	return 0;
+
+error:
+	if (atomic_dec_and_test(&slow_work_thread_count))
+		complete(&slow_work_last_thread_exited);
+	if (loop > 0) {
+		printk(KERN_ERR "Slow work thread pool:"
+		       " Aborting startup on ENOMEM\n");
+		slow_work_threads_should_exit = true;
+		wake_up_all(&slow_work_thread_wq);
+		wait_for_completion(&slow_work_last_thread_exited);
+		printk(KERN_ERR "Slow work thread pool: Aborted\n");
+	}
+	mutex_unlock(&slow_work_user_lock);
+	return PTR_ERR(p);
+}
+EXPORT_SYMBOL(slow_work_register_user);
+
+/**
+ * slow_work_unregister_user - Unregister a user of the facility
+ *
+ * Unregister a user of the facility, killing all the threads if this was the
+ * last one.
+ */
+void slow_work_unregister_user(void)
+{
+	mutex_lock(&slow_work_user_lock);
+
+	BUG_ON(slow_work_user_count <= 0);
+
+	slow_work_user_count--;
+	if (slow_work_user_count == 0) {
+		printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
+		slow_work_threads_should_exit = true;
+		wake_up_all(&slow_work_thread_wq);
+		wait_for_completion(&slow_work_last_thread_exited);
+		printk(KERN_NOTICE "Slow work thread pool:"
+		       " Shut down complete\n");
+	}
+
+	mutex_unlock(&slow_work_user_lock);
+}
+EXPORT_SYMBOL(slow_work_unregister_user);
+
+/*
+ * Initialise the slow work facility
+ */
+static int __init init_slow_work(void)
+{
+	unsigned nr_cpus = num_possible_cpus();
+
+	if (nr_cpus > slow_work_max_threads)
+		slow_work_max_threads = nr_cpus;
+	return 0;
+}
+
+subsys_initcall(init_slow_work);
-- 
cgit v1.1


From 109d9272c423f46604d45fedfe87e21ee0b25180 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Make slow-work thread pool actually dynamic

Make the slow-work thread pool actually dynamic in the number of threads it
contains.  With this patch, it will both create additional threads when it has
extra work to do, and cull excess threads that aren't doing anything.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 kernel/slow-work.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 5a73927..454abb2 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,6 +16,14 @@
 #include <linux/wait.h>
 #include <asm/system.h>
 
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
+					 * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
+					 * OOM */
+
+static void slow_work_cull_timeout(unsigned long);
+static void slow_work_oom_timeout(unsigned long);
+
 /*
  * The pool of threads has at least min threads in it as long as someone is
  * using the facility, and may have as many as max.
@@ -29,6 +37,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
 static atomic_t slow_work_thread_count;
 static atomic_t vslow_work_executing_count;
 
+static bool slow_work_may_not_start_new_thread;
+static bool slow_work_cull; /* cull a thread due to lack of activity */
+static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
+static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
+static struct slow_work slow_work_new_thread; /* new thread starter */
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -89,6 +103,14 @@ static bool slow_work_execute(void)
 
 	vsmax = slow_work_calc_vsmax();
 
+	/* see if we can schedule a new thread to be started if we're not
+	 * keeping up with the work */
+	if (!waitqueue_active(&slow_work_thread_wq) &&
+	    (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
+	    atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
+	    !slow_work_may_not_start_new_thread)
+		slow_work_enqueue(&slow_work_new_thread);
+
 	/* find something to execute */
 	spin_lock_irq(&slow_work_queue_lock);
 	if (!list_empty(&vslow_work_queue) &&
@@ -243,6 +265,33 @@ cant_get_ref:
 EXPORT_SYMBOL(slow_work_enqueue);
 
 /*
+ * Worker thread culling algorithm
+ */
+static bool slow_work_cull_thread(void)
+{
+	unsigned long flags;
+	bool do_cull = false;
+
+	spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+	if (slow_work_cull) {
+		slow_work_cull = false;
+
+		if (list_empty(&slow_work_queue) &&
+		    list_empty(&vslow_work_queue) &&
+		    atomic_read(&slow_work_thread_count) >
+		    slow_work_min_threads) {
+			mod_timer(&slow_work_cull_timer,
+				  jiffies + SLOW_WORK_CULL_TIMEOUT);
+			do_cull = true;
+		}
+	}
+
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return do_cull;
+}
+
+/*
  * Determine if there is slow work available for dispatch
  */
 static inline bool slow_work_available(int vsmax)
@@ -273,7 +322,8 @@ static int slow_work_thread(void *_data)
 				TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
 		    !slow_work_threads_should_exit &&
-		    !slow_work_available(vsmax))
+		    !slow_work_available(vsmax) &&
+		    !slow_work_cull)
 			schedule();
 		finish_wait(&slow_work_thread_wq, &wait);
 
@@ -285,11 +335,20 @@ static int slow_work_thread(void *_data)
 
 		if (slow_work_available(vsmax) && slow_work_execute()) {
 			cond_resched();
+			if (list_empty(&slow_work_queue) &&
+			    list_empty(&vslow_work_queue) &&
+			    atomic_read(&slow_work_thread_count) >
+			    slow_work_min_threads)
+				mod_timer(&slow_work_cull_timer,
+					  jiffies + SLOW_WORK_CULL_TIMEOUT);
 			continue;
 		}
 
 		if (slow_work_threads_should_exit)
 			break;
+
+		if (slow_work_cull && slow_work_cull_thread())
+			break;
 	}
 
 	if (atomic_dec_and_test(&slow_work_thread_count))
@@ -297,6 +356,77 @@ static int slow_work_thread(void *_data)
 	return 0;
 }
 
+/*
+ * Handle thread cull timer expiration
+ */
+static void slow_work_cull_timeout(unsigned long data)
+{
+	slow_work_cull = true;
+	wake_up(&slow_work_thread_wq);
+}
+
+/*
+ * Get a reference on slow work thread starter
+ */
+static int slow_work_new_thread_get_ref(struct slow_work *work)
+{
+	return 0;
+}
+
+/*
+ * Drop a reference on slow work thread starter
+ */
+static void slow_work_new_thread_put_ref(struct slow_work *work)
+{
+}
+
+/*
+ * Start a new slow work thread
+ */
+static void slow_work_new_thread_execute(struct slow_work *work)
+{
+	struct task_struct *p;
+
+	if (slow_work_threads_should_exit)
+		return;
+
+	if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
+		return;
+
+	if (!mutex_trylock(&slow_work_user_lock))
+		return;
+
+	slow_work_may_not_start_new_thread = true;
+	atomic_inc(&slow_work_thread_count);
+	p = kthread_run(slow_work_thread, NULL, "kslowd");
+	if (IS_ERR(p)) {
+		printk(KERN_DEBUG "Slow work thread pool: OOM\n");
+		if (atomic_dec_and_test(&slow_work_thread_count))
+			BUG(); /* we're running on a slow work thread... */
+		mod_timer(&slow_work_oom_timer,
+			  jiffies + SLOW_WORK_OOM_TIMEOUT);
+	} else {
+		/* ratelimit the starting of new threads */
+		mod_timer(&slow_work_oom_timer, jiffies + 1);
+	}
+
+	mutex_unlock(&slow_work_user_lock);
+}
+
+static const struct slow_work_ops slow_work_new_thread_ops = {
+	.get_ref	= slow_work_new_thread_get_ref,
+	.put_ref	= slow_work_new_thread_put_ref,
+	.execute	= slow_work_new_thread_execute,
+};
+
+/*
+ * post-OOM new thread start suppression expiration
+ */
+static void slow_work_oom_timeout(unsigned long data)
+{
+	slow_work_may_not_start_new_thread = false;
+}
+
 /**
  * slow_work_register_user - Register a user of the facility
  *
@@ -316,6 +446,10 @@ int slow_work_register_user(void)
 		init_completion(&slow_work_last_thread_exited);
 
 		slow_work_threads_should_exit = false;
+		slow_work_init(&slow_work_new_thread,
+			       &slow_work_new_thread_ops);
+		slow_work_may_not_start_new_thread = false;
+		slow_work_cull = false;
 
 		/* start the minimum number of threads */
 		for (loop = 0; loop < slow_work_min_threads; loop++) {
@@ -369,6 +503,8 @@ void slow_work_unregister_user(void)
 		       " Shut down complete\n");
 	}
 
+	del_timer_sync(&slow_work_cull_timer);
+
 	mutex_unlock(&slow_work_user_lock);
 }
 EXPORT_SYMBOL(slow_work_unregister_user);
-- 
cgit v1.1


From 12e22c5e4bc08ab4b05ac079fe40d9891c5e81a0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Make the slow work pool configurable

Make the slow work pool configurable through /proc/sys/kernel/slow-work.

 (*) /proc/sys/kernel/slow-work/min-threads

     The minimum number of threads that should be in the pool as long as it is
     in use.  This may be anywhere between 2 and max-threads.

 (*) /proc/sys/kernel/slow-work/max-threads

     The maximum number of threads that should in the pool.  This may be
     anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.

 (*) /proc/sys/kernel/slow-work/vslow-percentage

     The percentage of active threads in the pool that may be used to execute
     very slow work items.  This may be between 1 and 99.  The resultant number
     is bounded to between 1 and one fewer than the number of active threads.
     This ensures there is always at least one thread that can process very
     slow work items, and always at least one thread that won't.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 kernel/slow-work.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c    |   9 ++++
 2 files changed, 125 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 454abb2..3f65900 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -14,7 +14,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-#include <asm/system.h>
 
 #define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
 					 * things to do */
@@ -24,6 +23,14 @@
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
+#ifdef CONFIG_SYSCTL
+static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+
+static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+					void __user *, size_t *, loff_t *);
+#endif
+
 /*
  * The pool of threads has at least min threads in it as long as someone is
  * using the facility, and may have as many as max.
@@ -34,6 +41,51 @@ static unsigned slow_work_min_threads = 2;
 static unsigned slow_work_max_threads = 4;
 static unsigned vslow_work_proportion = 50; /* % of threads that may process
 					     * very slow work */
+
+#ifdef CONFIG_SYSCTL
+static const int slow_work_min_min_threads = 2;
+static int slow_work_max_max_threads = 255;
+static const int slow_work_min_vslow = 1;
+static const int slow_work_max_vslow = 99;
+
+ctl_table slow_work_sysctls[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "min-threads",
+		.data		= &slow_work_min_threads,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= slow_work_min_threads_sysctl,
+		.extra1		= (void *) &slow_work_min_min_threads,
+		.extra2		= &slow_work_max_threads,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "max-threads",
+		.data		= &slow_work_max_threads,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= slow_work_max_threads_sysctl,
+		.extra1		= &slow_work_min_threads,
+		.extra2		= (void *) &slow_work_max_max_threads,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "vslow-percentage",
+		.data		= &vslow_work_proportion,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= (void *) &slow_work_min_vslow,
+		.extra2		= (void *) &slow_work_max_vslow,
+	},
+	{ .ctl_name = 0 }
+};
+#endif
+
+/*
+ * The active state of the thread pool
+ */
 static atomic_t slow_work_thread_count;
 static atomic_t vslow_work_executing_count;
 
@@ -427,6 +479,64 @@ static void slow_work_oom_timeout(unsigned long data)
 	slow_work_may_not_start_new_thread = false;
 }
 
+#ifdef CONFIG_SYSCTL
+/*
+ * Handle adjustment of the minimum number of threads
+ */
+static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
+					struct file *filp, void __user *buffer,
+					size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int n;
+
+	if (ret == 0) {
+		mutex_lock(&slow_work_user_lock);
+		if (slow_work_user_count > 0) {
+			/* see if we need to start or stop threads */
+			n = atomic_read(&slow_work_thread_count) -
+				slow_work_min_threads;
+
+			if (n < 0 && !slow_work_may_not_start_new_thread)
+				slow_work_enqueue(&slow_work_new_thread);
+			else if (n > 0)
+				mod_timer(&slow_work_cull_timer,
+					  jiffies + SLOW_WORK_CULL_TIMEOUT);
+		}
+		mutex_unlock(&slow_work_user_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Handle adjustment of the maximum number of threads
+ */
+static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
+					struct file *filp, void __user *buffer,
+					size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int n;
+
+	if (ret == 0) {
+		mutex_lock(&slow_work_user_lock);
+		if (slow_work_user_count > 0) {
+			/* see if we need to stop threads */
+			n = slow_work_max_threads -
+				atomic_read(&slow_work_thread_count);
+
+			if (n < 0)
+				mod_timer(&slow_work_cull_timer,
+					  jiffies + SLOW_WORK_CULL_TIMEOUT);
+		}
+		mutex_unlock(&slow_work_user_lock);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
 /**
  * slow_work_register_user - Register a user of the facility
  *
@@ -516,8 +626,12 @@ static int __init init_slow_work(void)
 {
 	unsigned nr_cpus = num_possible_cpus();
 
-	if (nr_cpus > slow_work_max_threads)
+	if (slow_work_max_threads < nr_cpus)
 		slow_work_max_threads = nr_cpus;
+#ifdef CONFIG_SYSCTL
+	if (slow_work_max_max_threads < nr_cpus * 2)
+		slow_work_max_max_threads = nr_cpus * 2;
+#endif
 	return 0;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ec4543..82350f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/slow-work.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &scan_unevictable_handler,
 	},
 #endif
+#ifdef CONFIG_SLOW_WORK
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slow-work",
+		.mode		= 0555,
+		.child		= slow_work_sysctls,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.1


From 8f0aa2f25b31ba27db84259141e52ee6ec0d2820 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Document the slow work thread pool

Document the slow work thread pool.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 kernel/slow-work.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 3f65900..cf2bc01 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -7,6 +7,8 @@
  * modify it under the terms of the GNU General Public Licence
  * as published by the Free Software Foundation; either version
  * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
  */
 
 #include <linux/module.h>
-- 
cgit v1.1


From 9756b15e1b58453a6fd54b85c1ad8515209e10bb Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 30 Mar 2009 20:37:20 -0700
Subject: irq: fix cpumask memory leak on offstack cpumask kernels

Need to free the old cpumask for affinity and pending_mask.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49D18FF0.50707@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/numa_migrate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 243d612..44bbdcb 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 {
 	free_kstat_irqs(old_desc, desc);
+	free_desc_masks(old_desc, desc);
 	arch_free_chip_data(old_desc, desc);
 }
 
-- 
cgit v1.1


From ca96a895a6bae7efe7b11a35d9f43e6228467562 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Fri, 9 Jan 2009 16:44:16 +0100
Subject: audit: EXECVE record - removed bogus newline

(updated)
Added hunk that changes the comment, the rest is the same.

EXECVE records contain a newline after every argument. auditd converts
"\n" to " " so you cannot see newlines even in raw logs, but they're
there nevertheless. If you're not using auditd, you need to work round
them. These '\n' chars are can be easily replaced by spaces when
creating record in kernel. Note there is no need for trailing '\n' in
an audit record.

record before this patch:
"type=EXECVE msg=audit(1231421801.566:31): argc=4 a0=\"./test\"\na1=\"a\"\na2=\"b\"\na3=\"c\"\n"

record after this patch:
"type=EXECVE msg=audit(1231421801.566:31): argc=4 a0=\"./test\" a1=\"a\" a2=\"b\" a3=\"c\""

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2bfc647..738c036 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1024,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 {
 	char arg_num_len_buf[12];
 	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 3 is the length of a=\n */
+	/* how many digits are in arg_num? 3 is the length of " a=" */
 	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
 	size_t len, len_left, to_send;
 	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1110,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		 * so we can be sure nothing was lost.
 		 */
 		if ((i == 0) && (too_long))
-			audit_log_format(*ab, "a%d_len=%zu ", arg_num,
+			audit_log_format(*ab, " a%d_len=%zu", arg_num,
 					 has_cntl ? 2*len : len);
 
 		/*
@@ -1130,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		buf[to_send] = '\0';
 
 		/* actually log it */
-		audit_log_format(*ab, "a%d", arg_num);
+		audit_log_format(*ab, " a%d", arg_num);
 		if (too_long)
 			audit_log_format(*ab, "[%d]", i);
 		audit_log_format(*ab, "=");
@@ -1138,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 			audit_log_n_hex(*ab, buf, to_send);
 		else
 			audit_log_format(*ab, "\"%s\"", buf);
-		audit_log_format(*ab, "\n");
 
 		p += to_send;
 		len_left -= to_send;
@@ -1166,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
 
 	p = (const char __user *)axi->mm->arg_start;
 
-	audit_log_format(*ab, "argc=%d ", axi->argc);
+	audit_log_format(*ab, "argc=%d", axi->argc);
 
 	/*
 	 * we need some kernel buffer to hold the userspace args.  Just
-- 
cgit v1.1


From 6b96255998053a89f45c0855de954b71f5c3887b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 5 Jan 2009 13:41:13 -0800
Subject: auditsc: fix kernel-doc notation

Fix auditsc kernel-doc notation:

Warning(linux-2.6.28-git7//kernel/auditsc.c:2156): No description found for parameter 'attr'
Warning(linux-2.6.28-git7//kernel/auditsc.c:2156): Excess function parameter 'u_attr' description in '__audit_mq_open'
Warning(linux-2.6.28-git7//kernel/auditsc.c:2204): No description found for parameter 'notification'
Warning(linux-2.6.28-git7//kernel/auditsc.c:2204): Excess function parameter 'u_notification' description in '__audit_mq_notify'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
cc:	Al Viro <viro@zeniv.linux.org.uk>
cc:	Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 738c036..b344b86 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
  * @mode: mode bits
- * @u_attr: queue attributes
+ * @attr: queue attributes
  *
  */
 void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
 /**
  * __audit_mq_notify - record audit data for a POSIX MQ notify
  * @mqdes: MQ descriptor
- * @u_notification: Notification event
+ * @notification: Notification event
  *
  */
 
-- 
cgit v1.1


From c28bb7da74ab74a2860d652493aaff7de104d79e Mon Sep 17 00:00:00 2001
From: Zhenwen Xu <helight.xu@gmail.com>
Date: Thu, 12 Mar 2009 22:16:12 +0800
Subject: make the e->rule.xxx shorter in kernel auditfilter.c

make the e->rule.xxx shorter in kernel/auditfilter.c
--
---------------------------------
Zhenwen Xu - Open and Free
Home Page:	http://zhwen.org
My Studio:	http://dim4.cn

>From 99692dc640b278f1cb1a15646ce42f22e89c0f77 Mon Sep 17 00:00:00 2001
From: Zhenwen Xu <Helight.Xu@gmail.com>
Date: Thu, 12 Mar 2009 22:04:59 +0800
Subject: [PATCH] make the e->rule.xxx shorter in kernel/auditfilter.c

Signed-off-by: Zhenwen Xu <Helight.Xu@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d1..a6fe71f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
-
+	struct audit_krule *erule = &e->rule;
 	/* some rules don't have associated watches */
-	if (e->rule.watch)
-		audit_put_watch(e->rule.watch);
-	if (e->rule.fields)
-		for (i = 0; i < e->rule.field_count; i++) {
-			struct audit_field *f = &e->rule.fields[i];
+	if (erule->watch)
+		audit_put_watch(erule->watch);
+	if (erule->fields)
+		for (i = 0; i < erule->field_count; i++) {
+			struct audit_field *f = &erule->fields[i];
 			kfree(f->lsm_str);
 			security_audit_rule_free(f->lsm_rule);
 		}
-	kfree(e->rule.fields);
-	kfree(e->rule.filterkey);
+	kfree(erule->fields);
+	kfree(erule->filterkey);
 	kfree(e);
 }
 
-- 
cgit v1.1


From b3897f567100d18e0597f638b911d23aa5e0dd23 Mon Sep 17 00:00:00 2001
From: Miloslav Trmac <mitr@redhat.com>
Date: Thu, 19 Mar 2009 09:48:27 -0400
Subject: Audit: fix handling of 'strings' with NULL characters

currently audit_log_n_untrustedstring() uses audit_string_contains_control()
to check if the 'string' has any control characters.  If the 'string' has an
embedded NULL audit_string_contains_control() will return that the data has
no control characters and will then pass the string to audit_log_n_string
with the total length, not the length up to the first NULL.
audit_log_n_string() does a memcpy of the entire length and so the actual
audit record emitted may then contain a NULL and then whatever random memory
is after the NULL.

Since we want to log the entire octet stream (if we can't trust the data
to be a string we can't trust that a NULL isn't actually a part of it)
we should just consider NULL as a control character.  If the caller is
certain they want to stop at the first NULL they should be using
audit_log_untrustedstring.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea..fa38055 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1382,7 +1382,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
 int audit_string_contains_control(const char *string, size_t len)
 {
 	const unsigned char *p;
-	for (p = string; p < (const unsigned char *)string + len && *p; p++) {
+	for (p = string; p < (const unsigned char *)string + len; p++) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7e)
 			return 1;
 	}
-- 
cgit v1.1


From 55ad2f8d340678397de5916b9cd960f17ebd7150 Mon Sep 17 00:00:00 2001
From: Miloslav Trmac <mitr@redhat.com>
Date: Thu, 19 Mar 2009 09:52:47 -0400
Subject: audit: ignore terminating NUL in AUDIT_USER_TTY messages

AUDIT_USER_TTY, like all other messages sent from user-space, is sent
NUL-terminated.  Unlike other user-space audit messages, which come only
from trusted sources, AUDIT_USER_TTY messages are processed using
audit_log_n_untrustedstring().

This patch modifies AUDIT_USER_TTY handling to ignore the trailing NUL
and use the "quoted_string" representation of the message if possible.

Signed-off-by: Miloslav Trmac <mitr@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index fa38055..5560390 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 				audit_log_format(ab, " msg=");
 				size = nlmsg_len(nlh);
+				if (size > 0 &&
+				    ((unsigned char *)data)[size - 1] == '\0')
+					size--;
 				audit_log_n_untrustedstring(ab, data, size);
 			}
 			audit_set_pid(ab, pid);
-- 
cgit v1.1


From 6d208da89aabee8502debe842832ca0ab298d16d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Wed, 1 Apr 2009 15:47:27 -0400
Subject: audit: Fix possible return value truncation in audit_get_context()

The audit subsystem treats syscall return codes as type long, unfortunately
the audit_get_context() function mistakenly converts the return code to an
int type in the parameters which could cause problems on systems where the
sizeof(int) != sizeof(long).

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b344b86..e821d62 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -752,7 +752,7 @@ static void audit_set_auditable(struct audit_context *ctx)
 
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
-						      int return_code)
+						      long return_code)
 {
 	struct audit_context *context = tsk->audit_context;
 
-- 
cgit v1.1


From 318b6d3d7ddbcad3d6867e630711b8a705d873d7 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 13 Jan 2009 17:32:40 -0500
Subject: audit: incorrect ref counting in audit tree tag_chunk

tag_chunk has bad exit paths in which the inotify ref counting is wrong.
At the top of the function we found &old_watch using  inotify_find_watch().
inotify_find_watch takes a reference to the watch.  This is never dropped
on an error path.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit_tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545..917ab95 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	mutex_lock(&inode->inotify_mutex);
 	if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
 		mutex_unlock(&inode->inotify_mutex);
+		put_inotify_watch(&old->watch);
 		free_chunk(chunk);
 		return -ENOSPC;
 	}
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		chunk->dead = 1;
 		inotify_evict_watch(&chunk->watch);
 		mutex_unlock(&inode->inotify_mutex);
+		put_inotify_watch(&old->watch);
 		put_inotify_watch(&chunk->watch);
 		return 0;
 	}
-- 
cgit v1.1


From 679173b724631f49e537a15fa48ea2000bdc1808 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 26 Jan 2009 18:09:45 -0500
Subject: audit: audit_set_auditable defined but not used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

after 0590b9335a1c72a3f0defcc6231287f7817e07c8 audit_set_auditable() is now only
used by the audit tree code.  If CONFIG_AUDIT_TREE is unset it will be defined
but unused.  This patch simply moves the function inside a CONFIG_AUDIT_TREE
block.

cc1: warnings being treated as errors
/home/acme_unencrypted/git/linux-2.6-tip/kernel/auditsc.c:745: error: ‘audit_set_auditable’ defined but not used
make[2]: *** [kernel/auditsc.o] Error 1
make[1]: *** [kernel] Error 2
make[1]: *** Waiting for unfinished jobs....

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e821d62..aa0428e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -329,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
  */
 
 #ifdef CONFIG_AUDIT_TREE
+static void audit_set_auditable(struct audit_context *ctx)
+{
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
+}
+
 static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
 {
 	struct audit_tree_refs *p = ctx->trees;
@@ -742,14 +750,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 	rcu_read_unlock();
 }
 
-static void audit_set_auditable(struct audit_context *ctx)
-{
-	if (!ctx->prio) {
-		ctx->prio = 1;
-		ctx->current_state = AUDIT_RECORD_CONTEXT;
-	}
-}
-
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
 						      long return_code)
-- 
cgit v1.1


From def57543418a5f47debae28a0a9dea2effc11692 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 10 Mar 2009 18:00:14 -0400
Subject: Audit: remove spaces from audit_log_d_path

audit_log_d_path had spaces in the strings which would be emitted on the
error paths.  This patch simply replaces those spaces with an _ or removes
the needless spaces entirely.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c   | 4 ++--
 kernel/auditsc.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 5560390..9442c35 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1440,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
 	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
 	if (!pathname) {
-		audit_log_format(ab, "<no memory>");
+		audit_log_string(ab, "<no_memory>");
 		return;
 	}
 	p = d_path(path, pathname, PATH_MAX+11);
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here? */
-		audit_log_format(ab, "<too long>");
+		audit_log_string(ab, "<too_long>");
 	} else
 		audit_log_untrustedstring(ab, p);
 	kfree(pathname);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aa0428e..7d6ac7c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			case 0:
 				/* name was specified as a relative path and the
 				 * directory component is the cwd */
-				audit_log_d_path(ab, " name=", &context->pwd);
+				audit_log_d_path(ab, "name=", &context->pwd);
 				break;
 			default:
 				/* log the name's directory component */
-- 
cgit v1.1


From cd5f9a4c3199b090e91ea0064cb110985ba54814 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 6 Apr 2009 13:38:46 -0700
Subject: kernel/sysctl.c: avoid annoying warnings

Some of the limit constants are used only depending on some complex
configuration dependencies, yet it's not worth making the simple
variables depend on those configuration details.  Just mark them as
perhaps not being unused, and avoid the warning.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 82350f8..b125e33 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,8 +97,8 @@ static int neg_one = -1;
 #endif
 
 static int zero;
-static int one = 1;
-static int two = 2;
+static int __maybe_unused one = 1;
+static int __maybe_unused two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
-- 
cgit v1.1


From 432870dab85a2f69dc417022646cb9a70acf7f94 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 6 Apr 2009 16:16:02 +0200
Subject: exit_notify: kill the wrong capable(CAP_KILL) check

The CAP_KILL check in exit_notify() looks just wrong, kill it.

Whatever logic we have to reset ->exit_signal, the malicious user
can bypass it if it execs the setuid application before exiting.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6686ed1..32cbf26 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 */
 	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
 	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-	     tsk->self_exec_id != tsk->parent_exec_id) &&
-	    !capable(CAP_KILL))
+	     tsk->self_exec_id != tsk->parent_exec_id))
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-- 
cgit v1.1


From 2e45e77787c9d0720b046eb69856edf43b17e33e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 7 Apr 2009 17:12:43 +0930
Subject: Revert "module: remove the SHF_ALLOC flag on the __versions section."

This reverts commit 9cb610d8e35fe3ec95a2fe2030b02f85aeea83c1.

This was an impressively stupid patch.  Firstly, we reset the SHF_ALLOC
flag lower down in the same function, so the patch was useless.  Even
better, find_sec() ignores sections with SHF_ALLOC not set, so
it breaks CONFIG_MODVERSIONS=y with CONFIG_MODULE_FORCE_LOAD=n, which
refuses to load the module since it can't find the __versions section.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index c268a77..05f014e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1952,9 +1952,6 @@ static noinline struct module *load_module(void __user *umod,
 		if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
 			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
-		/* Don't keep __versions around; it's just for loading. */
-		if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
-			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	}
 
 	modindex = find_sec(hdr, sechdrs, secstrings,
-- 
cgit v1.1


From 301fd748e2c81e78e74edbc694a64caa7b95dda2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 3 Apr 2009 11:12:23 -0400
Subject: tracing: remove CALLER_ADDR2 from wakeup tracer

Maneesh Soni was getting a crash when running the wakeup tracer.
We debugged it down to the recording of the function with the
CALLER_ADDR2 macro.  This is used to get the location of the caller
to schedule.

But the problem comes when schedule is called by assmebly. In the case
that Maneesh had, retint_careful would call schedule. But retint_careful
does not set up a proper frame pointer. CALLER_ADDR2 is defined as
__builtin_return_address(2). This produces the following assembly in
the wakeup tracer code.

   mov    0x0(%rbp),%rcx  <--- get the frame pointer of the caller
   mov    %r14d,%r8d
   mov    0xf2de8e(%rip),%rdi

   mov    0x8(%rcx),%rsi  <-- this is __builtin_return_address(1)
   mov    0x28(%rdi,%rax,8),%rbx

   mov    (%rcx),%rax  <-- get the frame pointer of the caller's caller
   mov    %r12,%rcx
   mov    0x8(%rax),%rdx <-- this is __builtin_return_address(2)

At the reading of 0x8(%rax) Maneesh's machine would take a fault.
The reason is that retint_careful did not set up the return address
and the content of %rax here was zero.

To verify this, I sent Maneesh a patch to create a frame pointer
in retint_careful. He ran the test again but this time he would take
the same type of fault from sysret_careful. The retint_careful was no
longer an issue, but there are other callers that still have issues.

Instead of adding frame pointers for all callers to schedule (in possibly
all archs), it is much safer to simply not use CALLER_ADDR2. This
loses out on knowing what called schedule, but the function tracer
will help there if needed.

Reported-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c5ad6b..5bc00e8f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
-	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	/*
@@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	data = wakeup_trace->data[wakeup_cpu];
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+
+	/*
+	 * We must be careful in using CALLER_ADDR2. But since wake_up
+	 * is not called by an assembly function  (where as schedule is)
+	 * it should be safe to use it here.
+	 */
 	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
-- 
cgit v1.1


From cf8e3474654f20433aab9aa35826d43b5f245008 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 30 Mar 2009 13:48:00 +0800
Subject: tracing: fix incorrect return type of ns2usecs()

Impact: fix time output bug in 32bits system

ns2usecs() returns 'long', it's incorrect.

(In i386)
...
          <idle>-0     [000]   521.442100: _spin_lock <-tick_do_update_jiffies64
          <idle>-0     [000]   521.442101: do_timer <-tick_do_update_jiffies64
          <idle>-0     [000]   521.442102: update_wall_time <-do_timer
          <idle>-0     [000]   521.442102: update_xtime_cache <-update_wall_time
....
(It always print the time less than 2200 seconds besides ...)
Because 'long' is 32bits in i386. ( (1<<31) useconds is about 2200 seconds)

...
          <idle>-0     [001] 4154502640.134759: rcu_bh_qsctr_inc <-__do_softirq
          <idle>-0     [001] 4154502640.134760: _local_bh_enable <-__do_softirq
          <idle>-0     [001] 4154502640.134761: idle_cpu <-irq_exit
...
(very large value)
Because 'long' is a signed type and it is 32bits in i386.

Changes in v2:
return 'unsigned long long' instead of 'cycle_t'

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <49D05D10.4030009@cn.fujitsu.com>
Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 3 +--
 kernel/trace/trace.h        | 2 +-
 kernel/trace/trace_output.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0174a4..457dd8c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -147,8 +147,7 @@ static int __init set_ftrace_dump_on_oops(char *str)
 }
 __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
-long
-ns2usecs(cycle_t nsec)
+unsigned long long ns2usecs(cycle_t nsec)
 {
 	nsec += 500;
 	do_div(nsec, 1000);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb0ce3f..0d81a4a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -596,7 +596,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
-extern long ns2usecs(cycle_t nsec);
+extern unsigned long long ns2usecs(cycle_t nsec);
 extern int
 trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
 extern int
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d72b9a6..64b54a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 
 		trace_find_cmdline(entry->pid, comm);
 
-		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
 				       entry->pid, iter->cpu, entry->flags,
 				       entry->preempt_count, iter->idx,
-- 
cgit v1.1


From 5f0c6c03c5fee91c02c696bc9bf4c0d41392abe7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Mar 2009 14:22:10 +0100
Subject: tracing/ftrace: fix missing include string.h

Building a kernel with tracing can raise the following warning on
tip/master:

kernel/trace/trace.c:1249: error: implicit declaration of function 'vbin_printf'

We are missing an include to string.h

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238160130-7437-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 457dd8c..2230b46 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
 #include <linux/percpu.h>
 #include <linux/splice.h>
 #include <linux/kdebug.h>
+#include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
-- 
cgit v1.1


From 8bcae09b93e7f96f700b6bb372c2b3f2b36636dc Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:24:51 +0800
Subject: ftrace: Add check of sched_stopped for probe_sched_wakeup

The wakeup tracing in sched_switch does not stop when a user
disables tracing. This is because the probe_sched_wakeup() is missing
the check to prevent the wakeup from being traced.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D1C543.3010307@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_switch.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index de35f20..9117cea 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	pc = preempt_count();
 	tracing_record_cmdline(current);
 
+	if (sched_stopped)
+		return;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-- 
cgit v1.1


From b0dfa978c7a1699fb3506fbfcba0b6a5c4bd17ae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 1 Apr 2009 22:53:08 +0200
Subject: tracing/ftrace: alloc the started cpumask for the trace file

Impact: fix a crash while cat trace file

Currently we are using a cpumask to remind each cpu where a
trace occured. It lets us notice the user that a cpu just had
its first trace.

But on latest -tip we have the following crash once we cat the trace
file:

IP: [<c0270c4a>] print_trace_fmt+0x45/0xe7
*pde = 00000000
Oops: 0000 [#1] PREEMPT SMP
last sysfs file: /sys/class/net/eth0/carrier
Pid: 3897, comm: cat Not tainted (2.6.29-tip-02825-g0f22972-dirty #81)
EIP: 0060:[<c0270c4a>] EFLAGS: 00010297 CPU: 0
EIP is at print_trace_fmt+0x45/0xe7
EAX: 00000000 EBX: 00000000 ECX: c12d9e98 EDX: ccdb7010
ESI: d31f4000 EDI: 00322401 EBP: d31f3f10 ESP: d31f3efc
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
Process cat (pid: 3897, ti=d31f2000 task=d3b3cf20 task.ti=d31f2000)
Stack:
d31f4080 ccdb7010 d31f4000 d691fe70 ccdb7010 d31f3f24 c0270e5c d31f4000
d691fe70 d31f4000 d31f3f34 c02718e8 c12d9e98 d691fe70 d31f3f70 c02bfc33
00001000 09130000 d3b46e00 d691fe98 00000000 00000079 00000001 00000000
Call Trace:
[<c0270e5c>] ? print_trace_line+0x170/0x17c
[<c02718e8>] ? s_show+0xa7/0xbd
[<c02bfc33>] ? seq_read+0x24a/0x327
[<c02bf9e9>] ? seq_read+0x0/0x327
[<c02ab18b>] ? vfs_read+0x86/0xe1
[<c02ab289>] ? sys_read+0x40/0x65
[<c0202d8f>] ? sysenter_do_call+0x12/0x3c
Code: 00 00 00 89 45 ec f7 c7 00 20 00 00 89 55 f0 74 4e f6 86 98 10 00 00 02 74 45 8b 86 8c 10 00 00 8b 9e a8 10 00 00 e8 52 f3 ff ff <0f> a3 03 19 c0 85 c0 75 2b 8b 86 8c 10 00 00 8b 9e a8 10 00 00
EIP: [<c0270c4a>] print_trace_fmt+0x45/0xe7 SS:ESP 0068:d31f3efc
CR2: 0000000000000000
---[ end trace aa9cf38e5ebed9dd ]---

This is because we alloc the iter->started cpumask on tracing_pipe_open but
not on tracing_open.

It hadn't been noticed until now because we need to have ring buffer overruns
to activate the starting of cpu buffer detection.

Also, we need a check to not print the messagge for the first trace on the file.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238619188-6109-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2230b46..fc8c7d6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 		return;
 
 	cpumask_set_cpu(iter->cpu, iter->started);
-	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+
+	/* Don't print started cpu buffer for the first entry of the trace */
+	if (iter->idx > 1)
+		trace_seq_printf(s, "##### CPU %u buffer started ####\n",
+				iter->cpu);
 }
 
 static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file)
 	if (current_trace)
 		*iter->trace = *current_trace;
 
+	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
+		goto fail;
+
+	cpumask_clear(iter->started);
+
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
@@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file)
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
 	}
+	free_cpumask_var(iter->started);
  fail:
 	mutex_unlock(&trace_types_lock);
 	kfree(iter->trace);
@@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	seq_release(inode, file);
 	mutex_destroy(&iter->mutex);
+	free_cpumask_var(iter->started);
 	kfree(iter->trace);
 	kfree(iter);
 	return 0;
-- 
cgit v1.1


From bc2b6871c17b3aff79fb14e1a1c06c5f5a187f76 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Mon, 23 Mar 2009 11:58:31 +0530
Subject: Update /debug/tracing/README

Some of the tracers have been renamed, which was not updated in the in-kernel
run-time README file. Update it.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
LKML-Reference: <200903231158.32151.knikanth@suse.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fc8c7d6..9d28476 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2369,9 +2369,9 @@ static const char readme_msg[] =
 	"# mkdir /debug\n"
 	"# mount -t debugfs nodev /debug\n\n"
 	"# cat /debug/tracing/available_tracers\n"
-	"wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+	"wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
 	"# cat /debug/tracing/current_tracer\n"
-	"none\n"
+	"nop\n"
 	"# echo sched_switch > /debug/tracing/current_tracer\n"
 	"# cat /debug/tracing/current_tracer\n"
 	"sched_switch\n"
-- 
cgit v1.1


From 1bbe2a83ab68e5cf8c66c372c7cb3b51910c2cfe Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 3 Apr 2009 18:24:46 +0800
Subject: ftrace: Correct a text align for event format output

If we cat debugfs/tracing/events/ftrace/bprint/format, we'll see:
name: bprint
ID: 6
format:
	field:unsigned char common_type;	offset:0;	size:1;
	field:unsigned char common_flags;	offset:1;	size:1;
	field:unsigned char common_preempt_count;	offset:2;	size:1;
	field:int common_pid;	offset:4;	size:4;
	field:int common_tgid;	offset:8;	size:4;

	field:unsigned long ip;	offset:12;	size:4;
	field:char * fmt;	offset:16;	size:4;
	field: char buf;	offset:20;	size:0;

print fmt: "%08lx (%d) fmt:%p %s"

There is an inconsistent blank before char buf.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D5E3EE.70201@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_export.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4d9952d..07a22c3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -40,7 +40,7 @@
 
 #undef TRACE_FIELD_ZERO_CHAR
 #define TRACE_FIELD_ZERO_CHAR(item)					\
-	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"		\
+	ret = trace_seq_printf(s, "\tfield:char " #item ";\t"		\
 			       "offset:%u;\tsize:0;\n",			\
 			       (unsigned int)offsetof(typeof(field), item)); \
 	if (!ret)							\
-- 
cgit v1.1


From fafd688e4c0c34da0f3de909881117d374e4c7af Mon Sep 17 00:00:00 2001
From: Peter W Morreale <pmorreale@novell.com>
Date: Mon, 6 Apr 2009 19:00:29 -0700
Subject: mm: add /proc controls for pdflush threads

Add /proc entries to give the admin the ability to control the minimum and
maximum number of pdflush threads.  This allows finer control of pdflush
on both large and small machines.

The rationale is simply one size does not fit all.  Admins on large and/or
small systems may want to tune the min/max pdflush thread count to best
suit their needs.  Right now the min/max is hardcoded to 2/8.  While
probably a fair estimate for smaller machines, large machines with large
numbers of CPUs and large numbers of filesystems/block devices may benefit
from larger numbers of threads working on different block devices.

Even if the background flushing algorithm is radically changed, it is
still likely that multiple threads will be involved and admins would still
desire finer control on the min/max other than to have to recompile the
kernel.

The patch adds '/proc/sys/vm/nr_pdflush_threads_min' and
'/proc/sys/vm/nr_pdflush_threads_max' with r/w permissions.

The minimum value for nr_pdflush_threads_min is 1 and the maximum value is
the current value of nr_pdflush_threads_max.  This minimum is required
since additional thread creation is performed in a pdflush thread itself.

The minimum value for nr_pdflush_threads_max is the current value of
nr_pdflush_threads_min and the maximum value can be 1000.

Documentation/sysctl/vm.txt is also updated.

[akpm@linux-foundation.org: fix comment, fix whitespace, use __read_mostly]
Signed-off-by: Peter W Morreale <pmorreale@novell.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b125e33..72eb1a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -101,6 +101,7 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int one_thousand = 1000;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -1027,6 +1028,28 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_pdflush_threads_min",
+		.data		= &nr_pdflush_threads_min,
+		.maxlen		= sizeof nr_pdflush_threads_min,
+		.mode		= 0644 /* read-write */,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &nr_pdflush_threads_max,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_pdflush_threads_max",
+		.data		= &nr_pdflush_threads_max,
+		.maxlen		= sizeof nr_pdflush_threads_max,
+		.mode		= 0644 /* read-write */,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &nr_pdflush_threads_min,
+		.extra2		= &one_thousand,
+	},
+	{
 		.ctl_name	= VM_SWAPPINESS,
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
-- 
cgit v1.1


From b918e5e60d775549478e4268155142156a95aa17 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:00:58 -0700
Subject: kprobes: cleanup aggr_kprobe related code

Currently, kprobes can disable all probes at once, but can't disable it
individually (not unregister, just disable an kprobe, because
unregistering needs to wait for scheduler synchronization).  These patches
introduce APIs for on-the-fly per-probe disabling and re-enabling by
dis-arming/re-arming its breakpoint instruction.

This patch:

Change old_p to ap in add_new_kprobe() for readability, copy flags member
in add_aggr_kprobe(), and simplify the code flow of
register_aggr_kprobe().

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 60 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5016bfb..a55bfad 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -518,20 +518,20 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 }
 
 /*
-* Add the new probe to old_p->list. Fail if this is the
+* Add the new probe to ap->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
-static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
 	if (p->break_handler) {
-		if (old_p->break_handler)
+		if (ap->break_handler)
 			return -EEXIST;
-		list_add_tail_rcu(&p->list, &old_p->list);
-		old_p->break_handler = aggr_break_handler;
+		list_add_tail_rcu(&p->list, &ap->list);
+		ap->break_handler = aggr_break_handler;
 	} else
-		list_add_rcu(&p->list, &old_p->list);
-	if (p->post_handler && !old_p->post_handler)
-		old_p->post_handler = aggr_post_handler;
+		list_add_rcu(&p->list, &ap->list);
+	if (p->post_handler && !ap->post_handler)
+		ap->post_handler = aggr_post_handler;
 	return 0;
 }
 
@@ -544,6 +544,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 	copy_kprobe(p, ap);
 	flush_insn_slot(ap);
 	ap->addr = p->addr;
+	ap->flags = p->flags;
 	ap->pre_handler = aggr_pre_handler;
 	ap->fault_handler = aggr_fault_handler;
 	/* We don't care the kprobe which has gone. */
@@ -566,44 +567,43 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 					  struct kprobe *p)
 {
 	int ret = 0;
-	struct kprobe *ap;
+	struct kprobe *ap = old_p;
 
-	if (kprobe_gone(old_p)) {
+	if (old_p->pre_handler != aggr_pre_handler) {
+		/* If old_p is not an aggr_probe, create new aggr_kprobe. */
+		ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+		if (!ap)
+			return -ENOMEM;
+		add_aggr_kprobe(ap, old_p);
+	}
+
+	if (kprobe_gone(ap)) {
 		/*
 		 * Attempting to insert new probe at the same location that
 		 * had a probe in the module vaddr area which already
 		 * freed. So, the instruction slot has already been
 		 * released. We need a new slot for the new probe.
 		 */
-		ret = arch_prepare_kprobe(old_p);
+		ret = arch_prepare_kprobe(ap);
 		if (ret)
+			/*
+			 * Even if fail to allocate new slot, don't need to
+			 * free aggr_probe. It will be used next time, or
+			 * freed by unregister_kprobe.
+			 */
 			return ret;
-	}
-	if (old_p->pre_handler == aggr_pre_handler) {
-		copy_kprobe(old_p, p);
-		ret = add_new_kprobe(old_p, p);
-		ap = old_p;
-	} else {
-		ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
-		if (!ap) {
-			if (kprobe_gone(old_p))
-				arch_remove_kprobe(old_p);
-			return -ENOMEM;
-		}
-		add_aggr_kprobe(ap, old_p);
-		copy_kprobe(ap, p);
-		ret = add_new_kprobe(ap, p);
-	}
-	if (kprobe_gone(old_p)) {
+		/* Clear gone flag to prevent allocating new slot again. */
+		ap->flags &= ~KPROBE_FLAG_GONE;
 		/*
 		 * If the old_p has gone, its breakpoint has been disarmed.
 		 * We have to arm it again after preparing real kprobes.
 		 */
-		ap->flags &= ~KPROBE_FLAG_GONE;
 		if (kprobe_enabled)
 			arch_arm_kprobe(ap);
 	}
-	return ret;
+
+	copy_kprobe(ap, p);
+	return add_new_kprobe(ap, p);
 }
 
 static int __kprobes in_kprobes_functions(unsigned long addr)
-- 
cgit v1.1


From 99081ab553d6a88dd6b3774af5eef94dbb8faad7 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:00:59 -0700
Subject: kprobes: move EXPORT_SYMBOL_GPL just after function definitions

Clean up positions of EXPORT_SYMBOL_GPL in kernel/kprobes.c according to
checkpatch.pl.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a55bfad..ca4c22d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -722,6 +722,7 @@ out:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kprobe);
 
 /*
  * Unregister a kprobe without a scheduler synchronization.
@@ -803,11 +804,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kprobes);
 
 void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unregister_kprobes(&p, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_kprobe);
 
 void __kprobes unregister_kprobes(struct kprobe **kps, int num)
 {
@@ -826,6 +829,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
 		if (kps[i]->addr)
 			__unregister_kprobe_bottom(kps[i]);
 }
+EXPORT_SYMBOL_GPL(unregister_kprobes);
 
 static struct notifier_block kprobe_exceptions_nb = {
 	.notifier_call = kprobe_exceptions_notify,
@@ -865,16 +869,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_jprobes);
 
 int __kprobes register_jprobe(struct jprobe *jp)
 {
 	return register_jprobes(&jp, 1);
 }
+EXPORT_SYMBOL_GPL(register_jprobe);
 
 void __kprobes unregister_jprobe(struct jprobe *jp)
 {
 	unregister_jprobes(&jp, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_jprobe);
 
 void __kprobes unregister_jprobes(struct jprobe **jps, int num)
 {
@@ -894,6 +901,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
 			__unregister_kprobe_bottom(&jps[i]->kp);
 	}
 }
+EXPORT_SYMBOL_GPL(unregister_jprobes);
 
 #ifdef CONFIG_KRETPROBES
 /*
@@ -987,6 +995,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 		free_rp_inst(rp);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kretprobe);
 
 int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
@@ -1004,11 +1013,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kretprobes);
 
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 	unregister_kretprobes(&rp, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
 
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
@@ -1030,24 +1041,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
 
 #else /* CONFIG_KRETPROBES */
 int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(register_kretprobe);
 
 int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
 	return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
 
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
 
 static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 					   struct pt_regs *regs)
@@ -1418,16 +1435,5 @@ late_initcall(debugfs_kprobe_init);
 
 module_init(init_kprobes);
 
-EXPORT_SYMBOL_GPL(register_kprobe);
-EXPORT_SYMBOL_GPL(unregister_kprobe);
-EXPORT_SYMBOL_GPL(register_kprobes);
-EXPORT_SYMBOL_GPL(unregister_kprobes);
-EXPORT_SYMBOL_GPL(register_jprobe);
-EXPORT_SYMBOL_GPL(unregister_jprobe);
-EXPORT_SYMBOL_GPL(register_jprobes);
-EXPORT_SYMBOL_GPL(unregister_jprobes);
+/* defined in arch/.../kernel/kprobes.c */
 EXPORT_SYMBOL_GPL(jprobe_return);
-EXPORT_SYMBOL_GPL(register_kretprobe);
-EXPORT_SYMBOL_GPL(unregister_kretprobe);
-EXPORT_SYMBOL_GPL(register_kretprobes);
-EXPORT_SYMBOL_GPL(unregister_kretprobes);
-- 
cgit v1.1


From e579abeb58eb4b8d7321c6eb44dd9e2d0cbaebaa Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:01:01 -0700
Subject: kprobes: rename kprobe_enabled to kprobes_all_disarmed

Rename kprobe_enabled to kprobes_all_disarmed and invert logic due to
avoiding naming confusion from per-probe disabling.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ca4c22d..dae198b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -68,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 /* NOTE: change this value only with kprobe_mutex held */
-static bool kprobe_enabled;
+static bool kprobes_all_disarmed;
 
 static DEFINE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -598,7 +598,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 		 * If the old_p has gone, its breakpoint has been disarmed.
 		 * We have to arm it again after preparing real kprobes.
 		 */
-		if (kprobe_enabled)
+		if (!kprobes_all_disarmed)
 			arch_arm_kprobe(ap);
 	}
 
@@ -709,7 +709,7 @@ int __kprobes register_kprobe(struct kprobe *p)
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	if (kprobe_enabled)
+	if (!kprobes_all_disarmed)
 		arch_arm_kprobe(p);
 
 out_unlock_text:
@@ -751,7 +751,7 @@ valid_p:
 		 * enabled and not gone - otherwise, the breakpoint would
 		 * already have been removed. We save on flushing icache.
 		 */
-		if (kprobe_enabled && !kprobe_gone(old_p)) {
+		if (!kprobes_all_disarmed && !kprobe_gone(old_p)) {
 			mutex_lock(&text_mutex);
 			arch_disarm_kprobe(p);
 			mutex_unlock(&text_mutex);
@@ -1190,8 +1190,8 @@ static int __init init_kprobes(void)
 		}
 	}
 
-	/* By default, kprobes are enabled */
-	kprobe_enabled = true;
+	/* By default, kprobes are armed */
+	kprobes_all_disarmed = false;
 
 	err = arch_init_kprobes();
 	if (!err)
@@ -1289,7 +1289,7 @@ static struct file_operations debugfs_kprobes_operations = {
 	.release        = seq_release,
 };
 
-static void __kprobes enable_all_kprobes(void)
+static void __kprobes arm_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -1298,8 +1298,8 @@ static void __kprobes enable_all_kprobes(void)
 
 	mutex_lock(&kprobe_mutex);
 
-	/* If kprobes are already enabled, just return */
-	if (kprobe_enabled)
+	/* If kprobes are armed, just return */
+	if (!kprobes_all_disarmed)
 		goto already_enabled;
 
 	mutex_lock(&text_mutex);
@@ -1311,7 +1311,7 @@ static void __kprobes enable_all_kprobes(void)
 	}
 	mutex_unlock(&text_mutex);
 
-	kprobe_enabled = true;
+	kprobes_all_disarmed = false;
 	printk(KERN_INFO "Kprobes globally enabled\n");
 
 already_enabled:
@@ -1319,7 +1319,7 @@ already_enabled:
 	return;
 }
 
-static void __kprobes disable_all_kprobes(void)
+static void __kprobes disarm_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -1328,11 +1328,11 @@ static void __kprobes disable_all_kprobes(void)
 
 	mutex_lock(&kprobe_mutex);
 
-	/* If kprobes are already disabled, just return */
-	if (!kprobe_enabled)
+	/* If kprobes are already disarmed, just return */
+	if (kprobes_all_disarmed)
 		goto already_disabled;
 
-	kprobe_enabled = false;
+	kprobes_all_disarmed = true;
 	printk(KERN_INFO "Kprobes globally disabled\n");
 	mutex_lock(&text_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -1364,7 +1364,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
 {
 	char buf[3];
 
-	if (kprobe_enabled)
+	if (!kprobes_all_disarmed)
 		buf[0] = '1';
 	else
 		buf[0] = '0';
@@ -1387,12 +1387,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
 	case 'y':
 	case 'Y':
 	case '1':
-		enable_all_kprobes();
+		arm_all_kprobes();
 		break;
 	case 'n':
 	case 'N':
 	case '0':
-		disable_all_kprobes();
+		disarm_all_kprobes();
 		break;
 	}
 
-- 
cgit v1.1


From de5bd88d5a5cce3cacea904d3503e5ebdb3852a2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:01:02 -0700
Subject: kprobes: support per-kprobe disabling

Add disable_kprobe() and enable_kprobe() to disable/enable kprobes
temporarily.

disable_kprobe() asynchronously disables probe handlers of specified
kprobe.  So, after calling it, some handlers can be called at a while.
enable_kprobe() enables specified kprobe.

aggr_pre_handler and aggr_post_handler check disabled probes.  On the
other hand aggr_break_handler and aggr_fault_handler don't check it
because these handlers will be called while executing pre or post handlers
and usually those help error handling.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 141 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dae198b..a5e74dd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	struct kprobe *kp;
 
 	list_for_each_entry_rcu(kp, &p->list, list) {
-		if (kp->pre_handler && !kprobe_gone(kp)) {
+		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
 			set_kprobe_instance(kp);
 			if (kp->pre_handler(kp, regs))
 				return 1;
@@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	struct kprobe *kp;
 
 	list_for_each_entry_rcu(kp, &p->list, list) {
-		if (kp->post_handler && !kprobe_gone(kp)) {
+		if (kp->post_handler && likely(!kprobe_disabled(kp))) {
 			set_kprobe_instance(kp);
 			kp->post_handler(kp, regs, flags);
 			reset_kprobe_instance();
@@ -523,6 +523,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 */
 static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
+	BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
 	if (p->break_handler) {
 		if (ap->break_handler)
 			return -EEXIST;
@@ -532,6 +533,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 		list_add_rcu(&p->list, &ap->list);
 	if (p->post_handler && !ap->post_handler)
 		ap->post_handler = aggr_post_handler;
+
+	if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
+		ap->flags &= ~KPROBE_FLAG_DISABLED;
+		if (!kprobes_all_disarmed)
+			/* Arm the breakpoint again. */
+			arch_arm_kprobe(ap);
+	}
 	return 0;
 }
 
@@ -592,20 +600,36 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 			 * freed by unregister_kprobe.
 			 */
 			return ret;
-		/* Clear gone flag to prevent allocating new slot again. */
-		ap->flags &= ~KPROBE_FLAG_GONE;
+
 		/*
-		 * If the old_p has gone, its breakpoint has been disarmed.
-		 * We have to arm it again after preparing real kprobes.
+		 * Clear gone flag to prevent allocating new slot again, and
+		 * set disabled flag because it is not armed yet.
 		 */
-		if (!kprobes_all_disarmed)
-			arch_arm_kprobe(ap);
+		ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+			    | KPROBE_FLAG_DISABLED;
 	}
 
 	copy_kprobe(ap, p);
 	return add_new_kprobe(ap, p);
 }
 
+/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
+static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry_rcu(kp, &p->list, list) {
+		if (!kprobe_disabled(kp))
+			/*
+			 * There is an active probe on the list.
+			 * We can't disable aggr_kprobe.
+			 */
+			return 0;
+	}
+	p->flags |= KPROBE_FLAG_DISABLED;
+	return 1;
+}
+
 static int __kprobes in_kprobes_functions(unsigned long addr)
 {
 	struct kprobe_blackpoint *kb;
@@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p)
 		return -EINVAL;
 	}
 
-	p->flags = 0;
+	/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+	p->flags &= KPROBE_FLAG_DISABLED;
+
 	/*
 	 * Check if are we probing a module.
 	 */
@@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p)
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	if (!kprobes_all_disarmed)
+	if (!kprobes_all_disarmed && !kprobe_disabled(p))
 		arch_arm_kprobe(p);
 
 out_unlock_text:
@@ -724,25 +750,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/*
- * Unregister a kprobe without a scheduler synchronization.
- */
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
 {
 	struct kprobe *old_p, *list_p;
 
 	old_p = get_kprobe(p->addr);
 	if (unlikely(!old_p))
-		return -EINVAL;
+		return NULL;
 
 	if (p != old_p) {
 		list_for_each_entry_rcu(list_p, &old_p->list, list)
 			if (list_p == p)
 			/* kprobe p is a valid probe */
-				goto valid_p;
-		return -EINVAL;
+				goto valid;
+		return NULL;
 	}
-valid_p:
+valid:
+	return old_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+	struct kprobe *old_p, *list_p;
+
+	old_p = __get_valid_kprobe(p);
+	if (old_p == NULL)
+		return -EINVAL;
+
 	if (old_p == p ||
 	    (old_p->pre_handler == aggr_pre_handler &&
 	     list_is_singular(&old_p->list))) {
@@ -751,7 +789,7 @@ valid_p:
 		 * enabled and not gone - otherwise, the breakpoint would
 		 * already have been removed. We save on flushing icache.
 		 */
-		if (!kprobes_all_disarmed && !kprobe_gone(old_p)) {
+		if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) {
 			mutex_lock(&text_mutex);
 			arch_disarm_kprobe(p);
 			mutex_unlock(&text_mutex);
@@ -769,6 +807,11 @@ valid_p:
 		}
 noclean:
 		list_del_rcu(&p->list);
+		if (!kprobe_disabled(old_p)) {
+			try_to_disable_aggr_kprobe(old_p);
+			if (!kprobes_all_disarmed && kprobe_disabled(old_p))
+				arch_disarm_kprobe(old_p);
+		}
 	}
 	return 0;
 }
@@ -1078,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 static void __kprobes kill_kprobe(struct kprobe *p)
 {
 	struct kprobe *kp;
+
 	p->flags |= KPROBE_FLAG_GONE;
 	if (p->pre_handler == aggr_pre_handler) {
 		/*
@@ -1219,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
 	else
 		kprobe_type = "k";
 	if (sym)
-		seq_printf(pi, "%p  %s  %s+0x%x  %s %s\n", p->addr, kprobe_type,
-			sym, offset, (modname ? modname : " "),
-			(kprobe_gone(p) ? "[GONE]" : ""));
+		seq_printf(pi, "%p  %s  %s+0x%x  %s %s%s\n",
+			p->addr, kprobe_type, sym, offset,
+			(modname ? modname : " "),
+			(kprobe_gone(p) ? "[GONE]" : ""),
+			((kprobe_disabled(p) && !kprobe_gone(p)) ?
+			 "[DISABLED]" : ""));
 	else
-		seq_printf(pi, "%p  %s  %p %s\n", p->addr, kprobe_type, p->addr,
-			(kprobe_gone(p) ? "[GONE]" : ""));
+		seq_printf(pi, "%p  %s  %p %s%s\n",
+			p->addr, kprobe_type, p->addr,
+			(kprobe_gone(p) ? "[GONE]" : ""),
+			((kprobe_disabled(p) && !kprobe_gone(p)) ?
+			 "[DISABLED]" : ""));
 }
 
 static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1289,6 +1339,71 @@ static struct file_operations debugfs_kprobes_operations = {
 	.release        = seq_release,
 };
 
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If the probe is already disabled (or gone), just return */
+	if (kprobe_disabled(kp))
+		goto out;
+
+	kp->flags |= KPROBE_FLAG_DISABLED;
+	if (p != kp)
+		/* When kp != p, p is always enabled. */
+		try_to_disable_aggr_kprobe(p);
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p))
+		arch_disarm_kprobe(p);
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (kprobe_gone(kp)) {
+		/* This kprobe has gone, we couldn't enable it. */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p))
+		arch_arm_kprobe(p);
+
+	p->flags &= ~KPROBE_FLAG_DISABLED;
+	if (p != kp)
+		kp->flags &= ~KPROBE_FLAG_DISABLED;
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
 static void __kprobes arm_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -1306,7 +1421,7 @@ static void __kprobes arm_all_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
-			if (!kprobe_gone(p))
+			if (!kprobe_disabled(p))
 				arch_arm_kprobe(p);
 	}
 	mutex_unlock(&text_mutex);
@@ -1338,7 +1453,7 @@ static void __kprobes disarm_all_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist) {
-			if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
+			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
 				arch_disarm_kprobe(p);
 		}
 	}
-- 
cgit v1.1


From 6279a751fe096a21dc7704e918d570d3ff06e769 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 27 Mar 2009 01:06:07 +0100
Subject: posix-timers: fix RLIMIT_CPU && fork()

See http://bugzilla.kernel.org/show_bug.cgi?id=12911

copy_signal() copies signal->rlim, but RLIMIT_CPU is "lost". Because
posix_cpu_timers_init_group() sets cputime_expires.prof_exp = 0 and thus
fastpath_timer_check() returns false unless we have other expired cpu timers.

Change copy_signal() to set cputime_expires.prof_exp if we have RLIMIT_CPU.
Also, set cputimer.running = 1 in that case. This is not strictly necessary,
but imho makes sense.

Reported-by: Peter Lojkin <ia6432@inbox.ru>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Peter Lojkin <ia6432@inbox.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: stable@kernel.org
LKML-Reference: <20090327000607.GA10104@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4854c2c..9b51a1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -808,6 +808,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	sig->cputime_expires.virt_exp = cputime_zero;
 	sig->cputime_expires.sched_exp = 0;
 
+	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+		sig->cputime_expires.prof_exp =
+			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+		sig->cputimer.running = 1;
+	}
+
 	/* The timer lists. */
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -823,11 +829,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 		atomic_inc(&current->signal->live);
 		return 0;
 	}
-	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
-
-	if (sig)
-		posix_cpu_timers_init_group(sig);
 
+	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
 	if (!sig)
 		return -ENOMEM;
@@ -865,6 +868,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
+	posix_cpu_timers_init_group(sig);
+
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
-- 
cgit v1.1


From 8f2e586567b1bad72dac7c3810fe9a2ef7117506 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 27 Mar 2009 01:06:10 +0100
Subject: posix-timers: fix RLIMIT_CPU && setitimer(CPUCLOCK_PROF)

update_rlimit_cpu() tries to optimize out set_process_cpu_timer() in case
when we already have CPUCLOCK_PROF timer which should expire first. But it
uses cputime_lt() instead of cputime_gt().

Test case:

	int main(void)
	{
		struct itimerval it = {
			.it_value = { .tv_sec = 1000 },
		};

		assert(!setitimer(ITIMER_PROF, &it, NULL));

		struct rlimit rl = {
			.rlim_cur = 1,
			.rlim_max = 1,
		};

		assert(!setrlimit(RLIMIT_CPU, &rl));

		for (;;)
			;

		return 0;
	}

Without this patch, the task is not killed as RLIMIT_CPU demands.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Peter Lojkin <ia6432@inbox.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: stable@kernel.org
LKML-Reference: <20090327000610.GA10108@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8e5d9a6..bb53185 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
 
 	cputime = secs_to_cputime(rlim_new);
 	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-	    cputime_lt(current->signal->it_prof_expires, cputime)) {
+	    cputime_gt(current->signal->it_prof_expires, cputime)) {
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
-- 
cgit v1.1


From 3a709703538c471530405556dda136fd0d82b0dc Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 7 Apr 2009 23:21:06 -0700
Subject: ptrace: some checkpatch fixes

This fixes all the checkpatch --file complaints about kernel/ptrace.c
and also removes an unused #include.  I've verified that there are no
changes to the compiled code on x86_64.

Signed-off-by: Roland McGrath <roland@redhat.com>
[ Removed the parts that just split a line  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec..64191fa0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,9 +21,7 @@
 #include <linux/audit.h>
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 
 /*
@@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 	list_add(&child->ptrace_entry, &new_parent->ptraced);
 	child->parent = new_parent;
 }
- 
+
 /*
  * Turn a tracing stop into a normal stop now, since with no tracer there
  * would be no way to wake it up with SIGCONT or SIGKILL.  If there was a
@@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	task_lock(task);
 	err = __ptrace_may_access(task, mode);
 	task_unlock(task);
-	return (!err ? true : false);
+	return !err;
 }
 
 int ptrace_attach(struct task_struct *task)
@@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
 		copied += retval;
 		src += retval;
 		dst += retval;
-		len -= retval;			
+		len -= retval;
 	}
 	return copied;
 }
@@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 		copied += retval;
 		src += retval;
 		dst += retval;
-		len -= retval;			
+		len -= retval;
 	}
 	return copied;
 }
@@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
 		if (unlikely(!arch_has_single_step()))
 			return -EIO;
 		user_enable_single_step(child);
-	}
-	else
+	} else {
 		user_disable_single_step(child);
+	}
 
 	child->exit_code = data;
 	wake_up_process(child);
-- 
cgit v1.1


From 3217ab97f14c5c8f9f975ed8c40c351164b0b10e Mon Sep 17 00:00:00 2001
From: Vitaliy Gusev <vgusev@openvz.org>
Date: Thu, 9 Apr 2009 09:50:35 -0600
Subject: kthread: Don't looking for a task in create_kthread() #2

Remove the unnecessary find_task_by_pid_ns(). kthread() can just
use "current" to get the same result.

Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/kthread.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84bbadd..c013bf0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
+	create->result = current;
 	complete(&create->started);
 	schedule();
 
@@ -101,9 +102,6 @@ static void create_kthread(struct kthread_create_info *create)
 	} else {
 		struct sched_param param = { .sched_priority = 0 };
 		wait_for_completion(&create->started);
-		read_lock(&tasklist_lock);
-		create->result = find_task_by_pid_ns(pid, &init_pid_ns);
-		read_unlock(&tasklist_lock);
 		/*
 		 * root may have changed our (kthreadd's) priority or CPU mask.
 		 * The kernel thread should not inherit these properties.
-- 
cgit v1.1


From 1c99315bb36b5d776210546d438ca928dc9b1f22 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 9 Apr 2009 09:50:36 -0600
Subject: kthread: move sched-realeted initialization from kthreadd context

kthreadd is the single thread which implements ths "create" request, move
sched_setscheduler/etc from create_kthread() to kthread_create() to
improve the scalability.

We should be careful with sched_setscheduler(), use _nochek helper.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Vitaliy Gusev <vgusev@openvz.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/kthread.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index c013bf0..4ebaf85 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,19 +97,10 @@ static void create_kthread(struct kthread_create_info *create)
 
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
-	if (pid < 0) {
+	if (pid < 0)
 		create->result = ERR_PTR(pid);
-	} else {
-		struct sched_param param = { .sched_priority = 0 };
+	else
 		wait_for_completion(&create->started);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler(create->result, SCHED_NORMAL, &param);
-		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed_ptr(create->result, cpu_all_mask);
-	}
 	complete(&create->done);
 }
 
@@ -152,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	wait_for_completion(&create.done);
 
 	if (!IS_ERR(create.result)) {
+		struct sched_param param = { .sched_priority = 0 };
 		va_list args;
+
 		va_start(args, namefmt);
 		vsnprintf(create.result->comm, sizeof(create.result->comm),
 			  namefmt, args);
 		va_end(args);
+		/*
+		 * root may have changed our (kthreadd's) priority or CPU mask.
+		 * The kernel thread should not inherit these properties.
+		 */
+		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
+		set_user_nice(create.result, KTHREAD_NICE_LEVEL);
+		set_cpus_allowed_ptr(create.result, cpu_all_mask);
 	}
 	return create.result;
 }
-- 
cgit v1.1


From 6b44003e5ca66a3fffeb5bc90f40ada2c4340896 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 9 Apr 2009 09:50:37 -0600
Subject: work_on_cpu(): rewrite it to create a kernel thread on demand

Impact: circular locking bugfix

The various implemetnations and proposed implemetnations of work_on_cpu()
are vulnerable to various deadlocks because they all used queues of some
form.

Unrelated pieces of kernel code thus gained dependencies wherein if one
work_on_cpu() caller holds a lock which some other work_on_cpu() callback
also takes, the kernel could rarely deadlock.

Fix this by creating a short-lived kernel thread for each work_on_cpu()
invokation.

This is not terribly fast, but the only current caller of work_on_cpu() is
pci_call_probe().

It would be nice to find some other way of doing the node-local
allocations in the PCI probe code so that we can zap work_on_cpu()
altogether.  The code there is rather nasty.  I can't think of anything
simple at this time...

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/workqueue.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6b966c..f71fb2a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -966,20 +966,20 @@ undo:
 }
 
 #ifdef CONFIG_SMP
-static struct workqueue_struct *work_on_cpu_wq __read_mostly;
 
 struct work_for_cpu {
-	struct work_struct work;
+	struct completion completion;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static void do_work_for_cpu(struct work_struct *w)
+static int do_work_for_cpu(void *_wfc)
 {
-	struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
-
+	struct work_for_cpu *wfc = _wfc;
 	wfc->ret = wfc->fn(wfc->arg);
+	complete(&wfc->completion);
+	return 0;
 }
 
 /**
@@ -990,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w)
  *
  * This will return the value @fn returns.
  * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct work_for_cpu wfc;
-
-	INIT_WORK(&wfc.work, do_work_for_cpu);
-	wfc.fn = fn;
-	wfc.arg = arg;
-	queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
-	flush_work(&wfc.work);
-
+	struct task_struct *sub_thread;
+	struct work_for_cpu wfc = {
+		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+		.fn = fn,
+		.arg = arg,
+	};
+
+	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+	if (IS_ERR(sub_thread))
+		return PTR_ERR(sub_thread);
+	kthread_bind(sub_thread, cpu);
+	wake_up_process(sub_thread);
+	wait_for_completion(&wfc.completion);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -1016,8 +1022,4 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
-#ifdef CONFIG_SMP
-	work_on_cpu_wq = create_workqueue("work_on_cpu");
-	BUG_ON(!work_on_cpu_wq);
-#endif
 }
-- 
cgit v1.1


From 47788c58e66c050982241d9a05eb690daceb05a9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 8 Apr 2009 20:40:59 +0200
Subject: tracing/syscalls: use a dedicated file header

Impact: fix build warnings and possibe compat misbehavior on IA64

Building a kernel on ia64 might trigger these ugly build warnings:

CC      arch/ia64/ia32/sys_ia32.o
In file included from arch/ia64/ia32/sys_ia32.c:55:
arch/ia64/ia32/ia32priv.h:290:1: warning: "elf_check_arch" redefined
In file included from include/linux/elf.h:7,
                 from include/linux/module.h:14,
                 from include/linux/ftrace.h:8,
                 from include/linux/syscalls.h:68,
                 from arch/ia64/ia32/sys_ia32.c:18:
arch/ia64/include/asm/elf.h:19:1: warning: this is the location of the previous definition
[...]

sys_ia32.c includes linux/syscalls.h which in turn includes linux/ftrace.h
to import the syscalls tracing prototypes.

But including ftrace.h can pull too much things for a low level file,
especially on ia64 where the ia32 private headers conflict with higher
level headers.

Now we isolate the syscall tracing headers in their own lightweight file.

Reported-by: Tony Luck <tony.luck@intel.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jason Baron <jbaron@redhat.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Jiaying Zhang <jiayingz@google.com>
Cc: Michael Rubin <mrubin@google.com>
Cc: Martin Bligh <mbligh@google.com>
Cc: Michael Davidson <md@google.com>
LKML-Reference: <20090408184058.GB6017@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a3af2..5e57964 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,5 @@
+#include <trace/syscall.h>
 #include <linux/kernel.h>
-#include <linux/ftrace.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
-- 
cgit v1.1


From 9eb85125ce218a8b8d9a7c982510388e227adbec Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 9 Apr 2009 11:19:40 +0800
Subject: blktrace: pass the right pointer to kfree()

Impact: fix kfree crash with non-standard act_mask string

If passing a string with leading white spaces to strstrip(),
the returned ptr != the original ptr.

This bug was introduced by me.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49DD694C.8020902@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b32ff44..921ef5d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1377,12 +1377,12 @@ static int blk_trace_str2mask(const char *str)
 {
 	int i;
 	int mask = 0;
-	char *s, *token;
+	char *buf, *s, *token;
 
-	s = kstrdup(str, GFP_KERNEL);
-	if (s == NULL)
+	buf = kstrdup(str, GFP_KERNEL);
+	if (buf == NULL)
 		return -ENOMEM;
-	s = strstrip(s);
+	s = strstrip(buf);
 
 	while (1) {
 		token = strsep(&s, ",");
@@ -1403,7 +1403,7 @@ static int blk_trace_str2mask(const char *str)
 			break;
 		}
 	}
-	kfree(s);
+	kfree(buf);
 
 	return mask;
 }
-- 
cgit v1.1


From 36cd3c9f925b9307236505ae7ad1ad7ac4d4357c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 9 Apr 2009 18:48:34 +0200
Subject: mutex: have non-spinning mutexes on s390 by default

Impact: performance regression fix for s390

The adaptive spinning mutexes will not always do what one would expect on
virtualized architectures like s390. Especially the cpu_relax() loop in
mutex_spin_on_owner might hurt if the mutex holding cpu has been scheduled
away by the hypervisor.

We would end up in a cpu_relax() loop when there is no chance that the
state of the mutex changes until the target cpu has been scheduled again by
the hypervisor.

For that reason we should change the default behaviour to no-spin on s390.

We do have an instruction which allows to yield the current cpu in favour of
a different target cpu. Also we have an instruction which allows us to figure
out if the target cpu is physically backed.

However we need to do some performance tests until we can come up with
a solution that will do the right thing on s390.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
LKML-Reference: <20090409184834.7a0df7b2@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781..507cf2b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	preempt_disable();
 	mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
+    !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
 	/*
 	 * Optimistic spinning.
 	 *
-- 
cgit v1.1


From d1e7e02f30be672c6f6ee40908be83877a0d49d1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 15:16:56 +0800
Subject: tracing: disable seeking for trace_pipe_raw

Impact: disable pread()

We set tracing_buffers_fops.llseek to no_llseek,
but we can still perform pread() to read this file.

That is not expected.

This fix uses nonseekable_open() to disable it.

tracing_buffers_fops.llseek is still set to no_llseek,
it mark this file is a "non-seekable device" and is used by
sys_splice(). See also do_splice() or manual of splice(2):

ERRORS
       EINVAL Target file system doesn't support  splicing;
              neither  of the descriptors refers to a pipe;
              or offset given for non-seekable device.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49D46668.8030806@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d28476..24b0168 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3285,7 +3285,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = info;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 
  out:
 	kfree(info);
-- 
cgit v1.1


From ddd538f3e6a1a4bec2f6942f83a753263e6577b4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 15:16:59 +0800
Subject: tracing: allocate page when needed

Impact: Cleanup

Sometimes, we open trace_pipe_raw, but we don't read(2) it,
we just splice(2) it, thus, the page is not used.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49D4666B.4010608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 24b0168..8e189ff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3277,19 +3277,13 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	info->tr	= &global_trace;
 	info->cpu	= cpu;
-	info->spare	= ring_buffer_alloc_read_page(info->tr->buffer);
+	info->spare	= NULL;
 	/* Force reading ring buffer for first read */
 	info->read	= (unsigned int)-1;
-	if (!info->spare)
-		goto out;
 
 	filp->private_data = info;
 
 	return nonseekable_open(inode, filp);
-
- out:
-	kfree(info);
-	return -ENOMEM;
 }
 
 static ssize_t
@@ -3304,6 +3298,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (!count)
 		return 0;
 
+	if (!info->spare)
+		info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+	if (!info->spare)
+		return -ENOMEM;
+
 	/* Do we have previous read data to read? */
 	if (info->read < PAGE_SIZE)
 		goto read;
@@ -3342,7 +3341,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 {
 	struct ftrace_buffer_info *info = file->private_data;
 
-	ring_buffer_free_read_page(info->tr->buffer, info->spare);
+	if (info->spare)
+		ring_buffer_free_read_page(info->tr->buffer, info->spare);
 	kfree(info);
 
 	return 0;
-- 
cgit v1.1


From c7625a555f55d7ae49236cde551786c88f5a5ce1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 15:17:04 +0800
Subject: tracing: update file->f_pos when splice(2) it

Impact: Cleanup

These two lines:

	if (unlikely(*ppos))
		return -ESPIPE;

in tracing_buffers_splice_read() are not needed, VFS layer
has disabled seek(2).

We remove these two lines, and then we can update file->f_pos.

And tracing_buffers_read() updates file->f_pos, this fix
make tracing_buffers_splice_read() updates file->f_pos too.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49D46670.4010503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8e189ff..9462976 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3428,13 +3428,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	int size, i;
 	size_t ret;
 
-	/*
-	 * We can't seek on a buffer input
-	 */
-	if (unlikely(*ppos))
-		return -ESPIPE;
-
-
 	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
 		struct page *page;
 		int r;
@@ -3474,6 +3467,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].offset = 0;
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
+		*ppos += size;
 	}
 
 	spd.nr_pages = i;
-- 
cgit v1.1


From 93cfb3c9fd83d877a8f1ffad9ff862b617b32828 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 15:17:08 +0800
Subject: tracing: fix splice return too large

I got these from strace:

 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 12288
 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 12288
 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 12288
 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 16384
 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 8192
 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 8192
 splice(0x3, 0, 0x5, 0, 0x1000, 0x1) = 8192

I wanted to splice_read 4096 bytes, but it returns 8192 or larger.

It is because the return value of tracing_buffers_splice_read()
does not include "zero out any left over data" bytes.

But tracing_buffers_read() includes these bytes, we make them
consistent.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49D46674.9030804@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9462976..1ce5dc6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3428,7 +3428,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	int size, i;
 	size_t ret;
 
-	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
+	if (*ppos & (PAGE_SIZE - 1)) {
+		WARN_ONCE(1, "Ftrace: previous read must page-align\n");
+		return -EINVAL;
+	}
+
+	if (len & (PAGE_SIZE - 1)) {
+		WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
+		if (len < PAGE_SIZE)
+			return -EINVAL;
+		len &= PAGE_MASK;
+	}
+
+	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3467,7 +3479,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].offset = 0;
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
-		*ppos += size;
+		*ppos += PAGE_SIZE;
 	}
 
 	spd.nr_pages = i;
-- 
cgit v1.1


From 4d1f4372dbea068ba4ee3d98231133a4a4ee15bd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 10 Apr 2009 08:48:36 +0800
Subject: tracing: fix document references

When moving documents to Documentation/trace/, I forgot to
grep Kconfig to find out those references.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Pekka Paalanen <pq@iki.fi>
Cc: eduard.munteanu@linux360.ro
LKML-Reference: <49DE97EF.7080208@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2246141..417d198 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -312,7 +312,7 @@ config KMEMTRACE
 	  and profile kernel code.
 
 	  This requires an userspace application to use. See
-	  Documentation/vm/kmemtrace.txt for more information.
+	  Documentation/trace/kmemtrace.txt for more information.
 
 	  Saying Y will make the kernel somewhat larger and slower. However,
 	  if you disable kmemtrace at run-time or boot-time, the performance
@@ -403,7 +403,7 @@ config MMIOTRACE
 	  implementation and works via page faults. Tracing is disabled by
 	  default and can be enabled at run-time.
 
-	  See Documentation/tracers/mmiotrace.txt.
+	  See Documentation/trace/mmiotrace.txt.
 	  If you are not helping to develop drivers, say N.
 
 config MMIOTRACE_TEST
-- 
cgit v1.1


From 0462b5664b2bda5a18fef7efb5bb32ce36590c1a Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 8 Apr 2009 17:00:13 +0800
Subject: ftrace: Output REC->var instead of __entry->var for trace format

print fmt: "irq=%d return=%s", __entry->irq, __entry->ret ? \"handled\" : \"unhandled\"

"__entry" should be convert to "REC" by __stringify() macro.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49DC679D.2090901@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_stage_2.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7..d363c66 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 		return 0;
 
 #undef __entry
-#define __entry "REC"
+#define __entry REC
 
 #undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
 
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
-- 
cgit v1.1


From d6de2c80e9d758d2e36c21699117db6178c0f517 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 10 Apr 2009 12:17:41 -0700
Subject: async: Fix module loading async-work regression

Several drivers use asynchronous work to do device discovery, and we
synchronize with them in the compiled-in case before we actually try to
mount root filesystems etc.

However, when compiled as modules, that synchronization is missing - the
module loading completes, but the driver hasn't actually finished
probing for devices, and that means that any user mode that expects to
use the devices after the 'insmod' is now potentially broken.

We already saw one case of a similar issue in the ACPI battery code,
where the kernel itself expected the module to be all done, and unmapped
the init memory - but the async device discovery was still running.
That got hacked around by just removing the "__init" (see commit
5d38258ec026921a7b266f4047ebeaa75db358e5 "ACPI battery: fix async boot
oops"), but the real fix is to just make the module loading wait for all
async work to be completed.

It will slow down module loading, but since common devices should be
built in anyway, and since the bug is really annoying and hard to handle
from user space (and caused several S3 resume regressions), the simple
fix to wait is the right one.

This fixes at least

	http://bugzilla.kernel.org/show_bug.cgi?id=13063

but probably a few other bugzilla entries too (12936, for example), and
is confirmed to fix Rafael's storage driver breakage after resume bug
report (no bugzilla entry).

We should also be able to now revert that ACPI battery fix.

Reported-and-tested-by: Rafael J. Wysocki <rjw@suse.com>
Tested-by: Heinz Diehl <htd@fancy-poultry.org>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 05f014e..e797812 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2388,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_LIVE, mod);
 
+	/* We need to finish all async code before the module init sequence is done */
+	async_synchronize_full();
+
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
-- 
cgit v1.1


From 8433a40eb7f2c4883ad57f9900f63e4d59240eb7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 11 Apr 2009 15:52:18 +0800
Subject: tracing/filters: NIL-terminate user input filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure messages from user space are NIL-terminated strings,
otherwise we could dump random memory while reading filter file.

Try this:
 # echo 'parent_comm ==' > events/sched/sched_process_fork/filter
 # cat events/sched/sched_process_fork/filter
 parent_comm == �

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E04C32.6060508@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d2..054bc18 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
+	buf[cnt] = '\0';
 
 	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 	if (!pred)
@@ -569,6 +570,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
+	buf[cnt] = '\0';
 
 	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 	if (!pred)
-- 
cgit v1.1


From bcabd91c271e50eebc0cb9220ac92700332b452e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 11 Apr 2009 15:52:35 +0800
Subject: tracing/filters: fix NULL pointer dereference

Try this, and you'll see NULL pointer dereference bug:

  # echo -n 'parent_comm ==' > sched/sched_process_fork/filter

Because we passed NULL ptr to simple_strtoull().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E04C43.1050504@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be41..9d2162f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -410,6 +410,11 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		}
 	}
 
+	if (!val_str) {
+		pred->field_name = NULL;
+		return -EINVAL;
+	}
+
 	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 	if (!pred->field_name)
 		return -ENOMEM;
-- 
cgit v1.1


From a3e0ab050774117d4a6173087c8bf3888662a83f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 11 Apr 2009 15:52:51 +0800
Subject: tracing/filters: allow user input integer to be oct or hex

Before patch:

 # echo 'parent_pid == 0x10' > events/sched/sched_process_fork/filter
 # cat sched/sched_process_fork/filter
 parent_pid == 0

After patch:

 # cat sched/sched_process_fork/filter
 parent_pid == 16

Also check the input more strictly.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E04C53.4010600@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9d2162f..49b3ef5 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -419,12 +419,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 	if (!pred->field_name)
 		return -ENOMEM;
 
-	pred->val = simple_strtoull(val_str, &tmp, 10);
+	pred->val = simple_strtoull(val_str, &tmp, 0);
 	if (tmp == val_str) {
 		pred->str_val = kstrdup(val_str, GFP_KERNEL);
 		if (!pred->str_val)
 			return -ENOMEM;
-	}
+	} else if (*tmp != '\0')
+		return -EINVAL;
 
 	return 0;
 }
-- 
cgit v1.1


From 44e9c8b7adc52079f0535f9de0c2c2477831389b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 11 Apr 2009 15:55:28 +0800
Subject: tracing/filters: return proper error code when writing filter file

- propagate return value of filter_add_pred() to the user

- return -ENOSPC but not -ENOMEM or -EINVAL when the filter array
  is full

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E04CF0.3010105@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c        | 10 ++++++----
 kernel/trace/trace_events_filter.c |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 054bc18..576f4fa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -521,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return cnt;
 	}
 
-	if (filter_add_pred(call, pred)) {
+	err = filter_add_pred(call, pred);
+	if (err < 0) {
 		filter_free_pred(pred);
-		return -EINVAL;
+		return err;
 	}
 
 	*ppos += cnt;
@@ -588,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return cnt;
 	}
 
-	if (filter_add_subsystem_pred(system, pred)) {
+	err = filter_add_subsystem_pred(system, pred);
+	if (err < 0) {
 		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
-		return -EINVAL;
+		return err;
 	}
 
 	*ppos += cnt;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 49b3ef5..e03cbf1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -215,7 +215,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 		}
 	}
 
-	return -ENOMEM;
+	return -ENOSPC;
 }
 
 static int is_string_field(const char *type)
@@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	}
 
 	if (i == MAX_FILTER_PRED)
-		return -EINVAL;
+		return -ENOSPC;
 
 	events_for_each(call) {
 		int err;
-- 
cgit v1.1


From 9eeba6138cefc0435695463ddadb0d95e0a6bcd2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 11 Apr 2009 03:17:17 +0200
Subject: lockdep: warn about lockdep disabling after kernel taint

Impact: provide useful missing info for developers

Kernel taint can occur in several situations such as warnings,
load of prorietary or staging modules, bad page, etc...

But when such taint happens, a developer might still be working on
the kernel, expecting that lockdep is still enabled. But a taint
disables lockdep without ever warning about it.
Such a kernel behaviour doesn't really help for kernel development.

This patch adds this missing warning.

Since the taint is done most of the time after the main message that
explain the real source issue, it seems safe to warn about it inside
add_taint() so that it appears at last, without hurting the main
information.

v2: Use a generic helper to disable lockdep instead of an
    open coded xchg().

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1239412638-6739-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 3fd8c5b..940ca14 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -213,8 +213,14 @@ unsigned long get_taint(void)
 
 void add_taint(unsigned flag)
 {
-	/* can't trust the integrity of the kernel anymore: */
-	debug_locks = 0;
+	/*
+	 * Can't trust the integrity of the kernel anymore.
+	 * We don't call directly debug_locks_off() because the issue
+	 * is not necessarily serious enough to set oops_in_progress to 1
+	 */
+	if (__debug_locks_off())
+		printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
+
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
-- 
cgit v1.1


From 574bbe782057fdf0490dc7dec906a2dc26363e20 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 11 Apr 2009 03:17:18 +0200
Subject: lockdep: continue lock debugging despite some taints

Impact: broaden lockdep checks

Lockdep is disabled after any kernel taints. This might be convenient
to ignore bad locking issues which sources come from outside the kernel
tree. Nevertheless, it might be a frustrating experience for the
staging developers or those who experience a warning but are focused
on another things that require lockdep.

The v2 of this patch simply don't disable anymore lockdep in case
of TAINT_CRAP and TAINT_WARN events.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: LTP <ltp-list@lists.sourceforge.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg KH <gregkh@suse.de>
LKML-Reference: <1239412638-6739-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 940ca14..934fb37 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -217,8 +217,10 @@ void add_taint(unsigned flag)
 	 * Can't trust the integrity of the kernel anymore.
 	 * We don't call directly debug_locks_off() because the issue
 	 * is not necessarily serious enough to set oops_in_progress to 1
+	 * Also we want to keep up lockdep for staging development and
+	 * post-warning case.
 	 */
-	if (__debug_locks_off())
+	if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
 		printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
 
 	set_bit(flag, &tainted_mask);
-- 
cgit v1.1


From c751085943362143f84346d274e0011419c84202 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 12 Apr 2009 20:06:56 +0200
Subject: PM/Hibernate: Wait for SCSI devices scan to complete during resume

There is a race between resume from hibernation and the asynchronous
scanning of SCSI devices and to prevent it from happening we need to
call scsi_complete_async_scans() during resume from hibernation.

In addition, if the resume from hibernation is userland-driven, it's
better to wait for all device probes in the kernel to complete before
attempting to open the resume device.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/disk.c | 8 ++++++++
 kernel/power/user.c | 9 +++++++++
 2 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5f21ab2..0854770 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <scsi/scsi_scan.h>
 #include <asm/suspend.h>
 
 #include "power.h"
@@ -645,6 +646,13 @@ static int software_resume(void)
 		return 0;
 
 	/*
+	 * We can't depend on SCSI devices being available after loading one of
+	 * their modules if scsi_complete_async_scans() is not called and the
+	 * resume device usually is a SCSI one.
+	 */
+	scsi_complete_async_scans();
+
+	/*
 	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
 	 * is configured into the kernel. Since the regular hibernate
 	 * trigger path is via sysfs which takes a buffer mutex before
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6c85359..ed97375 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,6 +24,7 @@
 #include <linux/cpu.h>
 #include <linux/freezer.h>
 #include <linux/smp_lock.h>
+#include <scsi/scsi_scan.h>
 
 #include <asm/uaccess.h>
 
@@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	filp->private_data = data;
 	memset(&data->handle, 0, sizeof(struct snapshot_handle));
 	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+		/* Hibernating.  The image device should be accessible. */
 		data->swap = swsusp_resume_device ?
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
@@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		if (error)
 			pm_notifier_call_chain(PM_POST_HIBERNATION);
 	} else {
+		/*
+		 * Resuming.  We may need to wait for the image device to
+		 * appear.
+		 */
+		wait_for_device_probe();
+		scsi_complete_async_scans();
+
 		data->swap = -1;
 		data->mode = O_WRONLY;
 		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
-- 
cgit v1.1


From a3d03ecaf9fe722bf96e4ef4a2f5e42ef652ddeb Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 13 Apr 2009 15:23:53 +0800
Subject: tracing: Fix power tracer header

Before patch:
  # tracer: power
  #
  #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
  #              | |       |          |         |
  [  676.875865889] CSTATE: Going to C1 on cpu 0 for 0.005911463
  [  676.882938805] CSTATE: Going to C1 on cpu 0 for 0.104796532
  ...

After patch:
  # tracer: power
  #
  #   TIMESTAMP      STATE  EVENT
  #       |            |      |
  [  676.875865889] CSTATE: Going to C1 on cpu 0 for 0.005911463
  [  676.882938805] CSTATE: Going to C1 on cpu 0 for 0.104796532
  ...

v2: Use seq_puts instead of seq_printf

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49E2E889.5000903@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_power.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bae791e..1184397 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -186,6 +186,12 @@ static enum print_line_t power_print_line(struct trace_iterator *iter)
 	return TRACE_TYPE_UNHANDLED;
 }
 
+static void power_print_header(struct seq_file *s)
+{
+	seq_puts(s, "#   TIMESTAMP      STATE  EVENT\n");
+	seq_puts(s, "#       |            |      |\n");
+}
+
 static struct tracer power_tracer __read_mostly =
 {
 	.name		= "power",
@@ -194,6 +200,7 @@ static struct tracer power_tracer __read_mostly =
 	.stop		= stop_power_trace,
 	.reset		= power_trace_reset,
 	.print_line	= power_print_line,
+	.print_header	= power_print_header,
 };
 
 static int init_power_trace(void)
-- 
cgit v1.1


From 4be6f6bb66111c9468733a4ed9cad10dc3a762c0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Apr 2009 14:39:33 -0700
Subject: mm: move the scan_unevictable_pages sysctl to the vm table

vm knobs should go in the vm table.  Probably too late for
randomize_va_space though.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4286b62..e3d2c7d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -902,16 +902,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler   = &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_UNEVICTABLE_LRU
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "scan_unevictable_pages",
-		.data		= &scan_unevictable_pages,
-		.maxlen		= sizeof(scan_unevictable_pages),
-		.mode		= 0644,
-		.proc_handler	= &scan_unevictable_handler,
-	},
-#endif
 #ifdef CONFIG_SLOW_WORK
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -1302,6 +1292,16 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "scan_unevictable_pages",
+		.data		= &scan_unevictable_pages,
+		.maxlen		= sizeof(scan_unevictable_pages),
+		.mode		= 0644,
+		.proc_handler	= &scan_unevictable_handler,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.1


From f1671f6d783a2385d32e11f456cbe32f0e4b4b49 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 13 Apr 2009 14:40:03 -0700
Subject: ptrace: fix exit_ptrace() vs ptrace_traceme() race

Pointed out by Roland.  The bug was recently introduced by me in
"forget_original_parent: split out the un-ptrace part", commit
39c626ae47c469abdfd30c6e42eff884931380d6.

Since that patch we have a window after exit_ptrace() drops tasklist and
before forget_original_parent() takes it again.  In this window the child
can do ptrace(PTRACE_TRACEME) and nobody can untrace this child after
that.

Change ptrace_traceme() to not attach to the exiting ->real_parent.  We
don't report the error in this case, we pretend we attach right before
->real_parent calls exit_ptrace() which should untrace us anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 64191fa0..dfcd83c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -604,10 +604,11 @@ repeat:
 		ret = security_ptrace_traceme(current->parent);
 
 		/*
-		 * Set the ptrace bit in the process ptrace flags.
-		 * Then link us on our parent's ptraced list.
+		 * Check PF_EXITING to ensure ->real_parent has not passed
+		 * exit_ptrace(). Otherwise we don't report the error but
+		 * pretend ->real_parent untraces us right after return.
 		 */
-		if (!ret) {
+		if (!ret && !(current->real_parent->flags & PF_EXITING)) {
 			current->ptrace |= PT_PTRACED;
 			__ptrace_link(current, current->real_parent);
 		}
-- 
cgit v1.1


From 3d26dcf7679c5cc6c9f3b95ffdb2152fba2b7fae Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 13 Apr 2009 14:40:08 -0700
Subject: kernel/sys.c: clean up sys_shutdown exit path

Impact: cleanup, fix

Clean up sys_shutdown() exit path.  Factor out common code.  Return
correct error code instead of always 0 on failure.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 51dbb55..e7998cf 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -360,6 +360,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		void __user *, arg)
 {
 	char buffer[256];
+	int ret = 0;
 
 	/* We only trust the superuser with rebooting the system. */
 	if (!capable(CAP_SYS_BOOT))
@@ -397,7 +398,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		kernel_halt();
 		unlock_kernel();
 		do_exit(0);
-		break;
+		panic("cannot halt");
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		kernel_power_off();
@@ -417,29 +418,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 
 #ifdef CONFIG_KEXEC
 	case LINUX_REBOOT_CMD_KEXEC:
-		{
-			int ret;
-			ret = kernel_kexec();
-			unlock_kernel();
-			return ret;
-		}
+		ret = kernel_kexec();
+		break;
 #endif
 
 #ifdef CONFIG_HIBERNATION
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
-		{
-			int ret = hibernate();
-			unlock_kernel();
-			return ret;
-		}
+		ret = hibernate();
+		break;
 #endif
 
 	default:
-		unlock_kernel();
-		return -EINVAL;
+		ret = -EINVAL;
+		break;
 	}
 	unlock_kernel();
-	return 0;
+	return ret;
 }
 
 static void deferred_cad(struct work_struct *dummy)
-- 
cgit v1.1


From 132380a06b24704fd6c9be55c44d4ef3972cead2 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 14:18:25 +0800
Subject: tracing, sched: mark get_parent_ip() notrace

Impact: remove overly redundant tracing entries

When tracer is "function" or "function_graph", way too much
"get_parent_ip" entries are recorded in ring_buffer.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49D458B1.5000703@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5724508..e90e70e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4846,7 +4846,7 @@ void scheduler_tick(void)
 #endif
 }
 
-unsigned long get_parent_ip(unsigned long addr)
+notrace unsigned long get_parent_ip(unsigned long addr)
 {
 	if (in_lock_functions(addr)) {
 		addr = CALLER_ADDR2;
-- 
cgit v1.1


From 557055bebe9212dfa6b9f5df811dfd0dac77ec55 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 13 Apr 2009 16:02:34 +0800
Subject: tracing: Fix branch tracer header

Before patch:

  # tracer: branch
  #
  #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
  #              | |       |          |         |
             <...>-2981  [000] 24008.872738: [  ok  ] trace_irq_handler_exit:irq_event_types.h:41
             <...>-2981  [000] 24008.872742: [  ok  ] note_interrupt:spurious.c:229
  ...

After patch:

  # tracer: branch
  #
  #           TASK-PID    CPU#    TIMESTAMP  CORRECT  FUNC:FILE:LINE
  #              | |       |          |         |       |
             <...>-2985  [000] 26329.142970: [  ok  ] slab_free:slub.c:1776
             <...>-2985  [000] 26329.142972: [  ok  ] trace_kmem_cache_free:kmem_event_types.h:191
  ...

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49E2F19A.3040006@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ad8c22e..8333715 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -155,6 +155,13 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 	return TRACE_TYPE_HANDLED;
 }
 
+static void branch_print_header(struct seq_file *s)
+{
+	seq_puts(s, "#           TASK-PID    CPU#    TIMESTAMP  CORRECT"
+		"  FUNC:FILE:LINE\n");
+	seq_puts(s, "#              | |       |          |         |   "
+		"    |\n");
+}
 
 static struct trace_event trace_branch_event = {
 	.type		= TRACE_BRANCH,
@@ -169,6 +176,7 @@ static struct tracer branch_trace __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_branch,
 #endif /* CONFIG_FTRACE_SELFTEST */
+	.print_header	= branch_print_header,
 };
 
 __init static int init_branch_tracer(void)
-- 
cgit v1.1


From ef631b0ca01655d24e9ca7e199262c4a46416a26 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 Apr 2009 21:31:16 -0700
Subject: rcu: Make hierarchical RCU less IPI-happy

This patch fixes a hierarchical-RCU performance bug located by Anton
Blanchard.  The problem stems from a misguided attempt to provide a
work-around for jiffies-counter failure.  This work-around uses a per-CPU
n_rcu_pending counter, which is incremented on each call to rcu_pending(),
which in turn is called from each scheduling-clock interrupt.  Each CPU
then treats this counter as a surrogate for the jiffies counter, so
that if the jiffies counter fails to advance, the per-CPU n_rcu_pending
counter will cause RCU to invoke force_quiescent_state(), which in turn
will (among other things) send resched IPIs to CPUs that have thus far
failed to pass through an RCU quiescent state.

Unfortunately, each CPU resets only its own counter after sending a
batch of IPIs.  This means that the other CPUs will also (needlessly)
send -another- round of IPIs, for a full N-squared set of IPIs in the
worst case every three scheduler-clock ticks until the grace period
finally ends.  It is not reasonable for a given CPU to reset each and
every n_rcu_pending for all the other CPUs, so this patch instead simply
disables the jiffies-counter "training wheels", thus eliminating the
excessive IPIs.

Note that the jiffies-counter IPIs do not have this problem due to
the fact that the jiffies counter is global, so that the CPU sending
the IPIs can easily reset things, thus preventing the other CPUs from
sending redundant IPIs.

Note also that the n_rcu_pending counter remains, as it will continue to
be used for tracing.  It may also see use to update the jiffies counter,
should an appropriate kick-the-jiffies-counter API appear.

Located-by: Anton Blanchard <anton@au1.ibm.com>
Tested-by: Anton Blanchard <anton@au1.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: anton@samba.org
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: manfred@colorfullife.com
Cc: cl@linux-foundation.org
Cc: josht@linux.vnet.ibm.com
Cc: schamp@sgi.com
Cc: niv@us.ibm.com
Cc: dvhltc@us.ibm.com
Cc: ego@in.ibm.com
Cc: laijs@cn.fujitsu.com
Cc: rostedt@goodmis.org
Cc: peterz@infradead.org
Cc: penberg@cs.helsinki.fi
Cc: andi@firstfloor.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <12396834793575-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c       | 19 ++++---------------
 kernel/rcutree_trace.c | 14 +++++---------
 2 files changed, 9 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7f32669..d2a372f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -530,8 +530,6 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 	rdp->qs_pending = 1;
 	rdp->passed_quiesc = 0;
 	rdp->gpnum = rsp->gpnum;
-	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
-				      RCU_JIFFIES_TILL_FORCE_QS;
 }
 
 /*
@@ -578,8 +576,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	rsp->gpnum++;
 	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
-	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
-				      RCU_JIFFIES_TILL_FORCE_QS;
 	record_gp_stall_check_time(rsp);
 	dyntick_record_completed(rsp, rsp->completed - 1);
 	note_new_gpnum(rsp, rdp);
@@ -1055,7 +1051,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 {
 	unsigned long flags;
 	long lastcomp;
-	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	u8 signaled;
 
@@ -1066,16 +1061,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		return;	/* Someone else is already on the job. */
 	}
 	if (relaxed &&
-	    (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
-	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
+	    (long)(rsp->jiffies_force_qs - jiffies) >= 0)
 		goto unlock_ret; /* no emergency and done recently. */
 	rsp->n_force_qs++;
 	spin_lock(&rnp->lock);
 	lastcomp = rsp->completed;
 	signaled = rsp->signaled;
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
-	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
-				      RCU_JIFFIES_TILL_FORCE_QS;
 	if (lastcomp == rsp->gpnum) {
 		rsp->n_force_qs_ngp++;
 		spin_unlock(&rnp->lock);
@@ -1144,8 +1136,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * If an RCU GP has gone long enough, go check for dyntick
 	 * idle CPUs and, if needed, send resched IPIs.
 	 */
-	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
-	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
 		force_quiescent_state(rsp, 1);
 
 	/*
@@ -1230,8 +1221,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	if (unlikely(++rdp->qlen > qhimark)) {
 		rdp->blimit = LONG_MAX;
 		force_quiescent_state(rsp, 0);
-	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
-		   (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
 		force_quiescent_state(rsp, 1);
 	local_irq_restore(flags);
 }
@@ -1290,8 +1280,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Has an RCU GP gone long enough to send resched IPIs &c? */
 	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
-	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
-	     (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
+	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
 		return 1;
 
 	/* nothing to do */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4ee954f..4b1875b 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -49,14 +49,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 {
 	if (!rdp->beenonline)
 		return;
-	seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
+	seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
 		   rdp->completed, rdp->gpnum,
 		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
-		   rdp->qs_pending,
-		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
-		   (int)(rdp->n_rcu_pending & 0xffff));
+		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, " dt=%d/%d dn=%d df=%lu",
 		   rdp->dynticks->dynticks,
@@ -102,14 +100,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 {
 	if (!rdp->beenonline)
 		return;
-	seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
+	seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
 		   rdp->completed, rdp->gpnum,
 		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
-		   rdp->qs_pending,
-		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
-		   rdp->n_rcu_pending);
+		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, ",%d,%d,%d,%lu",
 		   rdp->dynticks->dynticks,
@@ -123,7 +119,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 
 static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
-	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
+	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
 #ifdef CONFIG_NO_HZ
 	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
-- 
cgit v1.1


From 6ec3cfeca04622e3d80c9270191cd7f5f88214af Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Mon, 13 Apr 2009 15:20:58 -0700
Subject: x86, irq: Remove IRQ_DISABLED check in process context IRQ move

As discussed in the thread here:

  http://marc.info/?l=linux-kernel&m=123964468521142&w=2

Eric W. Biederman observed:

> It looks like some additional bugs have slipped in since last I looked.
>
> set_irq_affinity does this:
> ifdef CONFIG_GENERIC_PENDING_IRQ
>        if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
>                cpumask_copy(desc->affinity, cpumask);
>                desc->chip->set_affinity(irq, cpumask);
>        } else {
>                desc->status |= IRQ_MOVE_PENDING;
>                cpumask_copy(desc->pending_mask, cpumask);
>        }
> #else
>
> That IRQ_DISABLED case is a software state and as such it has nothing to
> do with how safe it is to move an irq in process context.

[...]

>
> The only reason we migrate MSIs in interrupt context today is that there
> wasn't infrastructure for support migration both in interrupt context
> and outside of it.

Yes. The idea here was to force the MSI migration to happen in process
context. One of the patches in the series did

        disable_irq(dev->irq);
        irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);

with the above patch adding irq/manage code check for interrupt disabled
and moving the interrupt in process context.

IIRC, there was no IRQ_MOVE_PCNTXT when we were developing this HPET
code and we ended up having this ugly hack. IRQ_MOVE_PCNTXT was there
when we eventually submitted the patch upstream. But, looks like I did a
blind rebasing instead of using IRQ_MOVE_PCNTXT in hpet MSI code.

Below patch fixes this. i.e., revert commit 932775a4ab622e3c99bd59f14cc
and add PCNTXT to HPET MSI setup. Also removes copying of desc->affinity
in generic code as set_affinity routines are doing it internally.

Reported-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Li Shaohua" <shaohua.li@intel.com>
Cc: Gary Hade <garyhade@us.ibm.com>
Cc: "lcm@us.ibm.com" <lcm@us.ibm.com>
Cc: suresh.b.siddha@intel.com
LKML-Reference: <20090413222058.GB8211@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7e2e7dd..2734eca 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -109,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(desc->affinity, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT)
 		desc->chip->set_affinity(irq, cpumask);
-	} else {
+	else {
 		desc->status |= IRQ_MOVE_PENDING;
 		cpumask_copy(desc->pending_mask, cpumask);
 	}
-- 
cgit v1.1


From 297dbf50d7ab0539cf9cf7f2a66918665a18e45e Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Wed, 15 Apr 2009 10:37:04 +0530
Subject: swap: Remove code handling bio_alloc failure with __GFP_WAIT

Remove code handling bio_alloc failure with __GFP_WAIT.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/power/swap.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 505f319..8ba052c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -64,8 +64,6 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
-	if (!bio)
-		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = resume_bdev;
 	bio->bi_end_io = end_swap_bio_read;
-- 
cgit v1.1


From 5b1d07ed0e5b2707f786957c7a40eb2f399c84a8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 15 Apr 2009 19:35:01 +0100
Subject: RCU: Don't try and predeclare inline funcs as it upsets some versions
 of gcc

Don't try and predeclare inline funcs like this:

	static inline void wait_migrated_callbacks(void)
	...
	static void _rcu_barrier(enum rcu_barrier type)
	{
		...
		wait_migrated_callbacks();
	}
	...
	static inline void wait_migrated_callbacks(void)
	{
		wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
	}

as it upsets some versions of gcc under some circumstances:

	kernel/rcupdate.c: In function `_rcu_barrier':
	kernel/rcupdate.c:125: sorry, unimplemented: inlining failed in call to 'wait_migrated_callbacks': function body not available
	kernel/rcupdate.c:152: sorry, unimplemented: called from here

This can be dealt with by simply putting the static variables (rcu_migrate_*)
at the top, and moving the implementation of the function up so that it
replaces its forward declaration.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/rcupdate.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2c7b845..a967c9f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -58,6 +58,10 @@ static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 int rcu_scheduler_active __read_mostly;
 
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
+
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
@@ -122,7 +126,10 @@ static void rcu_barrier_func(void *type)
 	}
 }
 
-static inline void wait_migrated_callbacks(void);
+static inline void wait_migrated_callbacks(void)
+{
+	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+}
 
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
@@ -179,21 +186,12 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
-static struct rcu_head rcu_migrate_head[3];
-static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
-
 static void rcu_migrate_callback(struct rcu_head *notused)
 {
 	if (atomic_dec_and_test(&rcu_migrate_type_count))
 		wake_up(&rcu_migrate_wq);
 }
 
-static inline void wait_migrated_callbacks(void)
-{
-	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
-}
-
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
-- 
cgit v1.1


From 381512cf3d27f63f7a45b1bbe7d2d609c2ea3b74 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 14 Apr 2009 09:09:36 +0530
Subject: sched: Avoid printing sched_group::__cpu_power for default case

Commit 46e0bb9c12f4 ("sched: Print sched_group::__cpu_power
in sched_domain_debug") produces a messy dmesg output while
attempting to print the sched_group::__cpu_power for each
group in the sched_domain hierarchy.

Fix this by avoid printing the __cpu_power for default cases.
(i.e, __cpu_power == SCHED_LOAD_SCALE).

[ Impact: reduce syslog clutter ]

Reported-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Fixed-by: Tony Luck <tony.luck@intel.com>
Cc: a.p.zijlstra@chello.nl
LKML-Reference: <20090414033936.GA534@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e90e70e..b902e58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7367,8 +7367,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-		printk(KERN_CONT " %s (__cpu_power = %d)", str,
-						group->__cpu_power);
+
+		printk(KERN_CONT " %s", str);
+		if (group->__cpu_power != SCHED_LOAD_SCALE) {
+			printk(KERN_CONT " (__cpu_power = %d)",
+				group->__cpu_power);
+		}
 
 		group = group->next;
 	} while (group != sd->groups);
-- 
cgit v1.1


From 79d381c9f2354b594dcab9b04dfcc0debf7294fe Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Thu, 16 Apr 2009 19:30:18 -0400
Subject: kernel/softirq.c: fix sparse warning

Fix sparse warning in kernel/softirq.c.

  warning: do-while statement is not a compound statement

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
LKML-Reference: <BD79186B4FD85F4B8E60E381CAEE1909015F9033@mi8nycmail19.Mi8.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2fecefa..b525dd3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -472,9 +472,9 @@ void tasklet_kill(struct tasklet_struct *t)
 		printk("Attempt to kill tasklet from interrupt\n");
 
 	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
-		do
+		do {
 			yield();
-		while (test_bit(TASKLET_STATE_SCHED, &t->state));
+		} while (test_bit(TASKLET_STATE_SCHED, &t->state));
 	}
 	tasklet_unlock_wait(t);
 	clear_bit(TASKLET_STATE_SCHED, &t->state);
-- 
cgit v1.1


From c8a250058656495be02c00de61e26b017c86ef00 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 17 Apr 2009 09:40:49 +0200
Subject: lockdep: more robust lockdep_map init sequence

Steven Rostedt reported:

> OK, I think I figured this bug out. This is a lockdep issue with respect
> to tracepoints.
>
> The trace points in lockdep are called all the time. Outside the lockdep
> logic. But if lockdep were to trigger an error / warning (which this run
> did) we might be in trouble. For new locks, like the dentry->d_lock, that
> are created, they will not get a name:
>
> void lockdep_init_map(struct lockdep_map *lock, const char *name,
>                       struct lock_class_key *key, int subclass)
> {
>         if (unlikely(!debug_locks))
>                 return;
>
> When a problem is found by lockdep, debug_locks becomes false. Thus we
> stop allocating names for locks. This dentry->d_lock I had, now has no
> name. Worse yet, I have CONFIG_DEBUG_VM set, that scrambles non
> initialized memory. Thus, when the trace point was hit, it had junk for
> the lock->name, and the machine crashed.

Ah, nice catch. I think we should put at least the name in regardless.

Ensure we at least initialize the trivial entries of the depmap so that
they can be relied upon, even when lockdep itself decided to pack up and
go home.

[ Impact: fix lock tracing after lockdep warnings. ]

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1239954049.23397.4156.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b0f0118..accb40c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2490,13 +2490,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		      struct lock_class_key *key, int subclass)
 {
-	if (unlikely(!debug_locks))
+	lock->class_cache = NULL;
+#ifdef CONFIG_LOCK_STAT
+	lock->cpu = raw_smp_processor_id();
+#endif
+
+	if (DEBUG_LOCKS_WARN_ON(!name)) {
+		lock->name = "NULL";
 		return;
+	}
+
+	lock->name = name;
 
 	if (DEBUG_LOCKS_WARN_ON(!key))
 		return;
-	if (DEBUG_LOCKS_WARN_ON(!name))
-		return;
 	/*
 	 * Sanity check, the lock-class key must be persistent:
 	 */
@@ -2505,12 +2512,11 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		DEBUG_LOCKS_WARN_ON(1);
 		return;
 	}
-	lock->name = name;
 	lock->key = key;
-	lock->class_cache = NULL;
-#ifdef CONFIG_LOCK_STAT
-	lock->cpu = raw_smp_processor_id();
-#endif
+
+	if (unlikely(!debug_locks))
+		return;
+
 	if (subclass)
 		register_lock_class(lock, subclass, 1);
 }
-- 
cgit v1.1


From ff54250a0ebab7f90a5f848a0ba63f999830c872 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 18 Apr 2009 21:44:24 -0700
Subject: Remove 'recurse into child resources' logic from
 'reserve_region_with_split()'

This function is not actually used right now, since the original use
case for it was done with insert_resource_expand_to_fit() instead.

However, we now have another usage case that wants to basically do a
"reserve IO resource, splitting around existing resources", however that
one doesn't actually want the "recurse into the conflicting resource"
logic at all.

And since recursing into the conflicting resource was the most complex
part, and isn't wanted, just remove it.  Maybe we'll some day want both
versions, but we can just resurrect the logic then.

Tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 46 ++++++++++++----------------------------------
 1 file changed, 12 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index fd5d7d5..ac5f3a3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -533,43 +533,21 @@ static void __init __reserve_region_with_split(struct resource *root,
 	res->end = end;
 	res->flags = IORESOURCE_BUSY;
 
-	for (;;) {
-		conflict = __request_resource(parent, res);
-		if (!conflict)
-			break;
-		if (conflict != parent) {
-			parent = conflict;
-			if (!(conflict->flags & IORESOURCE_BUSY))
-				continue;
-		}
-
-		/* Uhhuh, that didn't work out.. */
-		kfree(res);
-		res = NULL;
-		break;
-	}
-
-	if (!res) {
-		/* failed, split and try again */
-
-		/* conflict covered whole area */
-		if (conflict->start <= start && conflict->end >= end)
-			return;
+	conflict = __request_resource(parent, res);
+	if (!conflict)
+		return;
 
-		if (conflict->start > start)
-			__reserve_region_with_split(root, start, conflict->start-1, name);
-		if (!(conflict->flags & IORESOURCE_BUSY)) {
-			resource_size_t common_start, common_end;
+	/* failed, split and try again */
+	kfree(res);
 
-			common_start = max(conflict->start, start);
-			common_end = min(conflict->end, end);
-			if (common_start < common_end)
-				__reserve_region_with_split(root, common_start, common_end, name);
-		}
-		if (conflict->end < end)
-			__reserve_region_with_split(root, conflict->end+1, end, name);
-	}
+	/* conflict covered whole area */
+	if (conflict->start <= start && conflict->end >= end)
+		return;
 
+	if (conflict->start > start)
+		__reserve_region_with_split(root, start, conflict->start-1, name);
+	if (conflict->end < end)
+		__reserve_region_with_split(root, conflict->end+1, end, name);
 }
 
 void __init reserve_region_with_split(struct resource *root,
-- 
cgit v1.1


From 6a7c7eaf71b636f197d73b381a2ab729ebdcfb2e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 19 Apr 2009 20:08:42 +0200
Subject: PM/Suspend: Introduce two new platform callbacks to avoid breakage

Commit 900af0d973856d6feb6fc088c2d0d3fde57707d3 (PM: Change suspend
code ordering) changed the ordering of suspend code in such a way
that the platform .prepare() callback is now executed after the
device drivers' late suspend callbacks have run.  Unfortunately, this
turns out to break ARM platforms that need to talk via I2C to power
control devices during the .prepare() callback.

For this reason introduce two new platform suspend callbacks,
.prepare_late() and .wake(), that will be called just prior to
disabling non-boot CPUs and right after bringing them back on line,
respectively, and use them instead of .prepare() and .finish() for
ACPI suspend.  Make the PM core execute the .prepare() and .finish()
platform suspend callbacks where they were executed previously (that
is, right after calling the regular suspend methods provided by
device drivers and right before executing their regular resume
methods, respectively).

It is not necessary to make analogous changes to the hibernation
code and data structures at the moment, because they are only used
by ACPI platforms.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reported-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Len Brown <len.brown@intel.com>
---
 kernel/power/main.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index f172f41..f99ed6a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -291,20 +291,26 @@ static int suspend_enter(suspend_state_t state)
 
 	device_pm_lock();
 
+	if (suspend_ops->prepare) {
+		error = suspend_ops->prepare();
+		if (error)
+			goto Done;
+	}
+
 	error = device_power_down(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
-		goto Done;
+		goto Platfrom_finish;
 	}
 
-	if (suspend_ops->prepare) {
-		error = suspend_ops->prepare();
+	if (suspend_ops->prepare_late) {
+		error = suspend_ops->prepare_late();
 		if (error)
 			goto Power_up_devices;
 	}
 
 	if (suspend_test(TEST_PLATFORM))
-		goto Platfrom_finish;
+		goto Platform_wake;
 
 	error = disable_nonboot_cpus();
 	if (error || suspend_test(TEST_CPUS))
@@ -326,13 +332,17 @@ static int suspend_enter(suspend_state_t state)
  Enable_cpus:
 	enable_nonboot_cpus();
 
- Platfrom_finish:
-	if (suspend_ops->finish)
-		suspend_ops->finish();
+ Platform_wake:
+	if (suspend_ops->wake)
+		suspend_ops->wake();
 
  Power_up_devices:
 	device_power_up(PMSG_RESUME);
 
+ Platfrom_finish:
+	if (suspend_ops->finish)
+		suspend_ops->finish();
+
  Done:
 	device_pm_unlock();
 
-- 
cgit v1.1


From 24b6f16ecf37f918a1934d590e9e71c100d6388f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:25:41 -0400
Subject: No need for crossing to mountpoint in audit_tag_tree()

is_under() will DTRT anyway.  And yes, is_subdir() behaviour
is intentional.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit_tree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 917ab95..6e73517 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -734,9 +734,6 @@ int audit_tag_tree(char *old, char *new)
 	dentry = dget(path.dentry);
 	path_put(&path);
 
-	if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
-		follow_up(&mnt, &dentry);
-
 	list_add_tail(&list, &tagged->mnt_list);
 
 	mutex_lock(&audit_filter_mutex);
-- 
cgit v1.1


From 8e19608e8b5c001e4a66ce482edc474f05fb7355 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 21 Apr 2009 12:24:00 -0700
Subject: clocksource: pass clocksource to read() callback

Pass clocksource pointer to the read() callback for clocksources.  This
allows us to share the callback between multiple instances.

[hugh@veritas.com: fix powerpc build of clocksource pass clocksource mods]
[akpm@linux-foundation.org: cleanup]
Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/clocksource.c | 8 ++++----
 kernel/time/jiffies.c     | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c46c931..ecfd7b5 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data)
 
 	resumed = test_and_clear_bit(0, &watchdog_resumed);
 
-	wdnow = watchdog->read();
+	wdnow = watchdog->read(watchdog);
 	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
 	watchdog_last = wdnow;
 
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
-		csnow = cs->read();
+		csnow = cs->read(cs);
 
 		if (unlikely(resumed)) {
 			cs->wd_last = csnow;
@@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 
 		list_add(&cs->wd_list, &watchdog_list);
 		if (!started && watchdog) {
-			watchdog_last = watchdog->read();
+			watchdog_last = watchdog->read(watchdog);
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 			add_timer_on(&watchdog_timer,
 				     cpumask_first(cpu_online_mask));
@@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 				cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
 			/* Start if list is not empty */
 			if (!list_empty(&watchdog_list)) {
-				watchdog_last = watchdog->read();
+				watchdog_last = watchdog->read(watchdog);
 				watchdog_timer.expires =
 					jiffies + WATCHDOG_INTERVAL;
 				add_timer_on(&watchdog_timer,
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 06f1975..c3f6c30 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -50,7 +50,7 @@
  */
 #define JIFFIES_SHIFT	8
 
-static cycle_t jiffies_read(void)
+static cycle_t jiffies_read(struct clocksource *cs)
 {
 	return (cycle_t) jiffies;
 }
-- 
cgit v1.1


From 4614e6adafa2c5e6c3a9c245af2807fa7bc5117a Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 21 Apr 2009 12:24:02 -0700
Subject: clocksource: add enable() and disable() callbacks

Add enable() and disable() callbacks for clocksources.

This allows us to put unused clocksources in power save mode.  The
functions clocksource_enable() and clocksource_disable() wrap the
callbacks and are inserted in the timekeeping code to enable before use
and disable after switching to a new clocksource.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/timekeeping.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 900f1b6..687dff4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(do_settimeofday);
  */
 static void change_clocksource(void)
 {
-	struct clocksource *new;
+	struct clocksource *new, *old;
 
 	new = clocksource_get_next();
 
@@ -191,11 +191,16 @@ static void change_clocksource(void)
 
 	clocksource_forward_now();
 
-	new->raw_time = clock->raw_time;
+	if (clocksource_enable(new))
+		return;
 
+	new->raw_time = clock->raw_time;
+	old = clock;
 	clock = new;
+	clocksource_disable(old);
+
 	clock->cycle_last = 0;
-	clock->cycle_last = clocksource_read(new);
+	clock->cycle_last = clocksource_read(clock);
 	clock->error = 0;
 	clock->xtime_nsec = 0;
 	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -292,6 +297,7 @@ void __init timekeeping_init(void)
 	ntp_init();
 
 	clock = clocksource_get_next();
+	clocksource_enable(clock);
 	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
 	clock->cycle_last = clocksource_read(clock);
 
-- 
cgit v1.1


From b48ccb095a0c9257241261ec2bd1cbb1bdabc48b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Apr 2009 09:36:52 +0200
Subject: locking: clarify kernel-taint warning message

Andi Kleen reported this message triggering on non-lockdep kernels:

   Disabling lockdep due to kernel taint

Clarify the message to say 'lock debugging' - debug_locks_off()
turns off all things lock debugging, not just lockdep.

[ Impact: change kernel warning message text ]

Reported-by: Andi Kleen <andi@firstfloor.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 934fb37..3dcaa16 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -221,7 +221,7 @@ void add_taint(unsigned flag)
 	 * post-warning case.
 	 */
 	if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
-		printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
+		printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
 
 	set_bit(flag, &tainted_mask);
 }
-- 
cgit v1.1


From 418df63c2d94f238ac7e1d1d53be35dd6b7a7252 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Wed, 22 Apr 2009 12:01:49 +0100
Subject: Delete slow-work timers properly

Slow-work appears to delete its timer as soon as the first user
unregisters, even though other users could be active.  At the same time, it
never seems to delete slow_work_oom_timer.  Arrange for both to happen in
the shutdown path.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/slow-work.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index cf2bc01..b28d191 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -609,14 +609,14 @@ void slow_work_unregister_user(void)
 	if (slow_work_user_count == 0) {
 		printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
 		slow_work_threads_should_exit = true;
+		del_timer_sync(&slow_work_cull_timer);
+		del_timer_sync(&slow_work_oom_timer);
 		wake_up_all(&slow_work_thread_wq);
 		wait_for_completion(&slow_work_last_thread_exited);
 		printk(KERN_NOTICE "Slow work thread pool:"
 		       " Shut down complete\n");
 	}
 
-	del_timer_sync(&slow_work_cull_timer);
-
 	mutex_unlock(&slow_work_user_lock);
 }
 EXPORT_SYMBOL(slow_work_unregister_user);
-- 
cgit v1.1


From 0c8454f56623505a99463405fd7d5664adfbb094 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 25 Apr 2009 00:16:06 +0200
Subject: PM/Hibernate: Fix waiting for image device to appear on resume

Commit c751085943362143f84346d274e0011419c84202 ("PM/Hibernate: Wait for
SCSI devices scan to complete during resume") added a call to
scsi_complete_async_scans() to software_resume(), so that it waited for
the SCSI scanning to complete, but the call was added at a wrong place.

Namely, it should have been added after wait_for_device_probe(), which
is called only if the image partition hasn't been specified yet.  Also,
it's reasonable to check if the image partition is present and only wait
for the device probing and SCSI scanning to complete if it is not the
case.

Additionally, since noresume is checked right at the beginning of
software_resume() and the function returns immediately if it's set, it
doesn't make sense to check it once again later.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/disk.c | 51 +++++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0854770..e71ca9c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -646,13 +646,6 @@ static int software_resume(void)
 		return 0;
 
 	/*
-	 * We can't depend on SCSI devices being available after loading one of
-	 * their modules if scsi_complete_async_scans() is not called and the
-	 * resume device usually is a SCSI one.
-	 */
-	scsi_complete_async_scans();
-
-	/*
 	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
 	 * is configured into the kernel. Since the regular hibernate
 	 * trigger path is via sysfs which takes a buffer mutex before
@@ -663,32 +656,42 @@ static int software_resume(void)
 	 * here to avoid lockdep complaining.
 	 */
 	mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+
+	if (swsusp_resume_device)
+		goto Check_image;
+
+	if (!strlen(resume_file)) {
+		error = -ENOENT;
+		goto Unlock;
+	}
+
+	pr_debug("PM: Checking image partition %s\n", resume_file);
+
+	/* Check if the device is there */
+	swsusp_resume_device = name_to_dev_t(resume_file);
 	if (!swsusp_resume_device) {
-		if (!strlen(resume_file)) {
-			mutex_unlock(&pm_mutex);
-			return -ENOENT;
-		}
 		/*
 		 * Some device discovery might still be in progress; we need
 		 * to wait for this to finish.
 		 */
 		wait_for_device_probe();
+		/*
+		 * We can't depend on SCSI devices being available after loading
+		 * one of their modules until scsi_complete_async_scans() is
+		 * called and the resume device usually is a SCSI one.
+		 */
+		scsi_complete_async_scans();
+
 		swsusp_resume_device = name_to_dev_t(resume_file);
-		pr_debug("PM: Resume from partition %s\n", resume_file);
-	} else {
-		pr_debug("PM: Resume from partition %d:%d\n",
-				MAJOR(swsusp_resume_device),
-				MINOR(swsusp_resume_device));
+		if (!swsusp_resume_device) {
+			error = -ENODEV;
+			goto Unlock;
+		}
 	}
 
-	if (noresume) {
-		/**
-		 * FIXME: If noresume is specified, we need to find the
-		 * partition and reset it back to normal swap space.
-		 */
-		mutex_unlock(&pm_mutex);
-		return 0;
-	}
+ Check_image:
+	pr_debug("PM: Resume from partition %d:%d\n",
+		MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
 
 	pr_debug("PM: Checking hibernation image.\n");
 	error = swsusp_check();
-- 
cgit v1.1


From cad81bc2529ab8c62b6fdc83a1c0c7f4a87209eb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 27 Apr 2009 01:41:34 +0200
Subject: ptrace: ptrace_attach: fix the usage of ->cred_exec_mutex

ptrace_attach() needs task->cred_exec_mutex, not current->cred_exec_mutex.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/ptrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dfcd83c..0692ab5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -188,7 +188,7 @@ int ptrace_attach(struct task_struct *task)
 	/* Protect exec's credential calculations against our interference;
 	 * SUID, SGID and LSM creds get determined differently under ptrace.
 	 */
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&task->cred_exec_mutex);
 	if (retval  < 0)
 		goto out;
 
@@ -232,7 +232,7 @@ repeat:
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&task->cred_exec_mutex);
 out:
 	return retval;
 }
-- 
cgit v1.1


From 7267fa6819467669f5cc2ba81a615dcc88158b4b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:16:21 -0400
Subject: tracing: fix ref count in splice pages

The pages allocated for the splice binary buffer did not initialize
the ref count correctly. This caused pages not to be freed and causes
a drastic memory leak.

Thanks to logdev I was able to trace the tracer to find where the leak
was.

[ Impact: stop memory leak when using splice ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ce5dc6..a884c09 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3448,6 +3448,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if (!ref)
 			break;
 
+		ref->ref = 1;
 		ref->buffer = info->tr->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer);
 		if (!ref->page) {
-- 
cgit v1.1


From f5f293a4e3d0a0c52cec31de6762c95050156516 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Apr 2009 14:44:49 +0200
Subject: sched: account system time properly

Andrew Gallatin reported that IRQ and SOFTIRQ times were
sometime not reported correctly on recent kernels, and even
bisected to commit 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4
([PATCH] fix scaled & unscaled cputime accounting) as the first
bad commit.

Further analysis pointed that commit
79741dd35713ff4f6fd0eafd59fa94e8a4ba922d ([PATCH] idle cputime
accounting) was the real cause of the problem.

account_process_tick() was not taking into account timer IRQ
interrupting the idle task servicing a hard or soft irq.

On mostly idle cpu, irqs were thus not accounted and top or
mpstat could tell user/admin that cpu was 100 % idle, 0.00 %
irq, 0.00 % softirq, while it was not.

[ Impact: fix occasionally incorrect CPU statistics in top/mpstat ]

Reported-by: Andrew Gallatin <gallatin@myri.com>
Re-reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: rick.jones2@hp.com
Cc: brice@myri.com
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
LKML-Reference: <49F84BC1.7080602@cosmosbay.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b902e58..26efa47 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4732,7 +4732,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 
 	if (user_tick)
 		account_user_time(p, one_jiffy, one_jiffy_scaled);
-	else if (p != rq->idle)
+	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
 				    one_jiffy_scaled);
 	else
-- 
cgit v1.1


From 6e85c5ba73c07b990798087e9b858c065db2b234 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Wed, 29 Apr 2009 19:14:32 -0400
Subject: kernel/posix-cpu-timers.c: fix sparse warning

Sparse reports the following in kernel/posix-cpu-timers.c:

  warning: symbol 'firing' shadows an earlier one

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: Subrata Modak <subrata@linux.vnet.ibm.com>
LKML-Reference: <BD79186B4FD85F4B8E60E381CAEE1909016C1AFE@mi8nycmail19.Mi8.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c9dcf98..bece7c0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1420,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * timer call will interfere.
 	 */
 	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
-		int firing;
+		int cpu_firing;
+
 		spin_lock(&timer->it_lock);
 		list_del_init(&timer->it.cpu.entry);
-		firing = timer->it.cpu.firing;
+		cpu_firing = timer->it.cpu.firing;
 		timer->it.cpu.firing = 0;
 		/*
 		 * The firing flag is -1 if we collided with a reset
 		 * of the timer, which already reported this
 		 * almost-firing as an overrun.  So don't generate an event.
 		 */
-		if (likely(firing >= 0)) {
+		if (likely(cpu_firing >= 0))
 			cpu_timer_fire(timer);
-		}
 		spin_unlock(&timer->it_lock);
 	}
 }
-- 
cgit v1.1


From d7226fb6ec5d4f325e4e7fd905894e2ea3eb3ae0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 15:16:04 +0200
Subject: Revert "genirq: assert that irq handlers are indeed running in
 hardirq context"

This reverts commit 044d408409cc4e1bc75c886e27ca85c270db104c.

The commit added a warning when handle_IRQ_event() is called outside
of hard interrupt context. This breaks the generic tasklet based
interrupt resend mechanism which is used when the hardware has no way
to retrigger the interrupt. So we get a warning for a use case which
is correct and worked for years. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/handle.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d82142b..26e0875 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -363,8 +363,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
 
-	WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
-
 	if (!(action->flags & IRQF_DISABLED))
 		local_irq_enable_in_hardirq();
 
-- 
cgit v1.1


From 74a03b69d1b5ce00a568e142ca97e76b7f5239c6 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 1 May 2009 13:10:25 -0700
Subject: clockevents: prevent endless loop in tick_handle_periodic()

tick_handle_periodic() can lock up hard when a one shot clock event
device is used in combination with jiffies clocksource.

Avoid an endless loop issue by requiring that a highres valid
clocksource be installed before we call tick_periodic() in a loop when
using ONESHOT mode. The result is we will only increment jiffies once
per interrupt until a continuous hardware clocksource is available.

Without this, we can run into a endless loop, where each cycle through
the loop, jiffies is updated which increments time by tick_period or
more (due to clock steering), which can cause the event programming to
think the next event was before the newly incremented time and fail
causing tick_periodic() to be called again and the whole process loops
forever.

[ Impact: prevent hard lock up ]

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
---
 kernel/time/tick-common.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 21a5ca8..83c4417 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
 	for (;;) {
 		if (!clockevents_program_event(dev, next, ktime_get()))
 			return;
-		tick_periodic(cpu);
+		/*
+		 * Have to be careful here. If we're in oneshot mode,
+		 * before we call tick_periodic() in a loop, we need
+		 * to be sure we're using a real hardware clocksource.
+		 * Otherwise we could get trapped in an infinite
+		 * loop, as the tick_periodic() increments jiffies,
+		 * when then will increment time, posibly causing
+		 * the loop to trigger again and again.
+		 */
+		if (timekeeping_valid_for_hres())
+			tick_periodic(cpu);
 		next = ktime_add(next, tick_period);
 	}
 }
-- 
cgit v1.1


From 9e4a5bda89034502fb144331e71a0efdfd5fae97 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Thu, 30 Apr 2009 15:08:57 -0700
Subject: mm: prevent divide error for small values of vm_dirty_bytes

Avoid setting less than two pages for vm_dirty_bytes: this is necessary to
avoid potential division by 0 (like the following) in get_dirty_limits().

[   49.951610] divide error: 0000 [#1] PREEMPT SMP
[   49.952195] last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/host0/target0:0:0/0:0:0:0/block/sda/uevent
[   49.952195] CPU 1
[   49.952195] Modules linked in: pcspkr
[   49.952195] Pid: 3064, comm: dd Not tainted 2.6.30-rc3 #1
[   49.952195] RIP: 0010:[<ffffffff802d39a9>]  [<ffffffff802d39a9>] get_dirty_limits+0xe9/0x2c0
[   49.952195] RSP: 0018:ffff88001de03a98  EFLAGS: 00010202
[   49.952195] RAX: 00000000000000c0 RBX: ffff88001de03b80 RCX: 28f5c28f5c28f5c3
[   49.952195] RDX: 0000000000000000 RSI: 00000000000000c0 RDI: 0000000000000000
[   49.952195] RBP: ffff88001de03ae8 R08: 0000000000000000 R09: 0000000000000000
[   49.952195] R10: ffff88001ddda9a0 R11: 0000000000000001 R12: 0000000000000001
[   49.952195] R13: ffff88001fbc8218 R14: ffff88001de03b70 R15: ffff88001de03b78
[   49.952195] FS:  00007fe9a435b6f0(0000) GS:ffff8800025d9000(0000) knlGS:0000000000000000
[   49.952195] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   49.952195] CR2: 00007fe9a39ab000 CR3: 000000001de38000 CR4: 00000000000006e0
[   49.952195] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   49.952195] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   49.952195] Process dd (pid: 3064, threadinfo ffff88001de02000, task ffff88001ddda250)
[   49.952195] Stack:
[   49.952195]  ffff88001fa0de00 ffff88001f2dbd70 ffff88001f9fe800 000080b900000000
[   49.952195]  00000000000000c0 ffff8800027a6100 0000000000000400 ffff88001fbc8218
[   49.952195]  0000000000000000 0000000000000600 ffff88001de03bb8 ffffffff802d3ed7
[   49.952195] Call Trace:
[   49.952195]  [<ffffffff802d3ed7>] balance_dirty_pages_ratelimited_nr+0x1d7/0x3f0
[   49.952195]  [<ffffffff80368f8e>] ? ext3_writeback_write_end+0x9e/0x120
[   49.952195]  [<ffffffff802cc7df>] generic_file_buffered_write+0x12f/0x330
[   49.952195]  [<ffffffff802cce8d>] __generic_file_aio_write_nolock+0x26d/0x460
[   49.952195]  [<ffffffff802cda32>] ? generic_file_aio_write+0x52/0xd0
[   49.952195]  [<ffffffff802cda49>] generic_file_aio_write+0x69/0xd0
[   49.952195]  [<ffffffff80365fa6>] ext3_file_write+0x26/0xc0
[   49.952195]  [<ffffffff803034d1>] do_sync_write+0xf1/0x140
[   49.952195]  [<ffffffff80290d1a>] ? get_lock_stats+0x2a/0x60
[   49.952195]  [<ffffffff80280730>] ? autoremove_wake_function+0x0/0x40
[   49.952195]  [<ffffffff8030411b>] vfs_write+0xcb/0x190
[   49.952195]  [<ffffffff803042d0>] sys_write+0x50/0x90
[   49.952195]  [<ffffffff8022ff6b>] system_call_fastpath+0x16/0x1b
[   49.952195] Code: 00 00 00 2b 05 09 1c 17 01 48 89 c6 49 0f af f4 48 c1 ee 02 48 89 f0 48 f7 e1 48 89 d6 31 d2 48 c1 ee 02 48 0f af 75 d0 48 89 f0 <48> f7 f7 41 8b 95 ac 01 00 00 48 89 c7 49 0f af d4 48 c1 ea 02
[   49.952195] RIP  [<ffffffff802d39a9>] get_dirty_limits+0xe9/0x2c0
[   49.952195]  RSP <ffff88001de03a98>
[   50.096523] ---[ end trace 008d7aa02f244d7b ]---

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3d2c7d..ea78fa1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -103,6 +103,9 @@ static unsigned long one_ul = 1;
 static int one_hundred = 100;
 static int one_thousand = 1000;
 
+/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
+
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
@@ -1006,7 +1009,7 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &dirty_bytes_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &one_ul,
+		.extra1		= &dirty_bytes_min,
 	},
 	{
 		.procname	= "dirty_writeback_centisecs",
-- 
cgit v1.1


From 381a80e6df396eaabef2c00f85974a4579ac1c70 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 6 May 2009 16:02:50 -0700
Subject: inotify: use GFP_NOFS in kernel_event() to work around a lockdep
 false-positive

There is what we believe to be a false positive reported by lockdep.

inotify_inode_queue_event() => take inotify_mutex => kernel_event() =>
kmalloc() => SLOB => alloc_pages_node() => page reclaim => slab reclaim =>
dcache reclaim => inotify_inode_is_dead => take inotify_mutex => deadlock

The plan is to fix this via lockdep annotation, but that is proving to be
quite involved.

The patch flips the allocation over to GFP_NFS to shut the warning up, for
the 2.6.30 release.

Hopefully we will fix this for real in 2.6.31.  I'll queue a patch in -mm
to switch it back to GFP_KERNEL so we don't forget.

  =================================
  [ INFO: inconsistent lock state ]
  2.6.30-rc2-next-20090417 #203
  ---------------------------------
  inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage.
  kswapd0/380 [HC0[0]:SC0[0]:HE1:SE1] takes:
   (&inode->inotify_mutex){+.+.?.}, at: [<ffffffff8112f1b5>] inotify_inode_is_dead+0x35/0xb0
  {RECLAIM_FS-ON-W} state was registered at:
    [<ffffffff81079188>] mark_held_locks+0x68/0x90
    [<ffffffff810792a5>] lockdep_trace_alloc+0xf5/0x100
    [<ffffffff810f5261>] __kmalloc_node+0x31/0x1e0
    [<ffffffff81130652>] kernel_event+0xe2/0x190
    [<ffffffff81130826>] inotify_dev_queue_event+0x126/0x230
    [<ffffffff8112f096>] inotify_inode_queue_event+0xc6/0x110
    [<ffffffff8110444d>] vfs_create+0xcd/0x140
    [<ffffffff8110825d>] do_filp_open+0x88d/0xa20
    [<ffffffff810f6b68>] do_sys_open+0x98/0x140
    [<ffffffff810f6c50>] sys_open+0x20/0x30
    [<ffffffff8100c272>] system_call_fastpath+0x16/0x1b
    [<ffffffffffffffff>] 0xffffffffffffffff
  irq event stamp: 690455
  hardirqs last  enabled at (690455): [<ffffffff81564fe4>] _spin_unlock_irqrestore+0x44/0x80
  hardirqs last disabled at (690454): [<ffffffff81565372>] _spin_lock_irqsave+0x32/0xa0
  softirqs last  enabled at (690178): [<ffffffff81052282>] __do_softirq+0x202/0x220
  softirqs last disabled at (690157): [<ffffffff8100d50c>] call_softirq+0x1c/0x50

  other info that might help us debug this:
  2 locks held by kswapd0/380:
   #0:  (shrinker_rwsem){++++..}, at: [<ffffffff810d0bd7>] shrink_slab+0x37/0x180
   #1:  (&type->s_umount_key#17){++++..}, at: [<ffffffff8110cfbf>] shrink_dcache_memory+0x11f/0x1e0

  stack backtrace:
  Pid: 380, comm: kswapd0 Not tainted 2.6.30-rc2-next-20090417 #203
  Call Trace:
   [<ffffffff810789ef>] print_usage_bug+0x19f/0x200
   [<ffffffff81018bff>] ? save_stack_trace+0x2f/0x50
   [<ffffffff81078f0b>] mark_lock+0x4bb/0x6d0
   [<ffffffff810799e0>] ? check_usage_forwards+0x0/0xc0
   [<ffffffff8107b142>] __lock_acquire+0xc62/0x1ae0
   [<ffffffff810f478c>] ? slob_free+0x10c/0x370
   [<ffffffff8107c0a1>] lock_acquire+0xe1/0x120
   [<ffffffff8112f1b5>] ? inotify_inode_is_dead+0x35/0xb0
   [<ffffffff81562d43>] mutex_lock_nested+0x63/0x420
   [<ffffffff8112f1b5>] ? inotify_inode_is_dead+0x35/0xb0
   [<ffffffff8112f1b5>] ? inotify_inode_is_dead+0x35/0xb0
   [<ffffffff81012fe9>] ? sched_clock+0x9/0x10
   [<ffffffff81077165>] ? lock_release_holdtime+0x35/0x1c0
   [<ffffffff8112f1b5>] inotify_inode_is_dead+0x35/0xb0
   [<ffffffff8110c9dc>] dentry_iput+0xbc/0xe0
   [<ffffffff8110cb23>] d_kill+0x33/0x60
   [<ffffffff8110ce23>] __shrink_dcache_sb+0x2d3/0x350
   [<ffffffff8110cffa>] shrink_dcache_memory+0x15a/0x1e0
   [<ffffffff810d0cc5>] shrink_slab+0x125/0x180
   [<ffffffff810d1540>] kswapd+0x560/0x7a0
   [<ffffffff810ce160>] ? isolate_pages_global+0x0/0x2c0
   [<ffffffff81065a30>] ? autoremove_wake_function+0x0/0x40
   [<ffffffff8107953d>] ? trace_hardirqs_on+0xd/0x10
   [<ffffffff810d0fe0>] ? kswapd+0x0/0x7a0
   [<ffffffff8106555b>] kthread+0x5b/0xa0
   [<ffffffff8100d40a>] child_rip+0xa/0x20
   [<ffffffff8100cdd0>] ? restore_args+0x0/0x30
   [<ffffffff81065500>] ? kthread+0x0/0xa0
   [<ffffffff8100d400>] ? child_rip+0x0/0x20

[eparis@redhat.com: fix audit too]
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/auditfilter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6fe71f..713098e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1028,7 +1028,7 @@ static void audit_update_watch(struct audit_parent *parent,
 
 		if (audit_enabled) {
 			struct audit_buffer *ab;
-			ab = audit_log_start(NULL, GFP_KERNEL,
+			ab = audit_log_start(NULL, GFP_NOFS,
 				AUDIT_CONFIG_CHANGE);
 			audit_log_format(ab, "auid=%u ses=%u",
 				audit_get_loginuid(current),
@@ -1067,7 +1067,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 			e = container_of(r, struct audit_entry, rule);
 			if (audit_enabled) {
 				struct audit_buffer *ab;
-				ab = audit_log_start(NULL, GFP_KERNEL,
+				ab = audit_log_start(NULL, GFP_NOFS,
 					AUDIT_CONFIG_CHANGE);
 				audit_log_format(ab, "auid=%u ses=%u",
 					audit_get_loginuid(current),
-- 
cgit v1.1


From 57adc4d2dbf968fdbe516359688094eef4d46581 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 6 May 2009 16:02:53 -0700
Subject: Eliminate thousands of warnings with gcc 3.2 build

When building with gcc 3.2 I get thousands of warnings such as

include/linux/gfp.h: In function `allocflags_to_migratetype':
include/linux/gfp.h:105: warning: null format string

due to passing a NULL format string to warn_slowpath() in

#define __WARN()		warn_slowpath(__FILE__, __LINE__, NULL)

Split this case out into a separate call.  This also shrinks the kernel
slightly:

          text    data     bss     dec     hex filename
       4802274  707668  712704 6222646  5ef336 vmlinux
          text    data     bss     dec     hex filename
       4799027  703572  712704 6215303  5ed687 vmlinux

due to removeing one argument from the commonly-called __WARN().

[akpm@linux-foundation.org: reduce scope of `empty']
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 3dcaa16..874ecf1 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -340,7 +340,7 @@ void oops_exit(void)
 }
 
 #ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath(const char *file, int line, const char *fmt, ...)
+void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
 {
 	va_list args;
 	char function[KSYM_SYMBOL_LEN];
@@ -356,7 +356,7 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...)
 	if (board)
 		printk(KERN_WARNING "Hardware name: %s\n", board);
 
-	if (fmt) {
+	if (*fmt) {
 		va_start(args, fmt);
 		vprintk(fmt, args);
 		va_end(args);
@@ -367,7 +367,14 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...)
 	print_oops_end_marker();
 	add_taint(TAINT_WARN);
 }
-EXPORT_SYMBOL(warn_slowpath);
+EXPORT_SYMBOL(warn_slowpath_fmt);
+
+void warn_slowpath_null(const char *file, int line)
+{
+	static const char *empty = "";
+	warn_slowpath_fmt(file, line, empty);
+}
+EXPORT_SYMBOL(warn_slowpath_null);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-- 
cgit v1.1